001/* 002 * $Id: PdfTextExtractor.java 4784 2011-03-15 08:33:00Z blowagie $ 003 * 004 * This file is part of the iText (R) project. 005 * Copyright (c) 1998-2011 1T3XT BVBA 006 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al. 007 * 008 * This program is free software; you can redistribute it and/or modify 009 * it under the terms of the GNU Affero General Public License version 3 010 * as published by the Free Software Foundation with the addition of the 011 * following permission added to Section 15 as permitted in Section 7(a): 012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT, 013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS. 014 * 015 * This program is distributed in the hope that it will be useful, but 016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 017 * or FITNESS FOR A PARTICULAR PURPOSE. 018 * See the GNU Affero General Public License for more details. 019 * You should have received a copy of the GNU Affero General Public License 020 * along with this program; if not, see http://www.gnu.org/licenses or write to 021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 022 * Boston, MA, 02110-1301 USA, or download the license from the following URL: 023 * http://itextpdf.com/terms-of-use/ 024 * 025 * The interactive user interfaces in modified source and object code versions 026 * of this program must display Appropriate Legal Notices, as required under 027 * Section 5 of the GNU Affero General Public License. 028 * 029 * In accordance with Section 7(b) of the GNU Affero General Public License, 030 * a covered work must retain the producer line in every PDF that is created 031 * or manipulated using iText. 032 * 033 * You can be released from the requirements of the license by purchasing 034 * a commercial license. Buying such a license is mandatory as soon as you 035 * develop commercial activities involving the iText software without 036 * disclosing the source code of your own applications. 037 * These activities include: offering paid services to customers as an ASP, 038 * serving PDFs on the fly in a web application, shipping iText with a closed 039 * source product. 040 * 041 * For more information, please contact iText Software Corp. at this 042 * address: sales@itextpdf.com 043 */ 044package com.itextpdf.text.pdf.parser; 045 046import java.io.IOException; 047 048import com.itextpdf.text.pdf.PdfReader; 049 050/** 051 * Extracts text from a PDF file. 052 * @since 2.1.4 053 */ 054public final class PdfTextExtractor { 055 056 /** 057 * This class only contains static methods. 058 */ 059 private PdfTextExtractor() { 060 } 061 062 /** 063 * Extract text from a specified page using an extraction strategy. 064 * @param reader the reader to extract text from 065 * @param pageNumber the page to extract text from 066 * @param strategy the strategy to use for extracting text 067 * @return the extracted text 068 * @throws IOException if any operation fails while reading from the provided PdfReader 069 * @since 5.0.2 070 */ 071 public static String getTextFromPage(PdfReader reader, int pageNumber, TextExtractionStrategy strategy) throws IOException{ 072 PdfReaderContentParser parser = new PdfReaderContentParser(reader); 073 return parser.processContent(pageNumber, strategy).getResultantText(); 074 075 } 076 077 /** 078 * Extract text from a specified page using the default strategy. 079 * <p><strong>Note:</strong> the default strategy is subject to change. If using a specific strategy 080 * is important, use {@link PdfTextExtractor#getTextFromPage(PdfReader, int, TextExtractionStrategy)} 081 * @param reader the reader to extract text from 082 * @param pageNumber the page to extract text from 083 * @return the extracted text 084 * @throws IOException if any operation fails while reading from the provided PdfReader 085 * @since 5.0.2 086 */ 087 public static String getTextFromPage(PdfReader reader, int pageNumber) throws IOException{ 088 return getTextFromPage(reader, pageNumber, new LocationTextExtractionStrategy()); 089 } 090 091}