001/* 002 * $Id: SimpleTextExtractionStrategy.java 4898 2011-06-07 20:41:18Z psoares33 $ 003 * 004 * This file is part of the iText (R) project. 005 * Copyright (c) 1998-2011 1T3XT BVBA 006 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al. 007 * 008 * This program is free software; you can redistribute it and/or modify 009 * it under the terms of the GNU Affero General Public License version 3 010 * as published by the Free Software Foundation with the addition of the 011 * following permission added to Section 15 as permitted in Section 7(a): 012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT, 013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS. 014 * 015 * This program is distributed in the hope that it will be useful, but 016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 017 * or FITNESS FOR A PARTICULAR PURPOSE. 018 * See the GNU Affero General Public License for more details. 019 * You should have received a copy of the GNU Affero General Public License 020 * along with this program; if not, see http://www.gnu.org/licenses or write to 021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 022 * Boston, MA, 02110-1301 USA, or download the license from the following URL: 023 * http://itextpdf.com/terms-of-use/ 024 * 025 * The interactive user interfaces in modified source and object code versions 026 * of this program must display Appropriate Legal Notices, as required under 027 * Section 5 of the GNU Affero General Public License. 028 * 029 * In accordance with Section 7(b) of the GNU Affero General Public License, 030 * a covered work must retain the producer line in every PDF that is created 031 * or manipulated using iText. 032 * 033 * You can be released from the requirements of the license by purchasing 034 * a commercial license. Buying such a license is mandatory as soon as you 035 * develop commercial activities involving the iText software without 036 * disclosing the source code of your own applications. 037 * These activities include: offering paid services to customers as an ASP, 038 * serving PDFs on the fly in a web application, shipping iText with a closed 039 * source product. 040 * 041 * For more information, please contact iText Software Corp. at this 042 * address: sales@itextpdf.com 043 */ 044package com.itextpdf.text.pdf.parser; 045 046 047/** 048 * A simple text extraction renderer. 049 * 050 * This renderer keeps track of the current Y position of each string. If it detects 051 * that the y position has changed, it inserts a line break into the output. If the 052 * PDF renders text in a non-top-to-bottom fashion, this will result in the text not 053 * being a true representation of how it appears in the PDF. 054 * 055 * This renderer also uses a simple strategy based on the font metrics to determine if 056 * a blank space should be inserted into the output. 057 * 058 * @since 2.1.5 059 */ 060public class SimpleTextExtractionStrategy implements TextExtractionStrategy { 061 062 private Vector lastStart; 063 private Vector lastEnd; 064 065 /** used to store the resulting String. */ 066 private final StringBuffer result = new StringBuffer();; 067 068 /** 069 * Creates a new text extraction renderer. 070 */ 071 public SimpleTextExtractionStrategy() { 072 } 073 074 /** 075 * @since 5.0.1 076 */ 077 public void beginTextBlock() { 078 } 079 080 /** 081 * @since 5.0.1 082 */ 083 public void endTextBlock() { 084 } 085 086 /** 087 * Returns the result so far. 088 * @return a String with the resulting text. 089 */ 090 public String getResultantText(){ 091 return result.toString(); 092 } 093 094 /** 095 * Captures text using a simplified algorithm for inserting hard returns and spaces 096 * @param renderInfo render info 097 */ 098 public void renderText(TextRenderInfo renderInfo) { 099 boolean firstRender = result.length() == 0; 100 boolean hardReturn = false; 101 102 LineSegment segment = renderInfo.getBaseline(); 103 Vector start = segment.getStartPoint(); 104 Vector end = segment.getEndPoint(); 105 106 if (!firstRender){ 107 Vector x0 = start; 108 Vector x1 = lastStart; 109 Vector x2 = lastEnd; 110 111 // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html 112 float dist = (x2.subtract(x1)).cross((x1.subtract(x0))).lengthSquared() / x2.subtract(x1).lengthSquared(); 113 114 float sameLineThreshold = 1f; // we should probably base this on the current font metrics, but 1 pt seems to be sufficient for the time being 115 if (dist > sameLineThreshold) 116 hardReturn = true; 117 118 // Note: Technically, we should check both the start and end positions, in case the angle of the text changed without any displacement 119 // but this sort of thing probably doesn't happen much in reality, so we'll leave it alone for now 120 } 121 122 if (hardReturn){ 123 //System.out.println("<< Hard Return >>"); 124 result.append('\n'); 125 } else if (!firstRender){ 126 if (result.charAt(result.length()-1) != ' ' && renderInfo.getText().length() > 0 && renderInfo.getText().charAt(0) != ' '){ // we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space 127 float spacing = lastEnd.subtract(start).length(); 128 if (spacing > renderInfo.getSingleSpaceWidth()/2f){ 129 result.append(' '); 130 //System.out.println("Inserting implied space before '" + renderInfo.getText() + "'"); 131 } 132 } 133 } else { 134 //System.out.println("Displaying first string of content '" + text + "' :: x1 = " + x1); 135 } 136 137 //System.out.println("[" + renderInfo.getStartPoint() + "]->[" + renderInfo.getEndPoint() + "] " + renderInfo.getText()); 138 result.append(renderInfo.getText()); 139 140 lastStart = start; 141 lastEnd = end; 142 143 } 144 145 /** 146 * no-op method - this renderer isn't interested in image events 147 * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(com.itextpdf.text.pdf.parser.ImageRenderInfo) 148 * @since 5.0.1 149 */ 150 public void renderImage(ImageRenderInfo renderInfo) { 151 // do nothing - we aren't tracking images in this renderer 152 } 153 154 155}