/*
 * $Id: LocationTextExtractionStrategy.java 4784 2011-03-15 08:33:00Z blowagie $
 *
 * This file is part of the iText (R) project.
 * Copyright (c) 1998-2011 1T3XT BVBA
 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3
 * as published by the Free Software Foundation with the addition of the
 * following permission added to Section 15 as permitted in Section 7(a):
 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, see http://www.gnu.org/licenses or write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
 * http://itextpdf.com/terms-of-use/
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public License,
 * a covered work must retain the producer line in every PDF that is created
 * or manipulated using iText.
 *
 * You can be released from the requirements of the license by purchasing
 * a commercial license. Buying such a license is mandatory as soon as you
 * develop commercial activities involving the iText software without
 * disclosing the source code of your own applications.
 * These activities include: offering paid services to customers as an ASP,
 * serving PDFs on the fly in a web application, shipping iText with a closed
 * source product.
 *
 * For more information, please contact iText Software Corp. at this
 * address: sales@itextpdf.com
 */
package com.itextpdf.text.pdf.parser;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;


/**
 * <b>Development preview</b> - this class (and all of the parser classes) are still experiencing
 * heavy development, and are subject to change both behavior and interface.
 * <br>
 * A text extraction renderer that keeps track of relative position of text on page
 * The resultant text will be relatively consistent with the physical layout that most
 * PDF files have on screen.
 * <br>
 * This renderer keeps track of the orientation and distance (both perpendicular
 * and parallel) to the unit vector of the orientation.  Text is ordered by
 * orientation, then perpendicular, then parallel distance.  Text with the same
 * perpendicular distance, but different parallel distance is treated as being on
 * the same line.
 * <br>
 * This renderer also uses a simple strategy based on the font metrics to determine if
 * a blank space should be inserted into the output.
 *
 * @since   5.0.2
 */
public class LocationTextExtractionStrategy implements TextExtractionStrategy {

    /** set to true for debugging */
    static boolean DUMP_STATE = false;

    /** a summary of all found text */
    private final List<TextChunk> locationalResult = new ArrayList<TextChunk>();

    /**
     * Creates a new text extraction renderer.
     */
    public LocationTextExtractionStrategy() {
    }

    /**
     * No-op: this renderer does not react to the start of a text block.
     * @see com.itextpdf.text.pdf.parser.RenderListener#beginTextBlock()
     */
    public void beginTextBlock(){
    }

    /**
     * No-op: this renderer does not react to the end of a text block.
     * @see com.itextpdf.text.pdf.parser.RenderListener#endTextBlock()
     */
    public void endTextBlock(){
    }

    /**
     * Returns the result so far.
     * <p>
     * The collected chunks are sorted in place (orientation, then perpendicular distance,
     * then parallel start) and concatenated. Chunks on different lines are separated by
     * a newline; chunks on the same line are separated by a single blank when the gap
     * between them suggests one and neither side already supplies a space.
     *
     * @return  a String with the resulting text.
     */
    public String getResultantText(){

        if (DUMP_STATE) dumpState();

        Collections.sort(locationalResult);

        // the accumulator never leaves this method, so the unsynchronized builder is sufficient
        StringBuilder sb = new StringBuilder();
        TextChunk lastChunk = null;
        for (TextChunk chunk : locationalResult) {

            if (lastChunk == null){
                sb.append(chunk.text);
            } else {
                if (chunk.sameLine(lastChunk)){
                    float dist = chunk.distanceFromEndOf(lastChunk);

                    if (dist < -chunk.charSpaceWidth) {
                        // this chunk starts more than one space width before the end of the previous chunk
                        sb.append(' ');
                    }
                    // we only insert a blank space if the trailing character of the previous string wasn't a space,
                    // and the leading character of the current string isn't a space
                    else if (dist > chunk.charSpaceWidth/2.0f && !startsWithSpace(chunk.text) && !endsWithSpace(lastChunk.text)) {
                        sb.append(' ');
                    }

                    sb.append(chunk.text);
                } else {
                    sb.append('\n');
                    sb.append(chunk.text);
                }
            }
            lastChunk = chunk;
        }

        return sb.toString();

    }

    /**
     * Length-safe check of the first character, so that an empty chunk text
     * does not trigger a StringIndexOutOfBoundsException.
     *
     * @param str the string to inspect
     * @return true if str is non-empty and its first character is a blank
     */
    private static boolean startsWithSpace(String str){
        return str.length() != 0 && str.charAt(0) == ' ';
    }

    /**
     * Length-safe check of the last character, so that an empty chunk text
     * does not trigger a StringIndexOutOfBoundsException.
     *
     * @param str the string to inspect
     * @return true if str is non-empty and its last character is a blank
     */
    private static boolean endsWithSpace(String str){
        return str.length() != 0 && str.charAt(str.length() - 1) == ' ';
    }

    /** Used for debugging only: prints the diagnostics of every collected chunk to System.out */
    private void dumpState(){
        for (TextChunk location : locationalResult) {
            location.printDiagnostics();

            System.out.println();
        }

    }

    /**
     * Records the rendered text together with the start and end point of its baseline
     * and the width of a single space, for later ordering in {@link #getResultantText()}.
     *
     * @see com.itextpdf.text.pdf.parser.RenderListener#renderText(com.itextpdf.text.pdf.parser.TextRenderInfo)
     */
    public void renderText(TextRenderInfo renderInfo) {
        LineSegment segment = renderInfo.getBaseline();
        TextChunk location = new TextChunk(renderInfo.getText(), segment.getStartPoint(), segment.getEndPoint(), renderInfo.getSingleSpaceWidth());
        locationalResult.add(location);
    }



    /**
     * Represents a chunk of text, its orientation, and location relative to the orientation vector
     */
    private static class TextChunk implements Comparable<TextChunk>{
        /** the text of the chunk */
        final String text;
        /** the starting location of the chunk */
        final Vector startLocation;
        /** the ending location of the chunk */
        final Vector endLocation;
        /** unit vector in the orientation of the chunk */
        final Vector orientationVector;
        /** the orientation as a scalar for quick sorting (atan2 angle times 1000, truncated to an int) */
        final int orientationMagnitude;
        /** perpendicular distance to the orientation unit vector (i.e. the Y position in an unrotated coordinate system)
         * the value is truncated to an integer (by cast) to handle the fuzziness of comparing floats */
        final int distPerpendicular;
        /** distance of the start of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated coordinate system) */
        final float distParallelStart;
        /** distance of the end of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated coordinate system) */
        final float distParallelEnd;
        /** the width of a single space character in the font of the chunk */
        final float charSpaceWidth;

        /**
         * @param string the text of the chunk
         * @param startLocation the start point of the chunk's baseline
         * @param endLocation the end point of the chunk's baseline
         * @param charSpaceWidth the width of a single space character in the font of the chunk
         */
        public TextChunk(String string, Vector startLocation, Vector endLocation, float charSpaceWidth) {
            this.text = string;
            this.startLocation = startLocation;
            this.endLocation = endLocation;
            this.charSpaceWidth = charSpaceWidth;

            orientationVector = endLocation.subtract(startLocation).normalize();
            orientationMagnitude = (int)(Math.atan2(orientationVector.get(Vector.I2), orientationVector.get(Vector.I1))*1000);

            // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
            // the two vectors we are crossing are in the same plane, so the result will be purely
            // in the z-axis (out of plane) direction, so we just take the I3 component of the result
            Vector origin = new Vector(0,0,1);
            // note: the (int) cast applies to the I3 component returned by get(), truncating toward zero
            distPerpendicular = (int)(startLocation.subtract(origin)).cross(orientationVector).get(Vector.I3);

            distParallelStart = orientationVector.dot(startLocation);
            distParallelEnd = orientationVector.dot(endLocation);
        }

        /** Prints the text, end points and sort keys of this chunk to System.out (debugging aid) */
        private void printDiagnostics(){
            System.out.println("Text (@" + startLocation + " -> " + endLocation + "): " + text);
            System.out.println("orientationMagnitude: " + orientationMagnitude);
            System.out.println("distPerpendicular: " + distPerpendicular);
            System.out.println("distParallel: " + distParallelStart);
        }

        /**
         * @param as the location to compare to
         * @return true if this location is on the same line as the other, i.e. both the
         *         orientation magnitude and the perpendicular distance are equal
         */
        public boolean sameLine(TextChunk as){
            if (orientationMagnitude != as.orientationMagnitude) return false;
            if (distPerpendicular != as.distPerpendicular) return false;
            return true;
        }

        /**
         * Computes the distance between the end of 'other' and the beginning of this chunk
         * in the direction of this chunk's orientation vector.  Note that it's a bad idea
         * to call this for chunks that aren't on the same line and orientation, but we don't
         * explicitly check for that condition for performance reasons.
         * @param other the chunk whose end is used as the reference point
         * @return the parallel start of this chunk minus the parallel end of 'other'
         *         (negative when this chunk starts before 'other' ends)
         */
        public float distanceFromEndOf(TextChunk other){
            return distParallelStart - other.distParallelEnd;
        }

        /**
         * Compares based on orientation, perpendicular distance, then parallel distance
         * @see java.lang.Comparable#compareTo(java.lang.Object)
         */
        public int compareTo(TextChunk rhs) {
            if (this == rhs) return 0; // not really needed, but just in case

            int rslt;
            rslt = compareInts(orientationMagnitude, rhs.orientationMagnitude);
            if (rslt != 0) return rslt;

            rslt = compareInts(distPerpendicular, rhs.distPerpendicular);
            if (rslt != 0) return rslt;

            // Float.compare yields 0 for equal starts and a consistent ordering for NaN, so that
            // a.compareTo(b) and b.compareTo(a) always have opposite signs. Answering "greater"
            // in both directions for chunks right on top of each other breaks the Comparable
            // contract, which sort implementations are allowed to reject with an exception.
            return Float.compare(distParallelStart, rhs.distParallelStart);
        }

        /**
         * Three-way comparison of two ints that cannot overflow (unlike subtraction).
         *
         * @param int1 the left operand
         * @param int2 the right operand
         * @return comparison of the two integers: 0 if equal, -1 if int1 is smaller, 1 otherwise
         */
        private static int compareInts(int int1, int int2){
            return int1 == int2 ? 0 : int1 < int2 ? -1 : 1;
        }


    }

    /**
     * no-op method - this renderer isn't interested in image events
     * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(com.itextpdf.text.pdf.parser.ImageRenderInfo)
     * @since 5.0.1
     */
    public void renderImage(ImageRenderInfo renderInfo) {
        // do nothing
    }



}