001/*
002 * $Id: LocationTextExtractionStrategy.java 4784 2011-03-15 08:33:00Z blowagie $
003 *
004 * This file is part of the iText (R) project.
005 * Copyright (c) 1998-2011 1T3XT BVBA
006 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al.
007 *
008 * This program is free software; you can redistribute it and/or modify
009 * it under the terms of the GNU Affero General Public License version 3
010 * as published by the Free Software Foundation with the addition of the
011 * following permission added to Section 15 as permitted in Section 7(a):
012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
014 *
015 * This program is distributed in the hope that it will be useful, but
016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
017 * or FITNESS FOR A PARTICULAR PURPOSE.
018 * See the GNU Affero General Public License for more details.
019 * You should have received a copy of the GNU Affero General Public License
020 * along with this program; if not, see http://www.gnu.org/licenses or write to
021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
022 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
023 * http://itextpdf.com/terms-of-use/
024 *
025 * The interactive user interfaces in modified source and object code versions
026 * of this program must display Appropriate Legal Notices, as required under
027 * Section 5 of the GNU Affero General Public License.
028 *
029 * In accordance with Section 7(b) of the GNU Affero General Public License,
030 * a covered work must retain the producer line in every PDF that is created
031 * or manipulated using iText.
032 *
033 * You can be released from the requirements of the license by purchasing
034 * a commercial license. Buying such a license is mandatory as soon as you
035 * develop commercial activities involving the iText software without
036 * disclosing the source code of your own applications.
037 * These activities include: offering paid services to customers as an ASP,
038 * serving PDFs on the fly in a web application, shipping iText with a closed
039 * source product.
040 *
041 * For more information, please contact iText Software Corp. at this
042 * address: sales@itextpdf.com
043 */
044package com.itextpdf.text.pdf.parser;
045
046import java.util.ArrayList;
047import java.util.Collections;
048import java.util.Iterator;
049import java.util.List;
050
051
052/**
053 * <b>Development preview</b> - this class (and all of the parser classes) are still experiencing
054 * heavy development, and are subject to change both behavior and interface.
055 * <br>
056 * A text extraction renderer that keeps track of relative position of text on page
057 * The resultant text will be relatively consistent with the physical layout that most
058 * PDF files have on screen.
059 * <br>
060 * This renderer keeps track of the orientation and distance (both perpendicular
061 * and parallel) to the unit vector of the orientation.  Text is ordered by
062 * orientation, then perpendicular, then parallel distance.  Text with the same
063 * perpendicular distance, but different parallel distance is treated as being on
064 * the same line.
065 * <br>
066 * This renderer also uses a simple strategy based on the font metrics to determine if
067 * a blank space should be inserted into the output.
068 *
069 * @since   5.0.2
070 */
071public class LocationTextExtractionStrategy implements TextExtractionStrategy {
072
073    /** set to true for debugging */
074    static boolean DUMP_STATE = false;
075    
076    /** a summary of all found text */
077    private final List<TextChunk> locationalResult = new ArrayList<TextChunk>();
078
079    /**
080     * Creates a new text extraction renderer.
081     */
082    public LocationTextExtractionStrategy() {
083    }
084
085    /**
086     * @see com.itextpdf.text.pdf.parser.RenderListener#beginTextBlock()
087     */
088    public void beginTextBlock(){
089    }
090
091    /**
092     * @see com.itextpdf.text.pdf.parser.RenderListener#endTextBlock()
093     */
094    public void endTextBlock(){
095    }
096
097    /**
098     * Returns the result so far.
099     * @return  a String with the resulting text.
100     */
101    public String getResultantText(){
102
103        if (DUMP_STATE) dumpState();
104        
105        Collections.sort(locationalResult);
106
107        StringBuffer sb = new StringBuffer();
108        TextChunk lastChunk = null;
109        for (TextChunk chunk : locationalResult) {
110
111            if (lastChunk == null){
112                sb.append(chunk.text);
113            } else {
114                if (chunk.sameLine(lastChunk)){
115                    float dist = chunk.distanceFromEndOf(lastChunk);
116                    
117                    if (dist < -chunk.charSpaceWidth)
118                        sb.append(' ');
119
120                    // we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space
121                    else if (dist > chunk.charSpaceWidth/2.0f && chunk.text.charAt(0) != ' ' && lastChunk.text.charAt(lastChunk.text.length()-1) != ' ')
122                        sb.append(' ');
123
124                    sb.append(chunk.text);
125                } else {
126                    sb.append('\n');
127                    sb.append(chunk.text);
128                }
129            }
130            lastChunk = chunk;
131        }
132
133        return sb.toString();
134
135    }
136
137    /** Used for debugging only */
138    private void dumpState(){
139        for (Iterator<TextChunk> iterator = locationalResult.iterator(); iterator.hasNext(); ) {
140            TextChunk location = (TextChunk) iterator.next();
141            
142            location.printDiagnostics();
143            
144            System.out.println();
145        }
146        
147    }
148    
149    /**
150     * 
151     * @see com.itextpdf.text.pdf.parser.RenderListener#renderText(com.itextpdf.text.pdf.parser.TextRenderInfo)
152     */
153    public void renderText(TextRenderInfo renderInfo) {
154        LineSegment segment = renderInfo.getBaseline();
155        TextChunk location = new TextChunk(renderInfo.getText(), segment.getStartPoint(), segment.getEndPoint(), renderInfo.getSingleSpaceWidth());
156        locationalResult.add(location);        
157    }
158    
159
160
161    /**
162     * Represents a chunk of text, it's orientation, and location relative to the orientation vector
163     */
164    private static class TextChunk implements Comparable<TextChunk>{
165        /** the text of the chunk */
166        final String text;
167        /** the starting location of the chunk */
168        final Vector startLocation;
169        /** the ending location of the chunk */
170        final Vector endLocation;
171        /** unit vector in the orientation of the chunk */
172        final Vector orientationVector;
173        /** the orientation as a scalar for quick sorting */
174        final int orientationMagnitude;
175        /** perpendicular distance to the orientation unit vector (i.e. the Y position in an unrotated coordinate system)
176         * we round to the nearest integer to handle the fuzziness of comparing floats */
177        final int distPerpendicular;
178        /** distance of the start of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated coordinate system) */
179        final float distParallelStart;
180        /** distance of the end of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated coordinate system) */
181        final float distParallelEnd;
182        /** the width of a single space character in the font of the chunk */
183        final float charSpaceWidth;
184        
185        public TextChunk(String string, Vector startLocation, Vector endLocation, float charSpaceWidth) {
186            this.text = string;
187            this.startLocation = startLocation;
188            this.endLocation = endLocation;
189            this.charSpaceWidth = charSpaceWidth;
190            
191            orientationVector = endLocation.subtract(startLocation).normalize();
192            orientationMagnitude = (int)(Math.atan2(orientationVector.get(Vector.I2), orientationVector.get(Vector.I1))*1000);
193
194            // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
195            // the two vectors we are crossing are in the same plane, so the result will be purely
196            // in the z-axis (out of plane) direction, so we just take the I3 component of the result
197            Vector origin = new Vector(0,0,1);
198            distPerpendicular = (int)(startLocation.subtract(origin)).cross(orientationVector).get(Vector.I3);
199
200            distParallelStart = orientationVector.dot(startLocation);
201            distParallelEnd = orientationVector.dot(endLocation);
202        }
203
204        private void printDiagnostics(){
205            System.out.println("Text (@" + startLocation + " -> " + endLocation + "): " + text);
206            System.out.println("orientationMagnitude: " + orientationMagnitude);
207            System.out.println("distPerpendicular: " + distPerpendicular);
208            System.out.println("distParallel: " + distParallelStart);
209        }
210        
211        /**
212         * @param as the location to compare to
213         * @return true is this location is on the the same line as the other
214         */
215        public boolean sameLine(TextChunk as){
216            if (orientationMagnitude != as.orientationMagnitude) return false;
217            if (distPerpendicular != as.distPerpendicular) return false;
218            return true;
219        }
220
221        /**
222         * Computes the distance between the end of 'other' and the beginning of this chunk
223         * in the direction of this chunk's orientation vector.  Note that it's a bad idea
224         * to call this for chunks that aren't on the same line and orientation, but we don't
225         * explicitly check for that condition for performance reasons.
226         * @param other
227         * @return the number of spaces between the end of 'other' and the beginning of this chunk
228         */
229        public float distanceFromEndOf(TextChunk other){
230            float distance = distParallelStart - other.distParallelEnd;
231            return distance;
232        }
233        
234        /**
235         * Compares based on orientation, perpendicular distance, then parallel distance
236         * @see java.lang.Comparable#compareTo(java.lang.Object)
237         */
238        public int compareTo(TextChunk rhs) {
239            if (this == rhs) return 0; // not really needed, but just in case
240            
241            int rslt;
242            rslt = compareInts(orientationMagnitude, rhs.orientationMagnitude);
243            if (rslt != 0) return rslt;
244
245            rslt = compareInts(distPerpendicular, rhs.distPerpendicular);
246            if (rslt != 0) return rslt;
247
248            // note: it's never safe to check floating point numbers for equality, and if two chunks
249            // are truly right on top of each other, which one comes first or second just doesn't matter
250            // so we arbitrarily choose this way.
251            rslt = distParallelStart < rhs.distParallelStart ? -1 : 1;
252
253            return rslt;
254        }
255
256        /**
257         *
258         * @param int1
259         * @param int2
260         * @return comparison of the two integers
261         */
262        private static int compareInts(int int1, int int2){
263            return int1 == int2 ? 0 : int1 < int2 ? -1 : 1;
264        }
265
266        
267    }
268
269    /**
270     * no-op method - this renderer isn't interested in image events
271     * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(com.itextpdf.text.pdf.parser.ImageRenderInfo)
272     * @since 5.0.1
273     */
274    public void renderImage(ImageRenderInfo renderInfo) {
275        // do nothing
276    }
277
278
279
280}