001/*
002 * $Id: SimpleTextExtractionStrategy.java 4898 2011-06-07 20:41:18Z psoares33 $
003 *
004 * This file is part of the iText (R) project.
005 * Copyright (c) 1998-2011 1T3XT BVBA
006 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al.
007 *
008 * This program is free software; you can redistribute it and/or modify
009 * it under the terms of the GNU Affero General Public License version 3
010 * as published by the Free Software Foundation with the addition of the
011 * following permission added to Section 15 as permitted in Section 7(a):
012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
014 *
015 * This program is distributed in the hope that it will be useful, but
016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
017 * or FITNESS FOR A PARTICULAR PURPOSE.
018 * See the GNU Affero General Public License for more details.
019 * You should have received a copy of the GNU Affero General Public License
020 * along with this program; if not, see http://www.gnu.org/licenses or write to
021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
022 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
023 * http://itextpdf.com/terms-of-use/
024 *
025 * The interactive user interfaces in modified source and object code versions
026 * of this program must display Appropriate Legal Notices, as required under
027 * Section 5 of the GNU Affero General Public License.
028 *
029 * In accordance with Section 7(b) of the GNU Affero General Public License,
030 * a covered work must retain the producer line in every PDF that is created
031 * or manipulated using iText.
032 *
033 * You can be released from the requirements of the license by purchasing
034 * a commercial license. Buying such a license is mandatory as soon as you
035 * develop commercial activities involving the iText software without
036 * disclosing the source code of your own applications.
037 * These activities include: offering paid services to customers as an ASP,
038 * serving PDFs on the fly in a web application, shipping iText with a closed
039 * source product.
040 *
041 * For more information, please contact iText Software Corp. at this
042 * address: sales@itextpdf.com
043 */
044package com.itextpdf.text.pdf.parser;
045
046
047/**
048 * A simple text extraction renderer.
049 * 
050 * This renderer keeps track of the current Y position of each string.  If it detects
051 * that the y position has changed, it inserts a line break into the output.  If the
052 * PDF renders text in a non-top-to-bottom fashion, this will result in the text not
053 * being a true representation of how it appears in the PDF.
054 * 
055 * This renderer also uses a simple strategy based on the font metrics to determine if
056 * a blank space should be inserted into the output.
057 * 
058 * @since       2.1.5
059 */
060public class SimpleTextExtractionStrategy implements TextExtractionStrategy {
061
062    private Vector lastStart;
063    private Vector lastEnd;
064    
065    /** used to store the resulting String. */
066    private final StringBuffer result = new StringBuffer();;
067
068    /**
069     * Creates a new text extraction renderer.
070     */
071    public SimpleTextExtractionStrategy() {
072    }
073
074    /**
075     * @since 5.0.1
076     */
077    public void beginTextBlock() {
078    }
079
080    /**
081     * @since 5.0.1
082     */
083    public void endTextBlock() {
084    }
085    
086    /**
087     * Returns the result so far.
088     * @return  a String with the resulting text.
089     */
090    public String getResultantText(){
091        return result.toString();
092    }
093
094    /**
095     * Captures text using a simplified algorithm for inserting hard returns and spaces
096     * @param   renderInfo      render info
097     */
098    public void renderText(TextRenderInfo renderInfo) {
099        boolean firstRender = result.length() == 0;
100        boolean hardReturn = false;
101
102        LineSegment segment = renderInfo.getBaseline();
103        Vector start = segment.getStartPoint();
104        Vector end = segment.getEndPoint();
105        
106        if (!firstRender){
107            Vector x0 = start;
108            Vector x1 = lastStart;
109            Vector x2 = lastEnd;
110            
111            // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
112            float dist = (x2.subtract(x1)).cross((x1.subtract(x0))).lengthSquared() / x2.subtract(x1).lengthSquared();
113
114            float sameLineThreshold = 1f; // we should probably base this on the current font metrics, but 1 pt seems to be sufficient for the time being
115            if (dist > sameLineThreshold)
116                hardReturn = true;
117            
118            // Note:  Technically, we should check both the start and end positions, in case the angle of the text changed without any displacement
119            // but this sort of thing probably doesn't happen much in reality, so we'll leave it alone for now
120        }
121        
122        if (hardReturn){
123            //System.out.println("<< Hard Return >>");
124            result.append('\n');
125        } else if (!firstRender){ 
126            if (result.charAt(result.length()-1) != ' ' && renderInfo.getText().length() > 0 && renderInfo.getText().charAt(0) != ' '){ // we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space
127                float spacing = lastEnd.subtract(start).length();
128                if (spacing > renderInfo.getSingleSpaceWidth()/2f){
129                    result.append(' ');
130                    //System.out.println("Inserting implied space before '" + renderInfo.getText() + "'");
131                }
132            }
133        } else {
134            //System.out.println("Displaying first string of content '" + text + "' :: x1 = " + x1);
135        }
136        
137        //System.out.println("[" + renderInfo.getStartPoint() + "]->[" + renderInfo.getEndPoint() + "] " + renderInfo.getText());
138        result.append(renderInfo.getText());
139
140        lastStart = start;
141        lastEnd = end;
142        
143    }
144
145    /**
146     * no-op method - this renderer isn't interested in image events
147     * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(com.itextpdf.text.pdf.parser.ImageRenderInfo)
148     * @since 5.0.1
149     */
150    public void renderImage(ImageRenderInfo renderInfo) {
151        // do nothing - we aren't tracking images in this renderer
152    }
153
154
155}