001/*
002 * $Id: CMapAwareDocumentFont.java 4874 2011-05-19 21:32:56Z psoares33 $
003 *
004 * This file is part of the iText (R) project.
005 * Copyright (c) 1998-2011 1T3XT BVBA
006 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al.
007 *
008 * This program is free software; you can redistribute it and/or modify
009 * it under the terms of the GNU Affero General Public License version 3
010 * as published by the Free Software Foundation with the addition of the
011 * following permission added to Section 15 as permitted in Section 7(a):
012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
014 *
015 * This program is distributed in the hope that it will be useful, but
016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
017 * or FITNESS FOR A PARTICULAR PURPOSE.
018 * See the GNU Affero General Public License for more details.
019 * You should have received a copy of the GNU Affero General Public License
020 * along with this program; if not, see http://www.gnu.org/licenses or write to
021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
022 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
023 * http://itextpdf.com/terms-of-use/
024 *
025 * The interactive user interfaces in modified source and object code versions
026 * of this program must display Appropriate Legal Notices, as required under
027 * Section 5 of the GNU Affero General Public License.
028 *
029 * In accordance with Section 7(b) of the GNU Affero General Public License,
030 * a covered work must retain the producer line in every PDF that is created
031 * or manipulated using iText.
032 *
033 * You can be released from the requirements of the license by purchasing
034 * a commercial license. Buying such a license is mandatory as soon as you
035 * develop commercial activities involving the iText software without
036 * disclosing the source code of your own applications.
037 * These activities include: offering paid services to customers as an ASP,
038 * serving PDFs on the fly in a web application, shipping iText with a closed
039 * source product.
040 *
041 * For more information, please contact iText Software Corp. at this
042 * address: sales@itextpdf.com
043 */
044package com.itextpdf.text.pdf;
045
046import java.io.ByteArrayInputStream;
047import java.io.IOException;
048import com.itextpdf.text.error_messages.MessageLocalization;
049
050import com.itextpdf.text.pdf.fonts.cmaps.CMap;
051import com.itextpdf.text.pdf.fonts.cmaps.CMapParser;
052
053
054/**
055 * Implementation of DocumentFont used while parsing PDF streams.
056 * @since 2.1.4
057 */
058public class CMapAwareDocumentFont extends DocumentFont {
059
060        /** The font dictionary. */
061    private PdfDictionary fontDic;
062    /** the width of a space for this font, in normalized 1000 point units */
063    private int spaceWidth;
064    /** The CMap constructed from the ToUnicode map from the font's dictionary, if present.
065         *  This CMap transforms CID values into unicode equivalent
066         */
067    private CMap toUnicodeCmap;
068        /**
069         *      Mapping between CID code (single byte only for now) and unicode equivalent
070         *  as derived by the font's encoding.  Only needed if the ToUnicode CMap is not provided.
071         */
072    private char[] cidbyte2uni;
073    
074    /**
075     * Creates an instance of a CMapAwareFont based on an indirect reference to a font.
076     * @param refFont   the indirect reference to a font
077     */
078    public CMapAwareDocumentFont(PRIndirectReference refFont) {
079        super(refFont);
080        fontDic = (PdfDictionary)PdfReader.getPdfObjectRelease(refFont);
081
082        processToUnicode();
083        //if (toUnicodeCmap == null)
084            processUni2Byte();
085        
086        spaceWidth = super.getWidth(' ');
087        if (spaceWidth == 0){
088            spaceWidth = computeAverageWidth();
089        }
090        
091    }
092
093    /**
094     * Parses the ToUnicode entry, if present, and constructs a CMap for it
095     * @since 2.1.7
096     */
097    private void processToUnicode(){
098        
099        PdfObject toUni = PdfReader.getPdfObjectRelease(fontDic.get(PdfName.TOUNICODE));
100        if (toUni instanceof PRStream){
101            
102            try {
103                byte[] touni = PdfReader.getStreamBytes((PRStream)toUni);
104    
105                CMapParser cmapParser = new CMapParser();
106                toUnicodeCmap = cmapParser.parse(new ByteArrayInputStream(touni));
107            } catch (IOException e) {
108                // technically, we should log this or provide some sort of feedback... but sometimes the cmap will be junk, but it's still possible to get text, so we don't want to throw an exception
109                //throw new IllegalStateException("Unable to process ToUnicode map - " + e.getMessage(), e);
110            }
111        }
112    }
113    
114    /**
115     * Inverts DocumentFont's uni2byte mapping to obtain a cid-to-unicode mapping based
116     * on the font's encoding
117     * @since 2.1.7
118     */
119    private void processUni2Byte(){
120        IntHashtable uni2byte = getUni2Byte();
121        int e[] = uni2byte.toOrderedKeys();
122        if (e.length == 0)
123            return;
124        
125        cidbyte2uni = new char[256];
126        for (int k = 0; k < e.length; ++k) {
127            int n = uni2byte.get(e[k]);
128            
129            // this is messy, messy - an encoding can have multiple unicode values mapping to the same cid - we are going to arbitrarily choose the first one
130            // what we really need to do is to parse the encoding, and handle the differences info ourselves.  This is a huge duplication of code of what is already
131            // being done in DocumentFont, so I really hate to go down that path without seriously thinking about a change in the organization of the Font class hierarchy
132            if (n < 256 && cidbyte2uni[n] == 0)
133                cidbyte2uni[n] = (char)e[k];
134        }
135        IntHashtable diffmap = getDiffmap();
136        if (diffmap != null) {
137            // the difference array overrides the existing encoding
138            e = diffmap.toOrderedKeys();
139            for (int k = 0; k < e.length; ++k) {
140                int n = diffmap.get(e[k]);
141                if (n < 256)
142                    cidbyte2uni[n] = (char)e[k];
143            }
144        }
145    }
146    
147
148    
149    /**
150     * For all widths of all glyphs, compute the average width in normalized 1000 point units.
151     * This is used to give some meaningful width in cases where we need an average font width 
152     * (such as if the width of a space isn't specified by a given font)
153     * @return the average width of all non-zero width glyphs in the font
154     */
155    private int computeAverageWidth(){
156        int count = 0;
157        int total = 0;
158        for(int i = 0; i < super.widths.length; i++){
159            if(super.widths[i] != 0){
160                total += super.widths[i];
161                count++;
162            }
163        }
164        return count != 0 ? total/count : 0;
165    }
166    
167    /**
168     * @since 2.1.5
169     * Override to allow special handling for fonts that don't specify width of space character
170     * @see com.itextpdf.text.pdf.DocumentFont#getWidth(int)
171     */
172    public int getWidth(int char1) {
173        if (char1 == ' ')
174            return spaceWidth;
175        
176        return super.getWidth(char1);
177    }
178    
179    /**
180     * Decodes a single CID (represented by one or two bytes) to a unicode String.
181     * @param bytes             the bytes making up the character code to convert
182     * @param offset    an offset
183     * @param len               a length
184     * @return  a String containing the encoded form of the input bytes using the font's encoding.
185     */
186    private String decodeSingleCID(byte[] bytes, int offset, int len){
187        if (toUnicodeCmap != null){
188            if (offset + len > bytes.length)
189                throw new ArrayIndexOutOfBoundsException(MessageLocalization.getComposedMessage("invalid.index.1", offset + len));
190            String s = toUnicodeCmap.lookup(bytes, offset, len);
191            if (s != null)
192                return s;
193            if (len != 1 || cidbyte2uni == null)
194                return null;
195        }
196
197        if (len == 1){
198            return new String(cidbyte2uni, 0xff & bytes[offset], 1);
199        }
200        
201        throw new Error("Multi-byte glyphs not implemented yet");
202    }
203
204    /**
205     * Decodes a string of bytes (encoded in the font's encoding) into a unicode string
206     * This will use the ToUnicode map of the font, if available, otherwise it uses
207     * the font's encoding
208     * @param cidbytes    the bytes that need to be decoded
209     * @return  the unicode String that results from decoding
210     * @since 2.1.7
211     */
212    public String decode(byte[] cidbytes, final int offset, final int len){
213        StringBuffer sb = new StringBuffer(); // it's a shame we can't make this StringBuilder
214        for(int i = offset; i < offset + len; i++){
215            String rslt = decodeSingleCID(cidbytes, i, 1);
216            if (rslt == null && i < offset + len - 1){
217                rslt = decodeSingleCID(cidbytes, i, 2);
218                i++;
219            }
220            sb.append(rslt);
221        }
222
223        return sb.toString();
224    }
225
226    /**
227     * Encodes bytes to a String.
228     * @param bytes             the bytes from a stream
229     * @param offset    an offset
230     * @param len               a length
231     * @return  a String encoded taking into account if the bytes are in unicode or not.
232     * @deprecated method name is not indicative of what it does.  Use <code>decode</code> instead.
233     */
234    public String encode(byte[] bytes, int offset, int len){
235        return decode(bytes, offset, len);    
236    }
237}