/*
 * $Id: CMapAwareDocumentFont.java 4874 2011-05-19 21:32:56Z psoares33 $
 *
 * This file is part of the iText (R) project.
 * Copyright (c) 1998-2011 1T3XT BVBA
 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3
 * as published by the Free Software Foundation with the addition of the
 * following permission added to Section 15 as permitted in Section 7(a):
 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, see http://www.gnu.org/licenses or write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
 * http://itextpdf.com/terms-of-use/
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public License,
 * a covered work must retain the producer line in every PDF that is created
 * or manipulated using iText.
 *
 * You can be released from the requirements of the license by purchasing
 * a commercial license. Buying such a license is mandatory as soon as you
 * develop commercial activities involving the iText software without
 * disclosing the source code of your own applications.
 * These activities include: offering paid services to customers as an ASP,
 * serving PDFs on the fly in a web application, shipping iText with a closed
 * source product.
 *
 * For more information, please contact iText Software Corp. at this
 * address: sales@itextpdf.com
 */
package com.itextpdf.text.pdf;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import com.itextpdf.text.error_messages.MessageLocalization;

import com.itextpdf.text.pdf.fonts.cmaps.CMap;
import com.itextpdf.text.pdf.fonts.cmaps.CMapParser;


/**
 * Implementation of DocumentFont used while parsing PDF streams.
 * <p>
 * Character codes are decoded with the font's ToUnicode CMap when one is
 * present and could be parsed; single-byte codes that the CMap does not cover
 * (or all single-byte codes when there is no CMap) fall back to a table built
 * from the font's encoding.
 * @since 2.1.4
 */
public class CMapAwareDocumentFont extends DocumentFont {

    /** Number of distinct single-byte character codes. */
    private static final int SINGLE_BYTE_CODES = 256;

    /** The font dictionary. */
    private PdfDictionary fontDic;
    /** the width of a space for this font, in normalized 1000 point units */
    private int spaceWidth;
    /**
     * The CMap constructed from the ToUnicode map from the font's dictionary, if present.
     * This CMap transforms CID values into unicode equivalent.
     * Stays <code>null</code> when the entry is absent or could not be parsed.
     */
    private CMap toUnicodeCmap;
    /**
     * Mapping between CID code (single byte only for now) and unicode equivalent
     * as derived by the font's encoding. Stays <code>null</code> when the font's
     * uni2byte table is empty, so every reader must check for <code>null</code>.
     */
    private char[] cidbyte2uni;

    /**
     * Creates an instance of a CMapAwareFont based on an indirect reference to a font.
     * @param refFont the indirect reference to a font
     */
    public CMapAwareDocumentFont(PRIndirectReference refFont) {
        super(refFont);
        fontDic = (PdfDictionary) PdfReader.getPdfObjectRelease(refFont);

        processToUnicode();
        // The encoding-based table is built even when a ToUnicode CMap exists:
        // decodeSingleCID falls back to it for single-byte codes the CMap does not map.
        processUni2Byte();

        spaceWidth = super.getWidth(' ');
        if (spaceWidth == 0) {
            // the font does not give a width for the space character; use the average glyph width
            spaceWidth = computeAverageWidth();
        }
    }

    /**
     * Parses the ToUnicode entry, if present, and constructs a CMap for it.
     * A stream that cannot be read or parsed is ignored and leaves
     * {@link #toUnicodeCmap} unset.
     * @since 2.1.7
     */
    private void processToUnicode() {
        PdfObject toUni = PdfReader.getPdfObjectRelease(fontDic.get(PdfName.TOUNICODE));
        if (!(toUni instanceof PRStream)) {
            return;
        }

        try {
            byte[] touni = PdfReader.getStreamBytes((PRStream) toUni);
            CMapParser cmapParser = new CMapParser();
            toUnicodeCmap = cmapParser.parse(new ByteArrayInputStream(touni));
        } catch (IOException ignored) {
            // Deliberately best-effort: technically we should log this or provide some
            // sort of feedback, but sometimes the cmap will be junk while it is still
            // possible to get text, so we don't want to throw an exception.
        }
    }

    /**
     * Inverts DocumentFont's uni2byte mapping to obtain a cid-to-unicode mapping based
     * on the font's encoding. Does nothing (and leaves {@link #cidbyte2uni}
     * <code>null</code>) when the uni2byte table is empty.
     * @since 2.1.7
     */
    private void processUni2Byte() {
        IntHashtable uni2byte = getUni2Byte();
        int[] unicodes = uni2byte.toOrderedKeys();
        if (unicodes.length == 0) {
            return;
        }

        cidbyte2uni = new char[SINGLE_BYTE_CODES];
        for (int k = 0; k < unicodes.length; ++k) {
            int code = uni2byte.get(unicodes[k]);

            // This is messy - an encoding can have multiple unicode values mapping to the
            // same cid - we arbitrarily keep the first one encountered. What we really need
            // to do is to parse the encoding and handle the differences info ourselves, but
            // that would duplicate what DocumentFont already does.
            if (code < SINGLE_BYTE_CODES && cidbyte2uni[code] == 0) {
                cidbyte2uni[code] = (char) unicodes[k];
            }
        }

        IntHashtable diffmap = getDiffmap();
        if (diffmap != null) {
            // the difference array overrides the existing encoding
            unicodes = diffmap.toOrderedKeys();
            for (int k = 0; k < unicodes.length; ++k) {
                int code = diffmap.get(unicodes[k]);
                if (code < SINGLE_BYTE_CODES) {
                    cidbyte2uni[code] = (char) unicodes[k];
                }
            }
        }
    }

    /**
     * For all widths of all glyphs, compute the average width in normalized 1000 point units.
     * This is used to give some meaningful width in cases where we need an average font width
     * (such as if the width of a space isn't specified by a given font)
     * @return the average width of all non-zero width glyphs in the font,
     *         or 0 when every width is zero
     */
    private int computeAverageWidth() {
        int count = 0;
        int total = 0;
        for (int i = 0; i < super.widths.length; i++) {
            if (super.widths[i] != 0) {
                total += super.widths[i];
                count++;
            }
        }
        return count != 0 ? total / count : 0;
    }

    /**
     * Override to allow special handling for fonts that don't specify width of space character.
     * @param char1 the character whose width is requested
     * @return the space width determined at construction time when <code>char1</code> is a
     *         space, otherwise whatever the superclass reports
     * @see com.itextpdf.text.pdf.DocumentFont#getWidth(int)
     * @since 2.1.5
     */
    public int getWidth(int char1) {
        if (char1 == ' ') {
            return spaceWidth;
        }
        return super.getWidth(char1);
    }

    /**
     * Decodes a single CID (represented by one or two bytes) to a unicode String.
     * @param bytes the bytes making up the character code to convert
     * @param offset an offset
     * @param len a length
     * @return the decoded text; <code>null</code> when a ToUnicode CMap exists but has no
     *         entry for the code and no single-byte encoding fallback applies; the empty
     *         string when the font offers neither a ToUnicode CMap nor an encoding table
     */
    private String decodeSingleCID(byte[] bytes, int offset, int len) {
        if (toUnicodeCmap != null) {
            if (offset + len > bytes.length) {
                throw new ArrayIndexOutOfBoundsException(MessageLocalization.getComposedMessage("invalid.index.1", offset + len));
            }
            String s = toUnicodeCmap.lookup(bytes, offset, len);
            if (s != null) {
                return s;
            }
            if (len != 1 || cidbyte2uni == null) {
                return null;
            }
        }

        if (len == 1) {
            if (cidbyte2uni == null) {
                // No ToUnicode CMap and no encoding table: nothing can be decoded.
                // Returning "" (rather than dereferencing the null table) avoids a
                // NullPointerException and keeps decode() from trying a two-byte lookup.
                return "";
            }
            return new String(cidbyte2uni, 0xff & bytes[offset], 1);
        }

        throw new Error("Multi-byte glyphs not implemented yet");
    }

    /**
     * Decodes a string of bytes (encoded in the font's encoding) into a unicode string
     * This will use the ToUnicode map of the font, if available, otherwise it uses
     * the font's encoding. Codes for which no mapping can be found contribute nothing
     * to the result.
     * @param cidbytes the bytes that need to be decoded
     * @param offset index of the first byte to decode
     * @param len number of bytes to decode
     * @return the unicode String that results from decoding
     * @since 2.1.7
     */
    public String decode(byte[] cidbytes, final int offset, final int len) {
        StringBuffer sb = new StringBuffer(); // it's a shame we can't make this StringBuilder
        final int end = offset + len;
        for (int i = offset; i < end; i++) {
            String rslt = decodeSingleCID(cidbytes, i, 1);
            if (rslt == null && i < end - 1) {
                // no single-byte mapping: retry as a two-byte code and consume both bytes
                rslt = decodeSingleCID(cidbytes, i, 2);
                i++;
            }
            // StringBuffer.append(null) would emit the literal text "null"
            if (rslt != null) {
                sb.append(rslt);
            }
        }

        return sb.toString();
    }

    /**
     * Encodes bytes to a String.
     * @param bytes the bytes from a stream
     * @param offset an offset
     * @param len a length
     * @return a String encoded taking into account if the bytes are in unicode or not.
     * @deprecated method name is not indicative of what it does. Use <code>decode</code> instead.
     */
    public String encode(byte[] bytes, int offset, int len) {
        return decode(bytes, offset, len);
    }
}