001/** 002 * Copyright (c) 2005-2006, www.fontbox.org 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * 1. Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 2. Redistributions in binary form must reproduce the above copyright notice, 011 * this list of conditions and the following disclaimer in the documentation 012 * and/or other materials provided with the distribution. 013 * 3. Neither the name of fontbox; nor the names of its 014 * contributors may be used to endorse or promote products derived from this 015 * software without specific prior written permission. 016 * 017 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 018 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 020 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY 021 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 022 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 023 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 024 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 025 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 026 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 027 * 028 * http://www.fontbox.org 029 * 030 */ 031package com.itextpdf.text.pdf.fonts.cmaps; 032 033import java.io.FileInputStream; 034import java.io.IOException; 035import java.io.InputStream; 036import java.io.PushbackInputStream; 037import java.util.ArrayList; 038import java.util.HashMap; 039import java.util.List; 040import java.util.Map; 041 042import com.itextpdf.text.error_messages.MessageLocalization; 043 044/** 045 * This will parser a CMap stream. 046 * 047 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> 048 * @version $Revision: 4645 $ 049 * @since 2.1.4 050 */ 051public class CMapParser 052{ 053 private static final String BEGIN_CODESPACE_RANGE = "begincodespacerange"; 054 private static final String BEGIN_BASE_FONT_CHAR = "beginbfchar"; 055 private static final String BEGIN_BASE_FONT_RANGE = "beginbfrange"; 056 057 private static final String MARK_END_OF_DICTIONARY = ">>"; 058 private static final String MARK_END_OF_ARRAY = "]"; 059 060 private byte[] tokenParserByteBuffer = new byte[512]; 061 062 /** 063 * Creates a new instance of CMapParser. 064 */ 065 public CMapParser() 066 { 067 } 068 069 /** 070 * This will parse the stream and create a cmap object. 071 * 072 * @param input The CMAP stream to parse. 073 * @return The parsed stream as a java object. 074 * 075 * @throws IOException If there is an error parsing the stream. 076 */ 077 @SuppressWarnings("unchecked") 078 public CMap parse( InputStream input ) throws IOException 079 { 080 PushbackInputStream cmapStream = new PushbackInputStream( input ); 081 CMap result = new CMap(); 082 Object token = null; 083 while( (token = parseNextToken( cmapStream )) != null ) 084 { 085 if( token instanceof Operator ) 086 { 087 Operator op = (Operator)token; 088 if( op.op.equals( BEGIN_CODESPACE_RANGE ) ) 089 { 090 while (true) 091 { 092 Object nx = parseNextToken( cmapStream ); 093 if (nx instanceof Operator && ((Operator)nx).op.equals("endcodespacerange")) 094 break; 095 byte[] startRange = (byte[])nx; 096 byte[] endRange = (byte[])parseNextToken( cmapStream ); 097 CodespaceRange range = new CodespaceRange(); 098 range.setStart( startRange ); 099 range.setEnd( endRange ); 100 result.addCodespaceRange( range ); 101 } 102 } 103 else if( op.op.equals( BEGIN_BASE_FONT_CHAR ) ) 104 { 105 while (true) 106 { 107 Object nx = parseNextToken( cmapStream ); 108 if (nx instanceof Operator && ((Operator)nx).op.equals("endbfchar")) 109 break; 110 byte[] inputCode = (byte[])nx; 111 Object nextToken = parseNextToken( cmapStream ); 112 if( nextToken instanceof byte[] ) 113 { 114 byte[] bytes = (byte[])nextToken; 115 String value = createStringFromBytes( bytes ); 116 result.addMapping( inputCode, value ); 117 } 118 else if( nextToken instanceof LiteralName ) 119 { 120 result.addMapping( inputCode, ((LiteralName)nextToken).name ); 121 } 122 else 123 { 124 throw new IOException(MessageLocalization.getComposedMessage("error.parsing.cmap.beginbfchar.expected.cosstring.or.cosname.and.not.1", nextToken)); 125 } 126 } 127 } 128 else if( op.op.equals( BEGIN_BASE_FONT_RANGE ) ) 129 { 130 while (true) 131 { 132 Object nx = parseNextToken( cmapStream ); 133 if (nx instanceof Operator && ((Operator)nx).op.equals("endbfrange")) 134 break; 135 byte[] startCode = (byte[])nx; 136 byte[] endCode = (byte[])parseNextToken( cmapStream ); 137 Object nextToken = parseNextToken( cmapStream ); 138 List<byte[]> array = null; 139 byte[] tokenBytes = null; 140 if( nextToken instanceof List ) 141 { 142 array = (List<byte[]>)nextToken; 143 tokenBytes = array.get( 0 ); 144 } 145 else 146 { 147 tokenBytes = (byte[])nextToken; 148 } 149 150 String value = null; 151 152 int arrayIndex = 0; 153 boolean done = false; 154 while( !done ) 155 { 156 if( compare( startCode, endCode ) >= 0 ) 157 { 158 done = true; 159 } 160 value = createStringFromBytes( tokenBytes ); 161 result.addMapping( startCode, value ); 162 increment( startCode ); 163 164 if( array == null ) 165 { 166 increment( tokenBytes ); 167 } 168 else 169 { 170 arrayIndex++; 171 if( arrayIndex < array.size() ) 172 { 173 tokenBytes = array.get( arrayIndex ); 174 } 175 } 176 } 177 } 178 } 179 } 180 } 181 return result; 182 } 183 184 private Object parseNextToken( PushbackInputStream is ) throws IOException 185 { 186 Object retval = null; 187 int nextByte = is.read(); 188 //skip whitespace 189 while( nextByte == 0x09 || nextByte == 0x20 || nextByte == 0x0D || nextByte == 0x0A ) 190 { 191 nextByte = is.read(); 192 } 193 switch( nextByte ) 194 { 195 case '%': 196 { 197 //header operations, for now return the entire line 198 //may need to smarter in the future 199 StringBuffer buffer = new StringBuffer(); 200 buffer.append( (char)nextByte ); 201 readUntilEndOfLine( is, buffer ); 202 retval = buffer.toString(); 203 break; 204 } 205 case '(': 206 { 207 StringBuffer buffer = new StringBuffer(); 208 int stringByte = is.read(); 209 210 while( stringByte != -1 && stringByte != ')' ) 211 { 212 buffer.append( (char)stringByte ); 213 stringByte = is.read(); 214 } 215 retval = buffer.toString(); 216 break; 217 } 218 case '>': 219 { 220 int secondCloseBrace = is.read(); 221 if( secondCloseBrace == '>' ) 222 { 223 retval = MARK_END_OF_DICTIONARY; 224 } 225 else 226 { 227 throw new IOException(MessageLocalization.getComposedMessage("error.expected.the.end.of.a.dictionary")); 228 } 229 break; 230 } 231 case ']': 232 { 233 retval = MARK_END_OF_ARRAY; 234 break; 235 } 236 case '[': 237 { 238 List<Object> list = new ArrayList<Object>(); 239 240 Object nextToken = parseNextToken( is ); 241 while( nextToken != MARK_END_OF_ARRAY ) 242 { 243 list.add( nextToken ); 244 nextToken = parseNextToken( is ); 245 } 246 retval = list; 247 break; 248 } 249 case '<': 250 { 251 int theNextByte = is.read(); 252 if( theNextByte == '<' ) 253 { 254 Map<String, Object> result = new HashMap<String, Object>(); 255 //we are reading a dictionary 256 Object key = parseNextToken( is ); 257 while( key instanceof LiteralName && key != MARK_END_OF_DICTIONARY ) 258 { 259 Object value = parseNextToken( is ); 260 result.put( ((LiteralName)key).name, value ); 261 key = parseNextToken( is ); 262 } 263 retval = result; 264 } 265 else 266 { 267 //won't read more than 512 bytes 268 269 int multiplyer = 16; 270 int bufferIndex = -1; 271 while( theNextByte != -1 && theNextByte != '>' ) 272 { 273 int intValue = 0; 274 if( theNextByte >= '0' && theNextByte <= '9' ) 275 { 276 intValue = theNextByte - '0'; 277 } 278 else if( theNextByte >= 'A' && theNextByte <= 'F' ) 279 { 280 intValue = 10 + theNextByte - 'A'; 281 } 282 else if( theNextByte >= 'a' && theNextByte <= 'f' ) 283 { 284 intValue = 10 + theNextByte - 'a'; 285 } 286 else if( theNextByte == 0x20 || theNextByte == 0x09 ) 287 { 288 // skipping whitespaces - from pdf's generated by Mac osx 289 theNextByte = is.read(); 290 continue; 291 } 292 else 293 { 294 throw new IOException(MessageLocalization.getComposedMessage("error.expected.hex.character.and.not.char.thenextbyte.1", theNextByte)); 295 } 296 intValue *= multiplyer; 297 if( multiplyer == 16 ) 298 { 299 bufferIndex++; 300 tokenParserByteBuffer[bufferIndex] = 0; 301 multiplyer = 1; 302 } 303 else 304 { 305 multiplyer = 16; 306 } 307 tokenParserByteBuffer[bufferIndex]+= intValue; 308 theNextByte = is.read(); 309 } 310 byte[] finalResult = new byte[bufferIndex+1]; 311 System.arraycopy(tokenParserByteBuffer,0,finalResult, 0, bufferIndex+1); 312 retval = finalResult; 313 } 314 break; 315 } 316 case '/': 317 { 318 StringBuffer buffer = new StringBuffer(); 319 int stringByte = is.read(); 320 321 while( !isWhitespaceOrEOF( stringByte ) ) 322 { 323 buffer.append( (char)stringByte ); 324 stringByte = is.read(); 325 } 326 retval = new LiteralName( buffer.toString() ); 327 break; 328 } 329 case -1: 330 { 331 //EOF return null; 332 break; 333 } 334 case '0': 335 case '1': 336 case '2': 337 case '3': 338 case '4': 339 case '5': 340 case '6': 341 case '7': 342 case '8': 343 case '9': 344 { 345 StringBuffer buffer = new StringBuffer(); 346 buffer.append( (char)nextByte ); 347 nextByte = is.read(); 348 349 while( !isWhitespaceOrEOF( nextByte ) && 350 (Character.isDigit( (char)nextByte )|| 351 nextByte == '.' ) ) 352 { 353 buffer.append( (char)nextByte ); 354 nextByte = is.read(); 355 } 356 is.unread( nextByte ); 357 String value = buffer.toString(); 358 if( value.indexOf( '.' ) >=0 ) 359 { 360 retval = new Double( value ); 361 } 362 else 363 { 364 retval = Integer.valueOf( buffer.toString() ); 365 } 366 break; 367 } 368 default: 369 { 370 StringBuffer buffer = new StringBuffer(); 371 buffer.append( (char)nextByte ); 372 nextByte = is.read(); 373 374 while( !isWhitespaceOrEOF( nextByte ) ) 375 { 376 buffer.append( (char)nextByte ); 377 nextByte = is.read(); 378 } 379 retval = new Operator( buffer.toString() ); 380 381 break; 382 } 383 } 384 return retval; 385 } 386 387 private void readUntilEndOfLine( InputStream is, StringBuffer buf ) throws IOException 388 { 389 int nextByte = is.read(); 390 while( nextByte != -1 && nextByte != 0x0D && nextByte != 0x0A ) 391 { 392 buf.append( (char)nextByte ); 393 nextByte = is.read(); 394 } 395 } 396 397 private boolean isWhitespaceOrEOF( int aByte ) 398 { 399 return aByte == -1 || aByte == 0x20 || aByte == 0x0D || aByte == 0x0A; 400 } 401 402 403 private void increment( byte[] data ) 404 { 405 increment( data, data.length-1 ); 406 } 407 408 private void increment( byte[] data, int position ) 409 { 410 if( position > 0 && (data[position]+256)%256 == 255 ) 411 { 412 data[position]=0; 413 increment( data, position-1); 414 } 415 else 416 { 417 data[position] = (byte)(data[position]+1); 418 } 419 } 420 421 private String createStringFromBytes( byte[] bytes ) throws IOException 422 { 423 String retval = null; 424 if( bytes.length == 1 ) 425 { 426 retval = new String( bytes ); 427 } 428 else 429 { 430 retval = new String( bytes, "UTF-16BE" ); 431 } 432 return retval; 433 } 434 435 private int compare( byte[] first, byte[] second ) 436 { 437 int retval = 1; 438 boolean done = false; 439 for( int i=0; i<first.length && !done; i++ ) 440 { 441 if( first[i] == second[i] ) 442 { 443 //move to next position 444 } 445 else if( (first[i]+256)%256 < (second[i]+256)%256 ) 446 { 447 done = true; 448 retval = -1; 449 } 450 else 451 { 452 done = true; 453 retval = 1; 454 } 455 } 456 return retval; 457 } 458 459 /** 460 * Internal class. 461 */ 462 private class LiteralName 463 { 464 private String name; 465 private LiteralName( String theName ) 466 { 467 name = theName; 468 } 469 } 470 471 /** 472 * Internal class. 473 */ 474 private class Operator 475 { 476 private String op; 477 private Operator( String theOp ) 478 { 479 op = theOp; 480 } 481 } 482 483 /** 484 * A simple class to test parsing of cmap files. 485 * 486 * @param args Some command line arguments. 487 * 488 * @throws Exception If there is an error parsing the file. 489 public static void main( String[] args ) throws Exception 490 { 491 if( args.length != 1 ) 492 { 493 System.err.println( "usage: java org.pdfbox.cmapparser.CMapParser <CMAP File>" ); 494 System.exit( -1 ); 495 } 496 CMapParser parser = new CMapParser( ); 497 CMap result = parser.parse( new FileInputStream( args[0] ) ); 498 System.out.println( "Result:" + result ); 499 } 500 */ 501}