001/**
002 * Copyright (c) 2005-2006, www.fontbox.org
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 *
008 * 1. Redistributions of source code must retain the above copyright notice,
009 *    this list of conditions and the following disclaimer.
010 * 2. Redistributions in binary form must reproduce the above copyright notice,
011 *    this list of conditions and the following disclaimer in the documentation
012 *    and/or other materials provided with the distribution.
013 * 3. Neither the name of fontbox; nor the names of its
014 *    contributors may be used to endorse or promote products derived from this
015 *    software without specific prior written permission.
016 *
017 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020 * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027 *
028 * http://www.fontbox.org
029 *
030 */
031package com.itextpdf.text.pdf.fonts.cmaps;
032
033import java.io.FileInputStream;
034import java.io.IOException;
035import java.io.InputStream;
036import java.io.PushbackInputStream;
037import java.util.ArrayList;
038import java.util.HashMap;
039import java.util.List;
040import java.util.Map;
041
042import com.itextpdf.text.error_messages.MessageLocalization;
043
044/**
045 * This will parser a CMap stream.
046 *
047 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
048 * @version $Revision: 4645 $
049 * @since       2.1.4
050 */
051public class CMapParser
052{
053    private static final String BEGIN_CODESPACE_RANGE = "begincodespacerange";
054    private static final String BEGIN_BASE_FONT_CHAR = "beginbfchar";
055    private static final String BEGIN_BASE_FONT_RANGE = "beginbfrange";
056
057    private static final String MARK_END_OF_DICTIONARY = ">>";
058    private static final String MARK_END_OF_ARRAY = "]";
059
060    private byte[] tokenParserByteBuffer = new byte[512];
061
062    /**
063     * Creates a new instance of CMapParser.
064     */
065    public CMapParser()
066    {
067    }
068
069    /**
070     * This will parse the stream and create a cmap object.
071     *
072     * @param input The CMAP stream to parse.
073     * @return The parsed stream as a java object.
074     *
075     * @throws IOException If there is an error parsing the stream.
076     */
077    @SuppressWarnings("unchecked")
078    public CMap parse( InputStream input ) throws IOException
079    {
080        PushbackInputStream cmapStream = new PushbackInputStream( input );
081        CMap result = new CMap();
082        Object token = null;
083        while( (token = parseNextToken( cmapStream )) != null )
084        {
085            if( token instanceof Operator )
086            {
087                Operator op = (Operator)token;
088                if( op.op.equals( BEGIN_CODESPACE_RANGE ) )
089                {
090                    while (true)
091                    {
092                        Object nx = parseNextToken( cmapStream );
093                        if (nx instanceof Operator && ((Operator)nx).op.equals("endcodespacerange"))
094                            break;
095                        byte[] startRange = (byte[])nx;
096                        byte[] endRange = (byte[])parseNextToken( cmapStream );
097                        CodespaceRange range = new CodespaceRange();
098                        range.setStart( startRange );
099                        range.setEnd( endRange );
100                        result.addCodespaceRange( range );
101                    }
102                }
103                else if( op.op.equals( BEGIN_BASE_FONT_CHAR ) )
104                {
105                    while (true)
106                    {
107                        Object nx = parseNextToken( cmapStream );
108                        if (nx instanceof Operator && ((Operator)nx).op.equals("endbfchar"))
109                            break;
110                        byte[] inputCode = (byte[])nx;
111                        Object nextToken = parseNextToken( cmapStream );
112                        if( nextToken instanceof byte[] )
113                        {
114                            byte[] bytes = (byte[])nextToken;
115                            String value = createStringFromBytes( bytes );
116                            result.addMapping( inputCode, value );
117                        }
118                        else if( nextToken instanceof LiteralName )
119                        {
120                            result.addMapping( inputCode, ((LiteralName)nextToken).name );
121                        }
122                        else
123                        {
124                            throw new IOException(MessageLocalization.getComposedMessage("error.parsing.cmap.beginbfchar.expected.cosstring.or.cosname.and.not.1", nextToken));
125                        }
126                    }
127                }
128               else if( op.op.equals( BEGIN_BASE_FONT_RANGE ) )
129                {
130                    while (true)
131                    {
132                        Object nx = parseNextToken( cmapStream );
133                        if (nx instanceof Operator && ((Operator)nx).op.equals("endbfrange"))
134                            break;
135                        byte[] startCode = (byte[])nx;
136                        byte[] endCode = (byte[])parseNextToken( cmapStream );
137                        Object nextToken = parseNextToken( cmapStream );
138                        List<byte[]> array = null;
139                        byte[] tokenBytes = null;
140                        if( nextToken instanceof List )
141                        {
142                            array = (List<byte[]>)nextToken;
143                            tokenBytes = array.get( 0 );
144                        }
145                        else
146                        {
147                            tokenBytes = (byte[])nextToken;
148                        }
149
150                        String value = null;
151
152                        int arrayIndex = 0;
153                        boolean done = false;
154                        while( !done )
155                        {
156                            if( compare( startCode, endCode ) >= 0 )
157                            {
158                                done = true;
159                            }
160                            value = createStringFromBytes( tokenBytes );
161                            result.addMapping( startCode, value );
162                            increment( startCode );
163
164                            if( array == null )
165                            {
166                                increment( tokenBytes );
167                            }
168                            else
169                            {
170                                arrayIndex++;
171                                if( arrayIndex < array.size() )
172                                {
173                                    tokenBytes = array.get( arrayIndex );
174                                }
175                            }
176                        }
177                    }
178                }
179            }
180        }
181        return result;
182    }
183
184    private Object parseNextToken( PushbackInputStream is ) throws IOException
185    {
186        Object retval = null;
187        int nextByte = is.read();
188        //skip whitespace
189        while( nextByte == 0x09 || nextByte == 0x20 || nextByte == 0x0D || nextByte == 0x0A )
190        {
191            nextByte = is.read();
192        }
193        switch( nextByte )
194        {
195            case '%':
196            {
197                //header operations, for now return the entire line
198                //may need to smarter in the future
199                StringBuffer buffer = new StringBuffer();
200                buffer.append( (char)nextByte );
201                readUntilEndOfLine( is, buffer );
202                retval = buffer.toString();
203                break;
204            }
205            case '(':
206            {
207                StringBuffer buffer = new StringBuffer();
208                int stringByte = is.read();
209
210                while( stringByte != -1 && stringByte != ')' )
211                {
212                    buffer.append( (char)stringByte );
213                    stringByte = is.read();
214                }
215                retval = buffer.toString();
216                break;
217            }
218            case '>':
219            {
220                int secondCloseBrace = is.read();
221                if( secondCloseBrace == '>' )
222                {
223                    retval = MARK_END_OF_DICTIONARY;
224                }
225                else
226                {
227                    throw new IOException(MessageLocalization.getComposedMessage("error.expected.the.end.of.a.dictionary"));
228                }
229                break;
230            }
231            case ']':
232            {
233                retval = MARK_END_OF_ARRAY;
234                break;
235            }
236            case '[':
237            {
238                List<Object> list = new ArrayList<Object>();
239
240                Object nextToken = parseNextToken( is );
241                while( nextToken != MARK_END_OF_ARRAY )
242                {
243                    list.add( nextToken );
244                    nextToken = parseNextToken( is );
245                }
246                retval = list;
247                break;
248            }
249            case '<':
250            {
251                int theNextByte = is.read();
252                if( theNextByte == '<' )
253                {
254                    Map<String, Object> result = new HashMap<String, Object>();
255                    //we are reading a dictionary
256                    Object key = parseNextToken( is );
257                    while( key instanceof LiteralName && key != MARK_END_OF_DICTIONARY )
258                    {
259                        Object value = parseNextToken( is );
260                        result.put( ((LiteralName)key).name, value );
261                        key = parseNextToken( is );
262                    }
263                    retval = result;
264                }
265                else
266                {
267                    //won't read more than 512 bytes
268
269                    int multiplyer = 16;
270                    int bufferIndex = -1;
271                    while( theNextByte != -1 && theNextByte != '>' )
272                    {
273                        int intValue = 0;
274                        if( theNextByte >= '0' && theNextByte <= '9' )
275                        {
276                            intValue = theNextByte - '0';
277                        }
278                        else if( theNextByte >= 'A' && theNextByte <= 'F' )
279                        {
280                            intValue = 10 + theNextByte - 'A';
281                        }
282                        else if( theNextByte >= 'a' && theNextByte <= 'f' )
283                        {
284                            intValue = 10 + theNextByte - 'a';
285                        }
286                        else if( theNextByte == 0x20 || theNextByte == 0x09 )
287                        {
288                            // skipping whitespaces - from pdf's generated by Mac osx
289                            theNextByte = is.read();
290                            continue;
291                        }
292                        else
293                        {
294                            throw new IOException(MessageLocalization.getComposedMessage("error.expected.hex.character.and.not.char.thenextbyte.1", theNextByte));
295                        }
296                        intValue *= multiplyer;
297                        if( multiplyer == 16 )
298                        {
299                            bufferIndex++;
300                            tokenParserByteBuffer[bufferIndex] = 0;
301                            multiplyer = 1;
302                        }
303                        else
304                        {
305                            multiplyer = 16;
306                        }
307                        tokenParserByteBuffer[bufferIndex]+= intValue;
308                        theNextByte = is.read();
309                    }
310                    byte[] finalResult = new byte[bufferIndex+1];
311                    System.arraycopy(tokenParserByteBuffer,0,finalResult, 0, bufferIndex+1);
312                    retval = finalResult;
313                }
314                break;
315            }
316            case '/':
317            {
318                StringBuffer buffer = new StringBuffer();
319                int stringByte = is.read();
320
321                while( !isWhitespaceOrEOF( stringByte ) )
322                {
323                    buffer.append( (char)stringByte );
324                    stringByte = is.read();
325                }
326                retval = new LiteralName( buffer.toString() );
327                break;
328            }
329            case -1:
330            {
331                //EOF return null;
332                break;
333            }
334            case '0':
335            case '1':
336            case '2':
337            case '3':
338            case '4':
339            case '5':
340            case '6':
341            case '7':
342            case '8':
343            case '9':
344            {
345                StringBuffer buffer = new StringBuffer();
346                buffer.append( (char)nextByte );
347                nextByte = is.read();
348
349                while( !isWhitespaceOrEOF( nextByte ) &&
350                        (Character.isDigit( (char)nextByte )||
351                         nextByte == '.' ) )
352                {
353                    buffer.append( (char)nextByte );
354                    nextByte = is.read();
355                }
356                is.unread( nextByte );
357                String value = buffer.toString();
358                if( value.indexOf( '.' ) >=0 )
359                {
360                    retval = new Double( value );
361                }
362                else
363                {
364                    retval = Integer.valueOf( buffer.toString() );
365                }
366                break;
367            }
368            default:
369            {
370                StringBuffer buffer = new StringBuffer();
371                buffer.append( (char)nextByte );
372                nextByte = is.read();
373
374                while( !isWhitespaceOrEOF( nextByte ) )
375                {
376                    buffer.append( (char)nextByte );
377                    nextByte = is.read();
378                }
379                retval = new Operator( buffer.toString() );
380
381                break;
382            }
383        }
384        return retval;
385    }
386
387    private void readUntilEndOfLine( InputStream is, StringBuffer buf ) throws IOException
388    {
389        int nextByte = is.read();
390        while( nextByte != -1 && nextByte != 0x0D && nextByte != 0x0A )
391        {
392            buf.append( (char)nextByte );
393            nextByte = is.read();
394        }
395    }
396
397    private boolean isWhitespaceOrEOF( int aByte )
398    {
399        return aByte == -1 || aByte == 0x20 || aByte == 0x0D || aByte == 0x0A;
400    }
401
402
403    private void increment( byte[] data )
404    {
405        increment( data, data.length-1 );
406    }
407
408    private void increment( byte[] data, int position )
409    {
410        if( position > 0 && (data[position]+256)%256 == 255 )
411        {
412            data[position]=0;
413            increment( data, position-1);
414        }
415        else
416        {
417            data[position] = (byte)(data[position]+1);
418        }
419    }
420
421    private String createStringFromBytes( byte[] bytes ) throws IOException
422    {
423        String retval = null;
424        if( bytes.length == 1 )
425        {
426            retval = new String( bytes );
427        }
428        else
429        {
430            retval = new String( bytes, "UTF-16BE" );
431        }
432        return retval;
433    }
434
435    private int compare( byte[] first, byte[] second )
436    {
437        int retval = 1;
438        boolean done = false;
439        for( int i=0; i<first.length && !done; i++ )
440        {
441            if( first[i] == second[i] )
442            {
443                //move to next position
444            }
445            else if( (first[i]+256)%256 < (second[i]+256)%256 )
446            {
447                done = true;
448                retval = -1;
449            }
450            else
451            {
452                done = true;
453                retval = 1;
454            }
455        }
456        return retval;
457    }
458
459    /**
460     * Internal class.
461     */
462    private class LiteralName
463    {
464        private String name;
465        private LiteralName( String theName )
466        {
467            name = theName;
468        }
469    }
470
471    /**
472     * Internal class.
473     */
474    private class Operator
475    {
476        private String op;
477        private Operator( String theOp )
478        {
479            op = theOp;
480        }
481    }
482
483    /**
484     * A simple class to test parsing of cmap files.
485     *
486     * @param args Some command line arguments.
487     *
488     * @throws Exception If there is an error parsing the file.
489    public static void main( String[] args ) throws Exception
490    {
491        if( args.length != 1 )
492        {
493            System.err.println( "usage: java org.pdfbox.cmapparser.CMapParser <CMAP File>" );
494            System.exit( -1 );
495        }
496        CMapParser parser = new CMapParser(  );
497        CMap result = parser.parse( new FileInputStream( args[0] ) );
498        System.out.println( "Result:" + result );
499    }
500     */
501}