001/*
002 * $Id: InlineImageUtils.java 4832 2011-05-04 13:35:36Z blowagie $
003 *
004 * This file is part of the iText (R) project.
005 * Copyright (c) 1998-2011 1T3XT BVBA
006 * Authors: Bruno Lowagie, Kevin Day, Paulo Soares, et al.
007 *
008 * This program is free software; you can redistribute it and/or modify
009 * it under the terms of the GNU Affero General Public License version 3
010 * as published by the Free Software Foundation with the addition of the
011 * following permission added to Section 15 as permitted in Section 7(a):
012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
014 *
015 * This program is distributed in the hope that it will be useful, but
016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
017 * or FITNESS FOR A PARTICULAR PURPOSE.
018 * See the GNU Affero General Public License for more details.
019 * You should have received a copy of the GNU Affero General Public License
020 * along with this program; if not, see http://www.gnu.org/licenses or write to
021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
022 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
023 * http://itextpdf.com/terms-of-use/
024 *
025 * The interactive user interfaces in modified source and object code versions
026 * of this program must display Appropriate Legal Notices, as required under
027 * Section 5 of the GNU Affero General Public License.
028 *
029 * In accordance with Section 7(b) of the GNU Affero General Public License,
030 * a covered work must retain the producer line in every PDF that is created
031 * or manipulated using iText.
032 *
033 * You can be released from the requirements of the license by purchasing
034 * a commercial license. Buying such a license is mandatory as soon as you
035 * develop commercial activities involving the iText software without
036 * disclosing the source code of your own applications.
037 * These activities include: offering paid services to customers as an ASP,
038 * serving PDFs on the fly in a web application, shipping iText with a closed
039 * source product.
040 *
041 * For more information, please contact iText Software Corp. at this
042 * address: sales@itextpdf.com
043 */
044package com.itextpdf.text.pdf.parser;
045
046import java.io.ByteArrayOutputStream;
047import java.io.IOException;
048import java.util.HashMap;
049import java.util.Map;
050
051import com.itextpdf.text.pdf.PRTokeniser;
052import com.itextpdf.text.pdf.PdfArray;
053import com.itextpdf.text.pdf.PdfContentParser;
054import com.itextpdf.text.pdf.PdfDictionary;
055import com.itextpdf.text.pdf.PdfName;
056import com.itextpdf.text.pdf.PdfNumber;
057import com.itextpdf.text.pdf.PdfObject;
058
059/**
060 * Utility methods to help with processing of inline images
061 * @since 5.0.4
062 */
063public final class InlineImageUtils {
064    private InlineImageUtils(){}
065
066    /**
067     * Simple class in case users need to differentiate an exception from processing
068     * inline images vs other exceptions 
069     * @since 5.0.4
070     */
071    public static class InlineImageParseException extends IOException{
072
073                private static final long serialVersionUID = 233760879000268548L;
074
075                public InlineImageParseException(String message) {
076            super(message);
077        }
078
079    }
080    
081    /**
082     * Map between key abbreviations allowed in dictionary of inline images and their
083     * equivalent image dictionary keys
084     */
085    private final static Map<PdfName, PdfName> inlineImageEntryAbbreviationMap;
086    static { // static initializer
087        inlineImageEntryAbbreviationMap = new HashMap<PdfName, PdfName>();
088
089        // allowed entries - just pass these through
090        inlineImageEntryAbbreviationMap.put(PdfName.BITSPERCOMPONENT, PdfName.BITSPERCOMPONENT);
091        inlineImageEntryAbbreviationMap.put(PdfName.COLORSPACE, PdfName.COLORSPACE);
092        inlineImageEntryAbbreviationMap.put(PdfName.DECODE, PdfName.DECODE);
093        inlineImageEntryAbbreviationMap.put(PdfName.DECODEPARMS, PdfName.DECODEPARMS);
094        inlineImageEntryAbbreviationMap.put(PdfName.FILTER, PdfName.FILTER);
095        inlineImageEntryAbbreviationMap.put(PdfName.HEIGHT, PdfName.HEIGHT);
096        inlineImageEntryAbbreviationMap.put(PdfName.IMAGEMASK, PdfName.IMAGEMASK);
097        inlineImageEntryAbbreviationMap.put(PdfName.INTENT, PdfName.INTENT);
098        inlineImageEntryAbbreviationMap.put(PdfName.INTERPOLATE, PdfName.INTERPOLATE);
099        inlineImageEntryAbbreviationMap.put(PdfName.WIDTH, PdfName.WIDTH);
100
101        // abbreviations - transform these to corresponding correct values
102        inlineImageEntryAbbreviationMap.put(new PdfName("BPC"), PdfName.BITSPERCOMPONENT);
103        inlineImageEntryAbbreviationMap.put(new PdfName("CS"), PdfName.COLORSPACE);
104        inlineImageEntryAbbreviationMap.put(new PdfName("D"), PdfName.DECODE);
105        inlineImageEntryAbbreviationMap.put(new PdfName("DP"), PdfName.DECODEPARMS);
106        inlineImageEntryAbbreviationMap.put(new PdfName("F"), PdfName.FILTER);
107        inlineImageEntryAbbreviationMap.put(new PdfName("H"), PdfName.HEIGHT);
108        inlineImageEntryAbbreviationMap.put(new PdfName("IM"), PdfName.IMAGEMASK);
109        inlineImageEntryAbbreviationMap.put(new PdfName("I"), PdfName.INTERPOLATE);
110        inlineImageEntryAbbreviationMap.put(new PdfName("W"), PdfName.WIDTH);
111    }
112    
113    /**
114     * Map between value abbreviations allowed in dictionary of inline images for COLORSPACE
115     */
116    private static final Map<PdfName, PdfName> inlineImageColorSpaceAbbreviationMap;
117    static {
118        inlineImageColorSpaceAbbreviationMap = new HashMap<PdfName, PdfName>();
119        
120        inlineImageColorSpaceAbbreviationMap.put(new PdfName("G"), PdfName.DEVICEGRAY);
121        inlineImageColorSpaceAbbreviationMap.put(new PdfName("RGB"), PdfName.DEVICERGB);
122        inlineImageColorSpaceAbbreviationMap.put(new PdfName("CMYK"), PdfName.DEVICECMYK);
123        inlineImageColorSpaceAbbreviationMap.put(new PdfName("I"), PdfName.INDEXED);
124    }
125    
126    /**
127     * Map between value abbreviations allowed in dictionary of inline images for FILTER
128     */
129    private static final Map<PdfName, PdfName> inlineImageFilterAbbreviationMap;
130    static {
131        inlineImageFilterAbbreviationMap = new HashMap<PdfName, PdfName>();
132        
133        inlineImageFilterAbbreviationMap.put(new PdfName("AHx"), PdfName.ASCIIHEXDECODE);
134        inlineImageFilterAbbreviationMap.put(new PdfName("A85"), PdfName.ASCII85DECODE);
135        inlineImageFilterAbbreviationMap.put(new PdfName("LZW"), PdfName.LZWDECODE);
136        inlineImageFilterAbbreviationMap.put(new PdfName("Fl"), PdfName.FLATEDECODE);
137        inlineImageFilterAbbreviationMap.put(new PdfName("RL"), PdfName.RUNLENGTHDECODE);
138        inlineImageFilterAbbreviationMap.put(new PdfName("CCF"), PdfName.CCITTFAXDECODE);
139        inlineImageFilterAbbreviationMap.put(new PdfName("DCT"), PdfName.DCTDECODE);
140    }
141    
142    /**
143     * Parses an inline image from the provided content parser.  The parser must be positioned immediately following the BI operator in the content stream.
144     * The parser will be left with current position immediately following the EI operator that terminates the inline image
145     * @param ps the content parser to use for reading the image. 
146     * @param colorSpaceDic a color space dictionary 
147     * @return the parsed image
148     * @throws IOException if anything goes wring with the parsing
149     * @throws InlineImageParseException if parsing of the inline image failed due to issues specific to inline image processing
150     */
151    public static PdfImageObject parseInlineImage(PdfContentParser ps, PdfDictionary colorSpaceDic) throws IOException{
152        PdfDictionary inlineImageDictionary = parseInlineImageDictionary(ps);
153        byte[] samples = parseInlineImageSamples(inlineImageDictionary, colorSpaceDic, ps);
154        return new PdfImageObject(inlineImageDictionary, samples);
155    }
156    
157    /**
158     * Parses the next inline image dictionary from the parser.  The parser must be positioned immediately following the EI operator.
159     * The parser will be left with position immediately following the whitespace character that follows the ID operator that ends the inline image dictionary.
160     * @param ps the parser to extract the embedded image information from
161     * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
162     * @throws IOException if the parse fails
163     */
164    private static PdfDictionary parseInlineImageDictionary(PdfContentParser ps) throws IOException{
165        // by the time we get to here, we have already parsed the BI operator
166        PdfDictionary dictionary = new PdfDictionary();
167        
168        for(PdfObject key = ps.readPRObject(); key != null && !"ID".equals(key.toString()); key = ps.readPRObject()){
169            PdfObject value = ps.readPRObject();
170
171            PdfName resolvedKey = inlineImageEntryAbbreviationMap.get(key);
172            if (resolvedKey == null)
173                resolvedKey = (PdfName)key;
174
175            dictionary.put(resolvedKey, getAlternateValue(resolvedKey, value));
176        }
177
178        int ch = ps.getTokeniser().read();
179        if (!PRTokeniser.isWhitespace(ch))
180            throw new IOException("Unexpected character " + ch + " found after ID in inline image");
181        
182        return dictionary;
183    }
184    
185    /**
186     * Transforms value abbreviations into their corresponding real value 
187     * @param key the key that the value is for
188     * @param value the value that might be an abbreviation
189     * @return if value is an allowed abbreviation for the key, the expanded value for that abbreviation.  Otherwise, value is returned without modification 
190     */
191    private static PdfObject getAlternateValue(PdfName key, PdfObject value){
192        if (key == PdfName.FILTER){
193            if (value instanceof PdfName){
194                PdfName altValue = inlineImageFilterAbbreviationMap.get(value);
195                if (altValue != null)
196                    return altValue;
197            } else if (value instanceof PdfArray){
198                PdfArray array = ((PdfArray)value);
199                PdfArray altArray = new PdfArray();
200                int count = array.size();
201                for(int i = 0; i < count; i++){
202                    altArray.add(getAlternateValue(key, array.getPdfObject(i)));
203                }
204                return altArray;
205            }
206        } else if (key == PdfName.COLORSPACE){
207            PdfName altValue = inlineImageColorSpaceAbbreviationMap.get(value);
208            if (altValue != null)
209                return altValue;
210        }
211        
212        return value;
213    }
214    
215    /**
216     * @param colorSpaceName the name of the color space. If null, a bi-tonal (black and white) color space is assumed.
217     * @return the components per pixel for the specified color space
218     */
219    private static int getComponentsPerPixel(PdfName colorSpaceName, PdfDictionary colorSpaceDic){
220        if (colorSpaceName == null)
221            return 1;
222        if (colorSpaceName.equals(PdfName.DEVICEGRAY))
223            return 1;
224        if (colorSpaceName.equals(PdfName.DEVICERGB))
225            return 3;
226        if (colorSpaceName.equals(PdfName.DEVICECMYK))
227            return 4;
228        
229        if (colorSpaceDic != null){
230            PdfArray colorSpace = colorSpaceDic.getAsArray(colorSpaceName);
231            if (colorSpace != null){
232                if (PdfName.INDEXED.equals(colorSpace.getAsName(0))){
233                    return 1;
234                }
235            }
236        }
237        
238        throw new IllegalArgumentException("Unexpected color space " + colorSpaceName);
239    }
240    
241    /**
242     * Computes the number of unfiltered bytes that each row of the image will contain.
243     * If the number of bytes results in a partial terminating byte, this number is rounded up
244     * per the PDF specification
245     * @param imageDictionary the dictionary of the inline image
246     * @return the number of bytes per row of the image
247     */
248    private static int computeBytesPerRow(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic){
249        PdfNumber wObj = imageDictionary.getAsNumber(PdfName.WIDTH);
250        PdfNumber bpcObj = imageDictionary.getAsNumber(PdfName.BITSPERCOMPONENT);
251        int cpp = getComponentsPerPixel(imageDictionary.getAsName(PdfName.COLORSPACE), colorSpaceDic);
252        
253        int w = wObj.intValue();
254        int bpc = bpcObj != null ? bpcObj.intValue() : 1;
255        
256        
257        int bytesPerRow = (w * bpc * cpp + 7) / 8;
258        
259        return bytesPerRow;
260    }
261    
262    /**
263     * Parses the samples of the image from the underlying content parser, ignoring all filters.
264     * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
265     * The parser will be left positioned immediately following the EI operator.
266     * This is primarily useful if no filters have been applied. 
267     * @param imageDictionary the dictionary of the inline image
268     * @param ps the content parser
269     * @return the samples of the image
270     * @throws IOException if anything bad happens during parsing
271     */
272    private static byte[] parseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps) throws IOException{
273        // special case:  when no filter is specified, we just read the number of bits
274        // per component, multiplied by the width and height.
275        if (imageDictionary.contains(PdfName.FILTER))
276            throw new IllegalArgumentException("Dictionary contains filters");
277        
278        PdfNumber h = imageDictionary.getAsNumber(PdfName.HEIGHT);
279
280        int bytesToRead = computeBytesPerRow(imageDictionary, colorSpaceDic) * h.intValue();
281        byte[] bytes = new byte[bytesToRead];
282        PRTokeniser tokeniser = ps.getTokeniser();
283        
284        int shouldBeWhiteSpace = tokeniser.read(); // skip next character (which better be a whitespace character - I suppose we could check for this)
285        // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data.
286        // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
287        int startIndex = 0;
288        if (!PRTokeniser.isWhitespace(shouldBeWhiteSpace)){
289            bytes[0] = (byte)shouldBeWhiteSpace;
290            startIndex++;
291        }
292        for(int i = startIndex; i < bytesToRead; i++){
293            int ch = tokeniser.read();
294            if (ch == -1)
295                throw new InlineImageParseException("End of content stream reached before end of image data");
296            
297            bytes[i] = (byte)ch;
298        }
299        PdfObject ei = ps.readPRObject();
300        if (!ei.toString().equals("EI"))
301            throw new InlineImageParseException("EI not found after end of image data");
302        
303        return bytes;
304    }
305    
306    /**
307     * Parses the samples of the image from the underlying content parser, accounting for filters
308     * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
309     * The parser will be left positioned immediately following the EI operator.
310     * <b>Note:</b>This implementation does not actually apply the filters at this time
311     * @param imageDictionary the dictionary of the inline image
312     * @param ps the content parser
313     * @return the samples of the image
314     * @throws IOException if anything bad happens during parsing
315     */
316    private static byte[] parseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps) throws IOException{
317        // by the time we get to here, we have already parsed the ID operator
318        
319        if (!imageDictionary.contains(PdfName.FILTER)){
320            return parseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
321        }
322        
323        
324        // read all content until we reach an EI operator surrounded by whitespace.
325        // The following algorithm has two potential issues: what if the image stream 
326        // contains <ws>EI<ws> ?
327        // Plus, there are some streams that don't have the <ws> before the EI operator
328        // it sounds like we would have to actually decode the content stream, which
329        // I'd rather avoid right now.
330        ByteArrayOutputStream baos = new ByteArrayOutputStream();
331        ByteArrayOutputStream accumulated = new ByteArrayOutputStream();
332        int ch;
333        int found = 0;
334        PRTokeniser tokeniser = ps.getTokeniser();
335        
336        while ((ch = tokeniser.read()) != -1){
337            if (found == 0 && PRTokeniser.isWhitespace(ch)){
338                found++;
339                accumulated.write(ch);
340            } else if (found == 1 && ch == 'E'){
341                found++;
342                accumulated.write(ch);
343            } else if (found == 1 && PRTokeniser.isWhitespace(ch)){
344                // this clause is needed if we have a white space character that is part of the image data
345                // followed by a whitespace character that precedes the EI operator.  In this case, we need
346                // to flush the first whitespace, then treat the current whitespace as the first potential
347                // character for the end of stream check.  Note that we don't increment 'found' here.
348                baos.write(accumulated.toByteArray());
349                accumulated.reset();
350                accumulated.write(ch);
351            } else if (found == 2 && ch == 'I'){ 
352                found++;
353                accumulated.write(ch);
354            } else if (found == 3 && PRTokeniser.isWhitespace(ch)){
355                return baos.toByteArray();
356            } else {
357                baos.write(accumulated.toByteArray());
358                accumulated.reset();
359                
360                baos.write(ch);
361                found = 0;
362            }
363        }
364        throw new InlineImageParseException("Could not find image data or EI");
365    }
366}