001// Copyright (C) 1998-2001 by Jason Hunter <jhunter_AT_acm_DOT_org>.
002// All rights reserved.  Use of this class is limited.
003// Please see the LICENSE for more information.
004
005package com.oreilly.servlet.multipart;
006
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Enumeration;
import java.util.Vector;

import javax.servlet.ServletInputStream;
import javax.servlet.http.HttpServletRequest;
013
014/** 
015 * A utility class to handle <code>multipart/form-data</code> requests,
016 * the kind of requests that support file uploads.  This class uses a 
017 * "pull" model where the reading of incoming files and parameters is 
018 * controlled by the client code, which allows incoming files to be stored 
019 * into any <code>OutputStream</code>.  If you wish to use an API which 
020 * resembles <code>HttpServletRequest</code>, use the "push" model 
021 * <code>MultipartRequest</code> instead.  It's an easy-to-use wrapper 
022 * around this class.
023 * <p>
024 * This class can receive arbitrarily large files (up to an artificial limit 
025 * you can set), and fairly efficiently too.  
026 * It cannot handle nested data (multipart content within multipart content).
027 * It <b>can</b> now with the latest release handle internationalized content
028 * (such as non Latin-1 filenames).
029 * <p>
030 * It also optionally includes enhanced buffering and Content-Length
031 * limitation.  Buffering is only required if your servlet container is 
032 * poorly implemented (many are, including Tomcat 3.2),
033 * but it is generally recommended because it will make a slow servlet 
034 * container a lot faster, and will only make a fast servlet container a 
035 * little slower.  Content-Length limiting is usually only required if you find 
 * that your servlet is hanging trying to read the input stream from the POST, 
037 * and it is similarly recommended because it only has a minimal impact on 
038 * performance.
039 * <p>
040 * See the included upload.war for an example of how to use this class.
041 * <p>
042 * The full file upload specification is contained in experimental RFC 1867,
043 * available at <a href="http://www.ietf.org/rfc/rfc1867.txt">
044 * http://www.ietf.org/rfc/rfc1867.txt</a>.
045 * 
046 * @see com.oreilly.servlet.MultipartRequest
047 * 
048 * @author Jason Hunter
049 * @author Geoff Soutter
050 * @version 1.11, 2002/11/01, added constructor that takes an encoding, to
051 *                            make sure chars are always read correctly
052 * @version 1.10, 2002/11/01, added support for a preamble before the first
053 *                            boundary marker
054 * @version 1.9, 2002/11/01, added support to parse odd Opera Content-Type
055 * @version 1.8, 2002/11/01, added support for lynx with unquoted param vals
056 * @version 1.7, 2002/04/30, fixed bug if a line was '\n' alone
057 * @version 1.6, 2002/04/30, added better internationalization support, thanks
058 *                           to Changshin Lee
059 * @version 1.5, 2002/04/30, added Opera header fix, thanks to Nic Ferrier
060 * @version 1.4, 2001/03/23, added IE5 bug workaround supporting \n as line
061 *                           ending, thanks to Michael Alyn Miller
062 * @version 1.3, 2001/01/22, added support for boundaries surrounded by quotes
063 *                           and content-disposition after content-type,
064 *                           thanks to Scott Stark
065 * @version 1.2, 2001/01/22, getFilePath() support thanks to Stefan Eissing
066 * @version 1.1, 2000/10/29, integrating old WebSphere fix
067 * @version 1.0, 2000/10/27, initial revision
068 */
069public class MultipartParser {
070  
071  /** input stream to read parts from */
072  private ServletInputStream in;
073  
074  /** MIME boundary that delimits parts */
075  private String boundary;
076  
077  /** reference to the last file part we returned */
078  private FilePart lastFilePart;
079
080  /** buffer for readLine method */
081  private byte[] buf = new byte[8 * 1024];
082  
083  /** default encoding */
084  private static String DEFAULT_ENCODING = "ISO-8859-1";
085
086  /** preferred encoding */
087  private String encoding = DEFAULT_ENCODING;
088
089  /**
090   * Creates a <code>MultipartParser</code> from the specified request,
091   * which limits the upload size to the specified length, buffers for 
092   * performance and prevent attempts to read past the amount specified 
093   * by the Content-Length.
094   * 
095   * @param req   the servlet request.
096   * @param maxSize the maximum size of the POST content.
097   */
098  public MultipartParser(HttpServletRequest req, 
099                         int maxSize) throws IOException {
100    this(req, maxSize, true, true);
101  }
102  
103  /**
104   * Creates a <code>MultipartParser</code> from the specified request,
105   * which limits the upload size to the specified length, and optionally 
106   * buffers for performance and prevents attempts to read past the amount 
107   * specified by the Content-Length. 
108   * 
109   * @param req   the servlet request.
110   * @param maxSize the maximum size of the POST content.
111   * @param buffer whether to do internal buffering or let the server buffer,
112   *               useful for servers that don't buffer
113   * @param limitLength boolean flag to indicate if we need to filter 
114   *                    the request's input stream to prevent trying to 
115   *                    read past the end of the stream.
116   */
117  public MultipartParser(HttpServletRequest req, int maxSize, boolean buffer, 
118                         boolean limitLength) throws IOException {
119    this(req, maxSize, buffer, limitLength, null);
120  }
121
122  /**
123   * Creates a <code>MultipartParser</code> from the specified request,
124   * which limits the upload size to the specified length, and optionally 
125   * buffers for performance and prevents attempts to read past the amount 
126   * specified by the Content-Length, and with a specified encoding. 
127   * 
128   * @param req   the servlet request.
129   * @param maxSize the maximum size of the POST content.
130   * @param buffer whether to do internal buffering or let the server buffer,
131   *               useful for servers that don't buffer
132   * @param limitLength boolean flag to indicate if we need to filter 
133   *                    the request's input stream to prevent trying to 
134   *                    read past the end of the stream.
135   * @param encoding the encoding to use for parsing, default is ISO-8859-1.
136   */
137  public MultipartParser(HttpServletRequest req, int maxSize, boolean buffer, 
138                         boolean limitLength, String encoding)
139                                                throws IOException {
140    // First make sure we know the encoding to handle chars correctly.
141    // Thanks to Andreas Granzer, andreas.granzer@wave-solutions.com,
142    // for pointing out the need to have this in the constructor.
143    if (encoding != null) {
144      setEncoding(encoding);
145    }
146
147    // Check the content type to make sure it's "multipart/form-data"
148    // Access header two ways to work around WebSphere oddities
149    String type = null;
150    String type1 = req.getHeader("Content-Type");
151    String type2 = req.getContentType();
152    // If one value is null, choose the other value
153    if (type1 == null && type2 != null) {
154      type = type2;
155    }
156    else if (type2 == null && type1 != null) {
157      type = type1;
158    }
159    // If neither value is null, choose the longer value
160    else if (type1 != null && type2 != null) {
161      type = (type1.length() > type2.length() ? type1 : type2);
162    }
163
164    if (type == null || 
165        !type.toLowerCase().startsWith("multipart/form-data")) {
166      throw new IOException("Posted content type isn't multipart/form-data");
167    }
168
169    // Check the content length to prevent denial of service attacks
170    int length = req.getContentLength();
171    if (length > maxSize) {
172      throw new IOException("Posted content length of " + length + 
173                            " exceeds limit of " + maxSize);
174    }
175
176    // Get the boundary string; it's included in the content type.
177    // Should look something like "------------------------12012133613061"
178    String boundary = extractBoundary(type);
179    if (boundary == null) {
180      throw new IOException("Separation boundary was not specified");
181    }
182
183    ServletInputStream in = req.getInputStream();
184    
185    // If required, wrap the real input stream with classes that 
186    // "enhance" its behaviour for performance and stability
187    if (buffer) {
188      in = new BufferedServletInputStream(in);
189    }
190    if (limitLength) {
191      in = new LimitedServletInputStream(in, length);
192    }
193
194    // Save our values for later
195    this.in = in;
196    this.boundary = boundary;
197    
198    // Read until we hit the boundary
199    // Some clients send a preamble (per RFC 2046), so ignore that
200    // Thanks to Ben Johnson, ben.johnson@merrillcorp.com, for pointing out
201    // the need for preamble support.
202    do {
203      String line = readLine();
204      if (line == null) {
205        throw new IOException("Corrupt form data: premature ending");
206      }
207      // See if this line is the boundary, and if so break
208      if (line.startsWith(boundary)) {
209        break;  // success
210      }
211    } while (true);
212  }
213
214  /**
215   * Sets the encoding used to parse from here onward.  The default is
216   * ISO-8859-1.  Encodings are actually best passed into the contructor,
217   * so even the initial line reads are correct.
218   *
219   * @param encoding The encoding to use for parsing
220   */
221   public void setEncoding(String encoding) {
222     this.encoding = encoding;
223   }
224
225  /**
226   * Read the next part arriving in the stream. Will be either a 
227   * <code>FilePart</code> or a <code>ParamPart</code>, or <code>null</code>
228   * to indicate there are no more parts to read. The order of arrival 
229   * corresponds to the order of the form elements in the submitted form.
230   * 
231   * @return either a <code>FilePart</code>, a <code>ParamPart</code> or
232   *        <code>null</code> if there are no more parts to read.
233   * @exception IOException     if an input or output exception has occurred.
234   * 
235   * @see FilePart
236   * @see ParamPart
237   */
238  public Part readNextPart() throws IOException {
239    // Make sure the last file was entirely read from the input
240    if (lastFilePart != null) {
241      lastFilePart.getInputStream().close();
242      lastFilePart = null;
243    }
244    
245    // Read the headers; they look like this (not all may be present):
246    // Content-Disposition: form-data; name="field1"; filename="file1.txt"
247    // Content-Type: type/subtype
248    // Content-Transfer-Encoding: binary
249    Vector headers = new Vector();
250
251    String line = readLine();
252    if (line == null) {
253      // No parts left, we're done
254      return null;
255    }
256    else if (line.length() == 0) {
257      // IE4 on Mac sends an empty line at the end; treat that as the end.
258      // Thanks to Daniel Lemire and Henri Tourigny for this fix.
259      return null;
260    }
261
262    // Read the following header lines we hit an empty line
263    // A line starting with whitespace is considered a continuation;
264    // that requires a little special logic.  Thanks to Nic Ferrier for
265    // identifying a good fix.
266    while (line != null && line.length() > 0) {
267      String nextLine = null;
268      boolean getNextLine = true;
269      while (getNextLine) {
270        nextLine = readLine();
271        if (nextLine != null
272            && (nextLine.startsWith(" ")
273          || nextLine.startsWith("\t"))) {
274          line = line + nextLine;
275        }
276        else {
277          getNextLine = false;
278        }
279      }
280      // Add the line to the header list
281      headers.addElement(line);
282      line = nextLine;
283    }
284
285    // If we got a null above, it's the end
286    if (line == null) {
287      return null;
288    }
289
290    String name = null;
291    String filename = null;
292    String origname = null;
293    String contentType = "text/plain";  // rfc1867 says this is the default
294
295    Enumeration myEnum = headers.elements();
296    while (myEnum.hasMoreElements()) {
297      String headerline = (String) myEnum.nextElement();
298      if (headerline.toLowerCase().startsWith("content-disposition:")) {
299        // Parse the content-disposition line
300        String[] dispInfo = extractDispositionInfo(headerline);
301        // String disposition = dispInfo[0];  // not currently used
302        name = dispInfo[1];
303        filename = dispInfo[2];
304        origname = dispInfo[3];
305      }
306      else if (headerline.toLowerCase().startsWith("content-type:")) {
307        // Get the content type, or null if none specified
308        String type = extractContentType(headerline);
309        if (type != null) {
310          contentType = type;
311        }
312      }
313    }
314
315    // Now, finally, we read the content (end after reading the boundary)
316    if (filename == null) {
317      // This is a parameter, add it to the vector of values
318      // The encoding is needed to help parse the value
319      return new ParamPart(name, in, boundary, encoding);
320    }
321    else {
322      // This is a file
323      if (filename.equals("")) {
324        filename = null; // empty filename, probably an "empty" file param
325      }
326      lastFilePart = new FilePart(name, in, boundary,
327                                  contentType, filename, origname);
328      return lastFilePart;
329    }
330  }
331  
332  /**
333   * Extracts and returns the boundary token from a line.
334   * 
335   * @return the boundary token.
336   */
337  private String extractBoundary(String line) {
338    // Use lastIndexOf() because IE 4.01 on Win98 has been known to send the
339    // "boundary=" string multiple times.  Thanks to David Wall for this fix.
340    int index = line.lastIndexOf("boundary=");
341    if (index == -1) {
342      return null;
343    }
344    String boundary = line.substring(index + 9);  // 9 for "boundary="
345    if (boundary.charAt(0) == '"') {
346      // The boundary is enclosed in quotes, strip them
347      index = boundary.lastIndexOf('"');
348      boundary = boundary.substring(1, index);
349    }
350
351    // The real boundary is always preceeded by an extra "--"
352    boundary = "--" + boundary;
353
354    return boundary;
355  }
356
357  /**
358   * Extracts and returns disposition info from a line, as a <code>String<code>
359   * array with elements: disposition, name, filename.
360   * 
361   * @return String[] of elements: disposition, name, filename.
362   * @exception  IOException if the line is malformatted.
363   */
364  private String[] extractDispositionInfo(String line) throws IOException {
365    // Return the line's data as an array: disposition, name, filename
366    String[] retval = new String[4];
367
368    // Convert the line to a lowercase string without the ending \r\n
369    // Keep the original line for error messages and for variable names.
370    String origline = line;
371    line = origline.toLowerCase();
372
373    // Get the content disposition, should be "form-data"
374    int start = line.indexOf("content-disposition: ");
375    int end = line.indexOf(";");
376    if (start == -1 || end == -1) {
377      throw new IOException("Content disposition corrupt: " + origline);
378    }
379    String disposition = line.substring(start + 21, end);
380    if (!disposition.equals("form-data")) {
381      throw new IOException("Invalid content disposition: " + disposition);
382    }
383
384    // Get the field name
385    start = line.indexOf("name=\"", end);  // start at last semicolon
386    end = line.indexOf("\"", start + 7);   // skip name=\"
387    int startOffset = 6;
388    if (start == -1 || end == -1) {
389      // Some browsers like lynx don't surround with ""
390      // Thanks to Deon van der Merwe, dvdm@truteq.co.za, for noticing
391      start = line.indexOf("name=", end);
392      end = line.indexOf(";", start + 6);
393      if (start == -1) {
394        throw new IOException("Content disposition corrupt: " + origline);
395      }
396      else if (end == -1) {
397        end = line.length();
398      }
399      startOffset = 5;  // without quotes we have one fewer char to skip
400    }
401    String name = origline.substring(start + startOffset, end);
402
403    // Get the filename, if given
404    String filename = null;
405    String origname = null;
406    start = line.indexOf("filename=\"", end + 2);  // start after name
407    end = line.indexOf("\"", start + 10);          // skip filename=\"
408    if (start != -1 && end != -1) {                // note the !=
409      filename = origline.substring(start + 10, end);
410      origname = filename;
411      // The filename may contain a full path.  Cut to just the filename.
412      int slash =
413        Math.max(filename.lastIndexOf('/'), filename.lastIndexOf('\\'));
414      if (slash > -1) {
415        filename = filename.substring(slash + 1);  // past last slash
416      }
417    }
418
419    // Return a String array: disposition, name, filename
420    // empty filename denotes no file posted!
421    retval[0] = disposition;
422    retval[1] = name;
423    retval[2] = filename;
424    retval[3] = origname;
425    return retval;
426  }
427
428  /**
429   * Extracts and returns the content type from a line, or null if the
430   * line was empty.
431   * 
432   * @return content type, or null if line was empty.
433   * @exception  IOException if the line is malformatted.
434   */
435  private static String extractContentType(String line) throws IOException {
436    // Convert the line to a lowercase string
437    line = line.toLowerCase();
438
439    // Get the content type, if any
440    // Note that Opera at least puts extra info after the type, so handle
441    // that.  For example:  Content-Type: text/plain; name="foo"
442    // Thanks to Leon Poyyayil, leon.poyyayil@trivadis.com, for noticing this.
443    int end = line.indexOf(";");
444    if (end == -1) {
445      end = line.length();
446    }
447
448    return line.substring(13, end).trim();  // "content-type:" is 13
449  }
450  
451  /**
452   * Read the next line of input.
453   * 
454   * @return     a String containing the next line of input from the stream,
455   *        or null to indicate the end of the stream.
456   * @exception IOException     if an input or output exception has occurred.
457   */
458  private String readLine() throws IOException {
459    StringBuffer sbuf = new StringBuffer();
460    int result;
461    String line;
462
463    do {
464      result = in.readLine(buf, 0, buf.length);  // does +=
465      if (result != -1) {
466        sbuf.append(new String(buf, 0, result, encoding));
467      }
468    } while (result == buf.length);  // loop only if the buffer was filled
469
470    if (sbuf.length() == 0) {
471      return null;  // nothing read, must be at the end of stream
472    }
473
474    // Cut off the trailing \n or \r\n
475    // It should always be \r\n but IE5 sometimes does just \n
476    // Thanks to Luke Blaikie for helping make this work with \n
477    int len = sbuf.length();
478    if (len >= 2 && sbuf.charAt(len - 2) == '\r') {
479      sbuf.setLength(len - 2);  // cut \r\n
480    }
481    else if (len >= 1 && sbuf.charAt(len - 1) == '\n') {
482      sbuf.setLength(len - 1);  // cut \n
483    }
484    return sbuf.toString();
485  }
486}