001// Copyright (C) 1998-2001 by Jason Hunter <jhunter_AT_acm_DOT_org>. 002// All rights reserved. Use of this class is limited. 003// Please see the LICENSE for more information. 004 005package com.oreilly.servlet.multipart; 006 007import java.io.IOException; 008import java.util.Enumeration; 009import java.util.Vector; 010 011import javax.servlet.http.HttpServletRequest; 012import javax.servlet.ServletInputStream; 013 014/** 015 * A utility class to handle <code>multipart/form-data</code> requests, 016 * the kind of requests that support file uploads. This class uses a 017 * "pull" model where the reading of incoming files and parameters is 018 * controlled by the client code, which allows incoming files to be stored 019 * into any <code>OutputStream</code>. If you wish to use an API which 020 * resembles <code>HttpServletRequest</code>, use the "push" model 021 * <code>MultipartRequest</code> instead. It's an easy-to-use wrapper 022 * around this class. 023 * <p> 024 * This class can receive arbitrarily large files (up to an artificial limit 025 * you can set), and fairly efficiently too. 026 * It cannot handle nested data (multipart content within multipart content). 027 * It <b>can</b> now with the latest release handle internationalized content 028 * (such as non Latin-1 filenames). 029 * <p> 030 * It also optionally includes enhanced buffering and Content-Length 031 * limitation. Buffering is only required if your servlet container is 032 * poorly implemented (many are, including Tomcat 3.2), 033 * but it is generally recommended because it will make a slow servlet 034 * container a lot faster, and will only make a fast servlet container a 035 * little slower. Content-Length limiting is usually only required if you find 036 * that your servlet is hanging trying to read the input stram from the POST, 037 * and it is similarly recommended because it only has a minimal impact on 038 * performance. 039 * <p> 040 * See the included upload.war for an example of how to use this class. 041 * <p> 042 * The full file upload specification is contained in experimental RFC 1867, 043 * available at <a href="http://www.ietf.org/rfc/rfc1867.txt"> 044 * http://www.ietf.org/rfc/rfc1867.txt</a>. 045 * 046 * @see com.oreilly.servlet.MultipartRequest 047 * 048 * @author Jason Hunter 049 * @author Geoff Soutter 050 * @version 1.11, 2002/11/01, added constructor that takes an encoding, to 051 * make sure chars are always read correctly 052 * @version 1.10, 2002/11/01, added support for a preamble before the first 053 * boundary marker 054 * @version 1.9, 2002/11/01, added support to parse odd Opera Content-Type 055 * @version 1.8, 2002/11/01, added support for lynx with unquoted param vals 056 * @version 1.7, 2002/04/30, fixed bug if a line was '\n' alone 057 * @version 1.6, 2002/04/30, added better internationalization support, thanks 058 * to Changshin Lee 059 * @version 1.5, 2002/04/30, added Opera header fix, thanks to Nic Ferrier 060 * @version 1.4, 2001/03/23, added IE5 bug workaround supporting \n as line 061 * ending, thanks to Michael Alyn Miller 062 * @version 1.3, 2001/01/22, added support for boundaries surrounded by quotes 063 * and content-disposition after content-type, 064 * thanks to Scott Stark 065 * @version 1.2, 2001/01/22, getFilePath() support thanks to Stefan Eissing 066 * @version 1.1, 2000/10/29, integrating old WebSphere fix 067 * @version 1.0, 2000/10/27, initial revision 068 */ 069public class MultipartParser { 070 071 /** input stream to read parts from */ 072 private ServletInputStream in; 073 074 /** MIME boundary that delimits parts */ 075 private String boundary; 076 077 /** reference to the last file part we returned */ 078 private FilePart lastFilePart; 079 080 /** buffer for readLine method */ 081 private byte[] buf = new byte[8 * 1024]; 082 083 /** default encoding */ 084 private static String DEFAULT_ENCODING = "ISO-8859-1"; 085 086 /** preferred encoding */ 087 private String encoding = DEFAULT_ENCODING; 088 089 /** 090 * Creates a <code>MultipartParser</code> from the specified request, 091 * which limits the upload size to the specified length, buffers for 092 * performance and prevent attempts to read past the amount specified 093 * by the Content-Length. 094 * 095 * @param req the servlet request. 096 * @param maxSize the maximum size of the POST content. 097 */ 098 public MultipartParser(HttpServletRequest req, 099 int maxSize) throws IOException { 100 this(req, maxSize, true, true); 101 } 102 103 /** 104 * Creates a <code>MultipartParser</code> from the specified request, 105 * which limits the upload size to the specified length, and optionally 106 * buffers for performance and prevents attempts to read past the amount 107 * specified by the Content-Length. 108 * 109 * @param req the servlet request. 110 * @param maxSize the maximum size of the POST content. 111 * @param buffer whether to do internal buffering or let the server buffer, 112 * useful for servers that don't buffer 113 * @param limitLength boolean flag to indicate if we need to filter 114 * the request's input stream to prevent trying to 115 * read past the end of the stream. 116 */ 117 public MultipartParser(HttpServletRequest req, int maxSize, boolean buffer, 118 boolean limitLength) throws IOException { 119 this(req, maxSize, buffer, limitLength, null); 120 } 121 122 /** 123 * Creates a <code>MultipartParser</code> from the specified request, 124 * which limits the upload size to the specified length, and optionally 125 * buffers for performance and prevents attempts to read past the amount 126 * specified by the Content-Length, and with a specified encoding. 127 * 128 * @param req the servlet request. 129 * @param maxSize the maximum size of the POST content. 130 * @param buffer whether to do internal buffering or let the server buffer, 131 * useful for servers that don't buffer 132 * @param limitLength boolean flag to indicate if we need to filter 133 * the request's input stream to prevent trying to 134 * read past the end of the stream. 135 * @param encoding the encoding to use for parsing, default is ISO-8859-1. 136 */ 137 public MultipartParser(HttpServletRequest req, int maxSize, boolean buffer, 138 boolean limitLength, String encoding) 139 throws IOException { 140 // First make sure we know the encoding to handle chars correctly. 141 // Thanks to Andreas Granzer, andreas.granzer@wave-solutions.com, 142 // for pointing out the need to have this in the constructor. 143 if (encoding != null) { 144 setEncoding(encoding); 145 } 146 147 // Check the content type to make sure it's "multipart/form-data" 148 // Access header two ways to work around WebSphere oddities 149 String type = null; 150 String type1 = req.getHeader("Content-Type"); 151 String type2 = req.getContentType(); 152 // If one value is null, choose the other value 153 if (type1 == null && type2 != null) { 154 type = type2; 155 } 156 else if (type2 == null && type1 != null) { 157 type = type1; 158 } 159 // If neither value is null, choose the longer value 160 else if (type1 != null && type2 != null) { 161 type = (type1.length() > type2.length() ? type1 : type2); 162 } 163 164 if (type == null || 165 !type.toLowerCase().startsWith("multipart/form-data")) { 166 throw new IOException("Posted content type isn't multipart/form-data"); 167 } 168 169 // Check the content length to prevent denial of service attacks 170 int length = req.getContentLength(); 171 if (length > maxSize) { 172 throw new IOException("Posted content length of " + length + 173 " exceeds limit of " + maxSize); 174 } 175 176 // Get the boundary string; it's included in the content type. 177 // Should look something like "------------------------12012133613061" 178 String boundary = extractBoundary(type); 179 if (boundary == null) { 180 throw new IOException("Separation boundary was not specified"); 181 } 182 183 ServletInputStream in = req.getInputStream(); 184 185 // If required, wrap the real input stream with classes that 186 // "enhance" its behaviour for performance and stability 187 if (buffer) { 188 in = new BufferedServletInputStream(in); 189 } 190 if (limitLength) { 191 in = new LimitedServletInputStream(in, length); 192 } 193 194 // Save our values for later 195 this.in = in; 196 this.boundary = boundary; 197 198 // Read until we hit the boundary 199 // Some clients send a preamble (per RFC 2046), so ignore that 200 // Thanks to Ben Johnson, ben.johnson@merrillcorp.com, for pointing out 201 // the need for preamble support. 202 do { 203 String line = readLine(); 204 if (line == null) { 205 throw new IOException("Corrupt form data: premature ending"); 206 } 207 // See if this line is the boundary, and if so break 208 if (line.startsWith(boundary)) { 209 break; // success 210 } 211 } while (true); 212 } 213 214 /** 215 * Sets the encoding used to parse from here onward. The default is 216 * ISO-8859-1. Encodings are actually best passed into the contructor, 217 * so even the initial line reads are correct. 218 * 219 * @param encoding The encoding to use for parsing 220 */ 221 public void setEncoding(String encoding) { 222 this.encoding = encoding; 223 } 224 225 /** 226 * Read the next part arriving in the stream. Will be either a 227 * <code>FilePart</code> or a <code>ParamPart</code>, or <code>null</code> 228 * to indicate there are no more parts to read. The order of arrival 229 * corresponds to the order of the form elements in the submitted form. 230 * 231 * @return either a <code>FilePart</code>, a <code>ParamPart</code> or 232 * <code>null</code> if there are no more parts to read. 233 * @exception IOException if an input or output exception has occurred. 234 * 235 * @see FilePart 236 * @see ParamPart 237 */ 238 public Part readNextPart() throws IOException { 239 // Make sure the last file was entirely read from the input 240 if (lastFilePart != null) { 241 lastFilePart.getInputStream().close(); 242 lastFilePart = null; 243 } 244 245 // Read the headers; they look like this (not all may be present): 246 // Content-Disposition: form-data; name="field1"; filename="file1.txt" 247 // Content-Type: type/subtype 248 // Content-Transfer-Encoding: binary 249 Vector headers = new Vector(); 250 251 String line = readLine(); 252 if (line == null) { 253 // No parts left, we're done 254 return null; 255 } 256 else if (line.length() == 0) { 257 // IE4 on Mac sends an empty line at the end; treat that as the end. 258 // Thanks to Daniel Lemire and Henri Tourigny for this fix. 259 return null; 260 } 261 262 // Read the following header lines we hit an empty line 263 // A line starting with whitespace is considered a continuation; 264 // that requires a little special logic. Thanks to Nic Ferrier for 265 // identifying a good fix. 266 while (line != null && line.length() > 0) { 267 String nextLine = null; 268 boolean getNextLine = true; 269 while (getNextLine) { 270 nextLine = readLine(); 271 if (nextLine != null 272 && (nextLine.startsWith(" ") 273 || nextLine.startsWith("\t"))) { 274 line = line + nextLine; 275 } 276 else { 277 getNextLine = false; 278 } 279 } 280 // Add the line to the header list 281 headers.addElement(line); 282 line = nextLine; 283 } 284 285 // If we got a null above, it's the end 286 if (line == null) { 287 return null; 288 } 289 290 String name = null; 291 String filename = null; 292 String origname = null; 293 String contentType = "text/plain"; // rfc1867 says this is the default 294 295 Enumeration myEnum = headers.elements(); 296 while (myEnum.hasMoreElements()) { 297 String headerline = (String) myEnum.nextElement(); 298 if (headerline.toLowerCase().startsWith("content-disposition:")) { 299 // Parse the content-disposition line 300 String[] dispInfo = extractDispositionInfo(headerline); 301 // String disposition = dispInfo[0]; // not currently used 302 name = dispInfo[1]; 303 filename = dispInfo[2]; 304 origname = dispInfo[3]; 305 } 306 else if (headerline.toLowerCase().startsWith("content-type:")) { 307 // Get the content type, or null if none specified 308 String type = extractContentType(headerline); 309 if (type != null) { 310 contentType = type; 311 } 312 } 313 } 314 315 // Now, finally, we read the content (end after reading the boundary) 316 if (filename == null) { 317 // This is a parameter, add it to the vector of values 318 // The encoding is needed to help parse the value 319 return new ParamPart(name, in, boundary, encoding); 320 } 321 else { 322 // This is a file 323 if (filename.equals("")) { 324 filename = null; // empty filename, probably an "empty" file param 325 } 326 lastFilePart = new FilePart(name, in, boundary, 327 contentType, filename, origname); 328 return lastFilePart; 329 } 330 } 331 332 /** 333 * Extracts and returns the boundary token from a line. 334 * 335 * @return the boundary token. 336 */ 337 private String extractBoundary(String line) { 338 // Use lastIndexOf() because IE 4.01 on Win98 has been known to send the 339 // "boundary=" string multiple times. Thanks to David Wall for this fix. 340 int index = line.lastIndexOf("boundary="); 341 if (index == -1) { 342 return null; 343 } 344 String boundary = line.substring(index + 9); // 9 for "boundary=" 345 if (boundary.charAt(0) == '"') { 346 // The boundary is enclosed in quotes, strip them 347 index = boundary.lastIndexOf('"'); 348 boundary = boundary.substring(1, index); 349 } 350 351 // The real boundary is always preceeded by an extra "--" 352 boundary = "--" + boundary; 353 354 return boundary; 355 } 356 357 /** 358 * Extracts and returns disposition info from a line, as a <code>String<code> 359 * array with elements: disposition, name, filename. 360 * 361 * @return String[] of elements: disposition, name, filename. 362 * @exception IOException if the line is malformatted. 363 */ 364 private String[] extractDispositionInfo(String line) throws IOException { 365 // Return the line's data as an array: disposition, name, filename 366 String[] retval = new String[4]; 367 368 // Convert the line to a lowercase string without the ending \r\n 369 // Keep the original line for error messages and for variable names. 370 String origline = line; 371 line = origline.toLowerCase(); 372 373 // Get the content disposition, should be "form-data" 374 int start = line.indexOf("content-disposition: "); 375 int end = line.indexOf(";"); 376 if (start == -1 || end == -1) { 377 throw new IOException("Content disposition corrupt: " + origline); 378 } 379 String disposition = line.substring(start + 21, end); 380 if (!disposition.equals("form-data")) { 381 throw new IOException("Invalid content disposition: " + disposition); 382 } 383 384 // Get the field name 385 start = line.indexOf("name=\"", end); // start at last semicolon 386 end = line.indexOf("\"", start + 7); // skip name=\" 387 int startOffset = 6; 388 if (start == -1 || end == -1) { 389 // Some browsers like lynx don't surround with "" 390 // Thanks to Deon van der Merwe, dvdm@truteq.co.za, for noticing 391 start = line.indexOf("name=", end); 392 end = line.indexOf(";", start + 6); 393 if (start == -1) { 394 throw new IOException("Content disposition corrupt: " + origline); 395 } 396 else if (end == -1) { 397 end = line.length(); 398 } 399 startOffset = 5; // without quotes we have one fewer char to skip 400 } 401 String name = origline.substring(start + startOffset, end); 402 403 // Get the filename, if given 404 String filename = null; 405 String origname = null; 406 start = line.indexOf("filename=\"", end + 2); // start after name 407 end = line.indexOf("\"", start + 10); // skip filename=\" 408 if (start != -1 && end != -1) { // note the != 409 filename = origline.substring(start + 10, end); 410 origname = filename; 411 // The filename may contain a full path. Cut to just the filename. 412 int slash = 413 Math.max(filename.lastIndexOf('/'), filename.lastIndexOf('\\')); 414 if (slash > -1) { 415 filename = filename.substring(slash + 1); // past last slash 416 } 417 } 418 419 // Return a String array: disposition, name, filename 420 // empty filename denotes no file posted! 421 retval[0] = disposition; 422 retval[1] = name; 423 retval[2] = filename; 424 retval[3] = origname; 425 return retval; 426 } 427 428 /** 429 * Extracts and returns the content type from a line, or null if the 430 * line was empty. 431 * 432 * @return content type, or null if line was empty. 433 * @exception IOException if the line is malformatted. 434 */ 435 private static String extractContentType(String line) throws IOException { 436 // Convert the line to a lowercase string 437 line = line.toLowerCase(); 438 439 // Get the content type, if any 440 // Note that Opera at least puts extra info after the type, so handle 441 // that. For example: Content-Type: text/plain; name="foo" 442 // Thanks to Leon Poyyayil, leon.poyyayil@trivadis.com, for noticing this. 443 int end = line.indexOf(";"); 444 if (end == -1) { 445 end = line.length(); 446 } 447 448 return line.substring(13, end).trim(); // "content-type:" is 13 449 } 450 451 /** 452 * Read the next line of input. 453 * 454 * @return a String containing the next line of input from the stream, 455 * or null to indicate the end of the stream. 456 * @exception IOException if an input or output exception has occurred. 457 */ 458 private String readLine() throws IOException { 459 StringBuffer sbuf = new StringBuffer(); 460 int result; 461 String line; 462 463 do { 464 result = in.readLine(buf, 0, buf.length); // does += 465 if (result != -1) { 466 sbuf.append(new String(buf, 0, result, encoding)); 467 } 468 } while (result == buf.length); // loop only if the buffer was filled 469 470 if (sbuf.length() == 0) { 471 return null; // nothing read, must be at the end of stream 472 } 473 474 // Cut off the trailing \n or \r\n 475 // It should always be \r\n but IE5 sometimes does just \n 476 // Thanks to Luke Blaikie for helping make this work with \n 477 int len = sbuf.length(); 478 if (len >= 2 && sbuf.charAt(len - 2) == '\r') { 479 sbuf.setLength(len - 2); // cut \r\n 480 } 481 else if (len >= 1 && sbuf.charAt(len - 1) == '\n') { 482 sbuf.setLength(len - 1); // cut \n 483 } 484 return sbuf.toString(); 485 } 486}