package org.jsoup.helper;

import org.jsoup.internal.ConstrainableInputStream;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.XmlDeclaration;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Internal static utilities for handling data: loading and parsing documents from files and streams, detecting the
 * character set of the input (byte order mark, meta tags, XML prolog), and a few small byte buffer helpers.
 */
public final class DataUtil {
    /** Extracts the value of a {@code charset=} parameter, optionally quoted, from a content type string. */
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
    static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
    /** Number of bytes buffered up front to sniff for a BOM / meta charset before the real parse. */
    private static final int firstReadBufferSize = 1024 * 5;
    static final int bufferSize = 1024 * 32;
    private static final char[] mimeBoundaryChars =
            "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
    static final int boundaryLength = 32;

    private DataUtil() {}

    /**
     * Loads a file to a Document. The file stream opened by this method is always closed before returning, whether
     * parsing succeeds or an exception is thrown.
     * @param in file to load
     * @param charsetName character set of input
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(File in, String charsetName, String baseUri) throws IOException {
        final InputStream stream = new FileInputStream(in);
        try {
            return parseInputStream(stream, charsetName, baseUri, Parser.htmlParser());
        } finally {
            // parseInputStream only closes its input on the success path; we own this stream, so make sure the
            // file handle is released on failure too. Closing a FileInputStream twice is harmless.
            stream.close();
        }
    }

    /**
     * Parses a Document from an input stream.
     * @param in input stream to parse. You will need to close it.
     * @param charsetName character set of input
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(InputStream in, String charsetName, String baseUri) throws IOException {
        return parseInputStream(in, charsetName, baseUri, Parser.htmlParser());
    }

    /**
     * Parses a Document from an input stream, using the provided Parser.
     * @param in input stream to parse. You will need to close it.
     * @param charsetName character set of input
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
        return parseInputStream(in, charsetName, baseUri, parser);
    }

    /**
     * Writes the input stream to the output stream. Doesn't close them.
     * @param in input stream to read from
     * @param out output stream to write to
     * @throws IOException on IO error
     */
    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
        final byte[] buffer = new byte[bufferSize];
        int len;
        while ((len = in.read(buffer)) != -1) {
            out.write(buffer, 0, len);
        }
    }

    /**
     * Parses the input stream into a Document, working out the character set when it is not supplied. The order of
     * precedence is: a byte order mark at the start of the input; then the supplied {@code charsetName}; then a
     * {@code <meta>} content-type / charset declaration or an XML prolog encoding found in the first bytes of the
     * input; and finally {@link #defaultCharset}.
     * @param input stream to parse; if null, an empty Document is returned
     * @param charsetName character set of the input, or null to detect it from the content
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser the parser to use
     * @return Document
     * @throws IOException on IO error
     */
    static Document parseInputStream(InputStream input, String charsetName, String baseUri, Parser parser) throws IOException {
        if (input == null) // empty body
            return new Document(baseUri);
        input = ConstrainableInputStream.wrap(input, bufferSize, 0);

        Document doc = null;
        boolean fullyRead = false;

        // read the start of the stream and look for a BOM or meta charset
        input.mark(firstReadBufferSize);
        ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed
        fullyRead = input.read() == -1;
        input.reset();

        // look for BOM - overrides any other header or input
        BomCharset bomCharset = detectCharsetFromBom(firstBytes);
        if (bomCharset != null) {
            charsetName = bomCharset.charset;
            input.skip(bomCharset.offset);
        }

        if (charsetName == null) { // determine from meta. safe first parse as UTF-8
            String docData = Charset.forName(defaultCharset).decode(firstBytes).toString();
            doc = parser.parseInput(docData, baseUri);

            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
            String foundCharset = null; // if not found, will keep utf-8 as best attempt
            for (Element meta : metaElements) {
                if (meta.hasAttr("http-equiv"))
                    foundCharset = getCharsetFromContentType(meta.attr("content"));
                if (foundCharset == null && meta.hasAttr("charset"))
                    foundCharset = meta.attr("charset");
                if (foundCharset != null)
                    break;
            }

            // look for <?xml encoding='ISO-8859-1'?>
            if (foundCharset == null && doc.childNodeSize() > 0 && doc.childNode(0) instanceof XmlDeclaration) {
                XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
                if (prolog.name().equals("xml"))
                    foundCharset = prolog.attr("encoding");
            }
            foundCharset = validateCharset(foundCharset);
            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharset)) { // need to re-decode. (case insensitive check here to match how validate works)
                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                charsetName = foundCharset;
                doc = null;
            } else if (!fullyRead) {
                // the sniffed prefix was only part of the input, so the trial parse is incomplete: parse again in full
                doc = null;
            }
        } else { // specified by content type header (or by user on file load)
            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }
        if (doc == null) {
            if (charsetName == null)
                charsetName = defaultCharset;
            BufferedReader reader = new BufferedReader(new InputStreamReader(input, charsetName), bufferSize);
            doc = parser.parseInput(reader, baseUri);
            doc.outputSettings().charset(charsetName);
        }
        input.close();
        return doc;
    }

    /**
     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
     * method is executing on. The data read until being interrupted will be available.
     * @param inStream the input stream to read from
     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
     * @return the filled byte buffer
     * @throws IOException if an exception occurs whilst reading from the input stream.
     */
    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
        Validate.isTrue(maxSize >= 0, "maxSize must be 0 (unlimited) or larger");
        final ConstrainableInputStream input = ConstrainableInputStream.wrap(inStream, bufferSize, maxSize);
        return input.readToByteBuffer(maxSize);
    }

    /**
     * Read the whole input stream into a byte buffer, with no size limit.
     * @param inStream the input stream to read from
     * @return the filled byte buffer
     * @throws IOException if an exception occurs whilst reading from the input stream.
     */
    static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException {
        return readToByteBuffer(inStream, 0);
    }

    /**
     * Read the full content of a file into a byte buffer. The file is closed before returning.
     * @param file the file to read
     * @return a byte buffer wrapping the file's bytes
     * @throws IOException if the file cannot be opened or fully read
     */
    static ByteBuffer readFileToByteBuffer(File file) throws IOException {
        RandomAccessFile randomAccessFile = null;
        try {
            randomAccessFile = new RandomAccessFile(file, "r");
            byte[] bytes = new byte[(int) randomAccessFile.length()];
            randomAccessFile.readFully(bytes);
            return ByteBuffer.wrap(bytes);
        } finally {
            if (randomAccessFile != null)
                randomAccessFile.close();
        }
    }

    /**
     * Creates a new, zero capacity byte buffer.
     * @return an empty byte buffer
     */
    static ByteBuffer emptyByteBuffer() {
        return ByteBuffer.allocate(0);
    }

    /**
     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
     * will kick in.)
     * @param contentType e.g. "text/html; charset=EUC-JP"
     * @return "EUC-JP", or null if not found. The charset is trimmed, and returned as written when that form is
     * supported; it is only uppercased when just the uppercase form is supported.
     */
    static String getCharsetFromContentType(String contentType) {
        if (contentType == null) return null;
        Matcher m = charsetPattern.matcher(contentType);
        if (m.find()) {
            String charset = m.group(1).trim();
            charset = charset.replace("charset=", "");
            return validateCharset(charset);
        }
        return null;
    }

    /**
     * Normalises a candidate charset name (trims it and strips quote characters) and checks that this JVM supports it.
     * @param cs candidate charset name; may be null or empty
     * @return the cleaned name as given if supported, else its uppercase form if that is supported, else null
     */
    private static String validateCharset(String cs) {
        if (cs == null || cs.length() == 0) return null;
        cs = cs.trim().replaceAll("[\"']", "");
        try {
            if (Charset.isSupported(cs)) return cs;
            cs = cs.toUpperCase(Locale.ENGLISH);
            if (Charset.isSupported(cs)) return cs;
        } catch (IllegalCharsetNameException e) {
            // if the charset name is not legal, we just take the default
        }
        return null;
    }

    /**
     * Creates a random string, suitable for use as a mime boundary.
     * @return a string of {@link #boundaryLength} characters drawn from the mime boundary alphabet
     */
    static String mimeBoundary() {
        final StringBuilder mime = new StringBuilder(boundaryLength);
        final Random rand = new Random();
        for (int i = 0; i < boundaryLength; i++) {
            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
        }
        return mime.toString();
    }

    /**
     * Looks for a byte order mark at the start of the buffer. Inputs shorter than four bytes are still inspected, and
     * each signature is only matched when enough bytes are actually present - so a short input can't be mistaken for
     * a longer BOM because of padding.
     * @param byteData the first bytes of the input
     * @return the charset implied by the BOM and how many bytes the caller should skip, or null if there is no BOM
     */
    private static BomCharset detectCharsetFromBom(final ByteBuffer byteData) {
        final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat
        buffer.mark();
        final byte[] bom = new byte[4];
        final int available = Math.min(byteData.remaining(), bom.length);
        if (available > 0) {
            byteData.get(bom, 0, available);
            buffer.rewind();
        }
        if (available >= 4 &&
            (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
             bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00)) { // LE
            return new BomCharset("UTF-32", 0); // and I hope it's on your system
        } else if (available >= 2 &&
            (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
             bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE)) { // LE
            return new BomCharset("UTF-16", 0); // in all Javas
        } else if (available >= 3 &&
            bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
            return new BomCharset("UTF-8", 3); // in all Javas
            // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
        }
        return null;
    }

    /** The charset identified by a byte order mark, and the number of leading bytes to skip before decoding. */
    private static class BomCharset {
        private final String charset;
        private final int offset;

        public BomCharset(String charset, int offset) {
            this.charset = charset;
            this.offset = offset;
        }
    }
}