001/* 002 * ==================================================================== 003 * Licensed to the Apache Software Foundation (ASF) under one 004 * or more contributor license agreements. See the NOTICE file 005 * distributed with this work for additional information 006 * regarding copyright ownership. The ASF licenses this file 007 * to you under the Apache License, Version 2.0 (the 008 * "License"); you may not use this file except in compliance 009 * with the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, 014 * software distributed under the License is distributed on an 015 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 016 * KIND, either express or implied. See the License for the 017 * specific language governing permissions and limitations 018 * under the License. 019 * ==================================================================== 020 * 021 * This software consists of voluntary contributions made by many 022 * individuals on behalf of the Apache Software Foundation. For more 023 * information on the Apache Software Foundation, please see 024 * <http://www.apache.org/>. 025 * 026 */ 027 028package org.apache.http.client.utils; 029 030import java.io.IOException; 031import java.io.InputStream; 032import java.io.InputStreamReader; 033import java.io.Reader; 034import java.net.URI; 035import java.nio.ByteBuffer; 036import java.nio.CharBuffer; 037import java.nio.charset.Charset; 038import java.util.ArrayList; 039import java.util.BitSet; 040import java.util.Collections; 041import java.util.List; 042import java.util.Scanner; 043 044import org.apache.http.Consts; 045import org.apache.http.Header; 046import org.apache.http.HeaderElement; 047import org.apache.http.HttpEntity; 048import org.apache.http.NameValuePair; 049import org.apache.http.entity.ContentType; 050import org.apache.http.message.BasicNameValuePair; 051import org.apache.http.message.ParserCursor; 052import org.apache.http.message.TokenParser; 053import org.apache.http.protocol.HTTP; 054import org.apache.http.util.Args; 055import org.apache.http.util.CharArrayBuffer; 056 057/** 058 * A collection of utilities for encoding URLs. 059 * 060 * @since 4.0 061 */ 062public class URLEncodedUtils { 063 064 /** 065 * The default HTML form content type. 066 */ 067 public static final String CONTENT_TYPE = "application/x-www-form-urlencoded"; 068 069 private static final char QP_SEP_A = '&'; 070 private static final char QP_SEP_S = ';'; 071 private static final String NAME_VALUE_SEPARATOR = "="; 072 073 /** 074 * @deprecated 4.5 Use {@link #parse(URI, Charset)} 075 */ 076 public static List <NameValuePair> parse(final URI uri, final String charsetName) { 077 return parse(uri, charsetName != null ? Charset.forName(charsetName) : null); 078 } 079 080 /** 081 * Returns a list of {@link NameValuePair NameValuePairs} as built from the URI's query portion. For example, a URI 082 * of {@code http://example.org/path/to/file?a=1&b=2&c=3} would return a list of three NameValuePairs, one for a=1, 083 * one for b=2, and one for c=3. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators. 084 * <p> 085 * This is typically useful while parsing an HTTP PUT. 086 * 087 * This API is currently only used for testing. 088 * 089 * @param uri 090 * URI to parse 091 * @param charset 092 * Charset to use while parsing the query 093 * @return a list of {@link NameValuePair} as built from the URI's query portion. 094 * 095 * @since 4.5 096 */ 097 public static List <NameValuePair> parse(final URI uri, final Charset charset) { 098 Args.notNull(uri, "URI"); 099 final String query = uri.getRawQuery(); 100 if (query != null && !query.isEmpty()) { 101 return parse(query, charset); 102 } 103 return Collections.emptyList(); 104 } 105 106 /** 107 * Returns a list of {@link NameValuePair NameValuePairs} as parsed from an {@link HttpEntity}. 108 * The encoding is taken from the entity's Content-Encoding header. 109 * <p> 110 * This is typically used while parsing an HTTP POST. 111 * 112 * @param entity 113 * The entity to parse 114 * @return a list of {@link NameValuePair} as built from the URI's query portion. 115 * @throws IOException 116 * If there was an exception getting the entity's data. 117 */ 118 public static List <NameValuePair> parse( 119 final HttpEntity entity) throws IOException { 120 Args.notNull(entity, "HTTP entity"); 121 final ContentType contentType = ContentType.get(entity); 122 if (contentType == null || !contentType.getMimeType().equalsIgnoreCase(CONTENT_TYPE)) { 123 return Collections.emptyList(); 124 } 125 final long len = entity.getContentLength(); 126 Args.check(len <= Integer.MAX_VALUE, "HTTP entity is too large"); 127 final Charset charset = contentType.getCharset() != null ? contentType.getCharset() : HTTP.DEF_CONTENT_CHARSET; 128 final InputStream instream = entity.getContent(); 129 if (instream == null) { 130 return Collections.emptyList(); 131 } 132 final CharArrayBuffer buf; 133 try { 134 buf = new CharArrayBuffer(len > 0 ? (int) len : 1024); 135 final Reader reader = new InputStreamReader(instream, charset); 136 final char[] tmp = new char[1024]; 137 int l; 138 while((l = reader.read(tmp)) != -1) { 139 buf.append(tmp, 0, l); 140 } 141 142 } finally { 143 instream.close(); 144 } 145 if (buf.length() == 0) { 146 return Collections.emptyList(); 147 } 148 return parse(buf, charset, QP_SEP_A); 149 } 150 151 /** 152 * Returns true if the entity's Content-Type header is 153 * {@code application/x-www-form-urlencoded}. 154 */ 155 public static boolean isEncoded(final HttpEntity entity) { 156 Args.notNull(entity, "HTTP entity"); 157 final Header h = entity.getContentType(); 158 if (h != null) { 159 final HeaderElement[] elems = h.getElements(); 160 if (elems.length > 0) { 161 final String contentType = elems[0].getName(); 162 return contentType.equalsIgnoreCase(CONTENT_TYPE); 163 } 164 } 165 return false; 166 } 167 168 /** 169 * Adds all parameters within the Scanner to the list of {@code parameters}, as encoded by 170 * {@code encoding}. For example, a scanner containing the string {@code a=1&b=2&c=3} would add the 171 * {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the list of parameters. By convention, {@code '&'} and 172 * {@code ';'} are accepted as parameter separators. 173 * 174 * @param parameters 175 * List to add parameters to. 176 * @param scanner 177 * Input that contains the parameters to parse. 178 * @param charset 179 * Encoding to use when decoding the parameters. 180 * 181 * @deprecated (4.4) use {@link #parse(String, java.nio.charset.Charset)} 182 */ 183 @Deprecated 184 public static void parse( 185 final List<NameValuePair> parameters, 186 final Scanner scanner, 187 final String charset) { 188 parse(parameters, scanner, "[" + QP_SEP_A + QP_SEP_S + "]", charset); 189 } 190 191 /** 192 * Adds all parameters within the Scanner to the list of 193 * {@code parameters}, as encoded by {@code encoding}. For 194 * example, a scanner containing the string {@code a=1&b=2&c=3} would 195 * add the {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the 196 * list of parameters. 197 * 198 * @param parameters 199 * List to add parameters to. 200 * @param scanner 201 * Input that contains the parameters to parse. 202 * @param parameterSepartorPattern 203 * The Pattern string for parameter separators, by convention {@code "[&;]"} 204 * @param charset 205 * Encoding to use when decoding the parameters. 206 * 207 * @deprecated (4.4) use {@link #parse(org.apache.http.util.CharArrayBuffer, java.nio.charset.Charset, char...)} 208 */ 209 @Deprecated 210 public static void parse( 211 final List <NameValuePair> parameters, 212 final Scanner scanner, 213 final String parameterSepartorPattern, 214 final String charset) { 215 scanner.useDelimiter(parameterSepartorPattern); 216 while (scanner.hasNext()) { 217 final String name; 218 final String value; 219 final String token = scanner.next(); 220 final int i = token.indexOf(NAME_VALUE_SEPARATOR); 221 if (i != -1) { 222 name = decodeFormFields(token.substring(0, i).trim(), charset); 223 value = decodeFormFields(token.substring(i + 1).trim(), charset); 224 } else { 225 name = decodeFormFields(token.trim(), charset); 226 value = null; 227 } 228 parameters.add(new BasicNameValuePair(name, value)); 229 } 230 } 231 232 /** 233 * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character 234 * encoding. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators. 235 * 236 * @param s 237 * text to parse. 238 * @param charset 239 * Encoding to use when decoding the parameters. 240 * @return a list of {@link NameValuePair} as built from the URI's query portion. 241 * 242 * @since 4.2 243 */ 244 public static List<NameValuePair> parse(final String s, final Charset charset) { 245 if (s == null) { 246 return Collections.emptyList(); 247 } 248 final CharArrayBuffer buffer = new CharArrayBuffer(s.length()); 249 buffer.append(s); 250 return parse(buffer, charset, QP_SEP_A, QP_SEP_S); 251 } 252 253 /** 254 * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character 255 * encoding. 256 * 257 * @param s 258 * text to parse. 259 * @param charset 260 * Encoding to use when decoding the parameters. 261 * @param separators 262 * element separators. 263 * @return a list of {@link NameValuePair} as built from the URI's query portion. 264 * 265 * @since 4.3 266 */ 267 public static List<NameValuePair> parse(final String s, final Charset charset, final char... separators) { 268 if (s == null) { 269 return Collections.emptyList(); 270 } 271 final CharArrayBuffer buffer = new CharArrayBuffer(s.length()); 272 buffer.append(s); 273 return parse(buffer, charset, separators); 274 } 275 276 /** 277 * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using 278 * the given character encoding. 279 * 280 * @param buf 281 * text to parse. 282 * @param charset 283 * Encoding to use when decoding the parameters. 284 * @param separators 285 * element separators. 286 * @return a list of {@link NameValuePair} as built from the URI's query portion. 287 * 288 * @since 4.4 289 */ 290 public static List<NameValuePair> parse( 291 final CharArrayBuffer buf, final Charset charset, final char... separators) { 292 Args.notNull(buf, "Char array buffer"); 293 final TokenParser tokenParser = TokenParser.INSTANCE; 294 final BitSet delimSet = new BitSet(); 295 for (final char separator: separators) { 296 delimSet.set(separator); 297 } 298 final ParserCursor cursor = new ParserCursor(0, buf.length()); 299 final List<NameValuePair> list = new ArrayList<NameValuePair>(); 300 while (!cursor.atEnd()) { 301 delimSet.set('='); 302 final String name = tokenParser.parseToken(buf, cursor, delimSet); 303 String value = null; 304 if (!cursor.atEnd()) { 305 final int delim = buf.charAt(cursor.getPos()); 306 cursor.updatePos(cursor.getPos() + 1); 307 if (delim == '=') { 308 delimSet.clear('='); 309 value = tokenParser.parseValue(buf, cursor, delimSet); 310 if (!cursor.atEnd()) { 311 cursor.updatePos(cursor.getPos() + 1); 312 } 313 } 314 } 315 if (!name.isEmpty()) { 316 list.add(new BasicNameValuePair( 317 decodeFormFields(name, charset), 318 decodeFormFields(value, charset))); 319 } 320 } 321 return list; 322 } 323 324 /** 325 * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded} 326 * list of parameters in an HTTP PUT or HTTP POST. 327 * 328 * @param parameters The parameters to include. 329 * @param charset The encoding to use. 330 * @return An {@code application/x-www-form-urlencoded} string 331 */ 332 public static String format( 333 final List <? extends NameValuePair> parameters, 334 final String charset) { 335 return format(parameters, QP_SEP_A, charset); 336 } 337 338 /** 339 * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded} 340 * list of parameters in an HTTP PUT or HTTP POST. 341 * 342 * @param parameters The parameters to include. 343 * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}. 344 * @param charset The encoding to use. 345 * @return An {@code application/x-www-form-urlencoded} string 346 * 347 * @since 4.3 348 */ 349 public static String format( 350 final List <? extends NameValuePair> parameters, 351 final char parameterSeparator, 352 final String charset) { 353 final StringBuilder result = new StringBuilder(); 354 for (final NameValuePair parameter : parameters) { 355 final String encodedName = encodeFormFields(parameter.getName(), charset); 356 final String encodedValue = encodeFormFields(parameter.getValue(), charset); 357 if (result.length() > 0) { 358 result.append(parameterSeparator); 359 } 360 result.append(encodedName); 361 if (encodedValue != null) { 362 result.append(NAME_VALUE_SEPARATOR); 363 result.append(encodedValue); 364 } 365 } 366 return result.toString(); 367 } 368 369 /** 370 * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded} 371 * list of parameters in an HTTP PUT or HTTP POST. 372 * 373 * @param parameters The parameters to include. 374 * @param charset The encoding to use. 375 * @return An {@code application/x-www-form-urlencoded} string 376 * 377 * @since 4.2 378 */ 379 public static String format( 380 final Iterable<? extends NameValuePair> parameters, 381 final Charset charset) { 382 return format(parameters, QP_SEP_A, charset); 383 } 384 385 /** 386 * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded} 387 * list of parameters in an HTTP PUT or HTTP POST. 388 * 389 * @param parameters The parameters to include. 390 * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}. 391 * @param charset The encoding to use. 392 * @return An {@code application/x-www-form-urlencoded} string 393 * 394 * @since 4.3 395 */ 396 public static String format( 397 final Iterable<? extends NameValuePair> parameters, 398 final char parameterSeparator, 399 final Charset charset) { 400 Args.notNull(parameters, "Parameters"); 401 final StringBuilder result = new StringBuilder(); 402 for (final NameValuePair parameter : parameters) { 403 final String encodedName = encodeFormFields(parameter.getName(), charset); 404 final String encodedValue = encodeFormFields(parameter.getValue(), charset); 405 if (result.length() > 0) { 406 result.append(parameterSeparator); 407 } 408 result.append(encodedName); 409 if (encodedValue != null) { 410 result.append(NAME_VALUE_SEPARATOR); 411 result.append(encodedValue); 412 } 413 } 414 return result.toString(); 415 } 416 417 /** 418 * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *} 419 * <p> 420 * This list is the same as the {@code unreserved} list in 421 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 422 */ 423 private static final BitSet UNRESERVED = new BitSet(256); 424 /** 425 * Punctuation characters: , ; : $ & + = 426 * <p> 427 * These are the additional characters allowed by userinfo. 428 */ 429 private static final BitSet PUNCT = new BitSet(256); 430 /** Characters which are safe to use in userinfo, 431 * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */ 432 private static final BitSet USERINFO = new BitSet(256); 433 /** Characters which are safe to use in a path, 434 * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation plus / @ */ 435 private static final BitSet PATHSAFE = new BitSet(256); 436 /** Characters which are safe to use in a query or a fragment, 437 * i.e. {@link #RESERVED} plus {@link #UNRESERVED} */ 438 private static final BitSet URIC = new BitSet(256); 439 440 /** 441 * Reserved characters, i.e. {@code ;/?:@&=+$,[]} 442 * <p> 443 * This list is the same as the {@code reserved} list in 444 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 445 * as augmented by 446 * <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a> 447 */ 448 private static final BitSet RESERVED = new BitSet(256); 449 450 451 /** 452 * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour, 453 * i.e. alphanumeric plus {@code "-", "_", ".", "*"} 454 */ 455 private static final BitSet URLENCODER = new BitSet(256); 456 457 static { 458 // unreserved chars 459 // alpha characters 460 for (int i = 'a'; i <= 'z'; i++) { 461 UNRESERVED.set(i); 462 } 463 for (int i = 'A'; i <= 'Z'; i++) { 464 UNRESERVED.set(i); 465 } 466 // numeric characters 467 for (int i = '0'; i <= '9'; i++) { 468 UNRESERVED.set(i); 469 } 470 UNRESERVED.set('_'); // these are the charactes of the "mark" list 471 UNRESERVED.set('-'); 472 UNRESERVED.set('.'); 473 UNRESERVED.set('*'); 474 URLENCODER.or(UNRESERVED); // skip remaining unreserved characters 475 UNRESERVED.set('!'); 476 UNRESERVED.set('~'); 477 UNRESERVED.set('\''); 478 UNRESERVED.set('('); 479 UNRESERVED.set(')'); 480 // punct chars 481 PUNCT.set(','); 482 PUNCT.set(';'); 483 PUNCT.set(':'); 484 PUNCT.set('$'); 485 PUNCT.set('&'); 486 PUNCT.set('+'); 487 PUNCT.set('='); 488 // Safe for userinfo 489 USERINFO.or(UNRESERVED); 490 USERINFO.or(PUNCT); 491 492 // URL path safe 493 PATHSAFE.or(UNRESERVED); 494 PATHSAFE.set('/'); // segment separator 495 PATHSAFE.set(';'); // param separator 496 PATHSAFE.set(':'); // rest as per list in 2396, i.e. : @ & = + $ , 497 PATHSAFE.set('@'); 498 PATHSAFE.set('&'); 499 PATHSAFE.set('='); 500 PATHSAFE.set('+'); 501 PATHSAFE.set('$'); 502 PATHSAFE.set(','); 503 504 RESERVED.set(';'); 505 RESERVED.set('/'); 506 RESERVED.set('?'); 507 RESERVED.set(':'); 508 RESERVED.set('@'); 509 RESERVED.set('&'); 510 RESERVED.set('='); 511 RESERVED.set('+'); 512 RESERVED.set('$'); 513 RESERVED.set(','); 514 RESERVED.set('['); // added by RFC 2732 515 RESERVED.set(']'); // added by RFC 2732 516 517 URIC.or(RESERVED); 518 URIC.or(UNRESERVED); 519 } 520 521 private static final int RADIX = 16; 522 523 private static String urlEncode( 524 final String content, 525 final Charset charset, 526 final BitSet safechars, 527 final boolean blankAsPlus) { 528 if (content == null) { 529 return null; 530 } 531 final StringBuilder buf = new StringBuilder(); 532 final ByteBuffer bb = charset.encode(content); 533 while (bb.hasRemaining()) { 534 final int b = bb.get() & 0xff; 535 if (safechars.get(b)) { 536 buf.append((char) b); 537 } else if (blankAsPlus && b == ' ') { 538 buf.append('+'); 539 } else { 540 buf.append("%"); 541 final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX)); 542 final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX)); 543 buf.append(hex1); 544 buf.append(hex2); 545 } 546 } 547 return buf.toString(); 548 } 549 550 /** 551 * Decode/unescape a portion of a URL, to use with the query part ensure {@code plusAsBlank} is true. 552 * 553 * @param content the portion to decode 554 * @param charset the charset to use 555 * @param plusAsBlank if {@code true}, then convert '+' to space (e.g. for www-url-form-encoded content), otherwise leave as is. 556 * @return encoded string 557 */ 558 private static String urlDecode( 559 final String content, 560 final Charset charset, 561 final boolean plusAsBlank) { 562 if (content == null) { 563 return null; 564 } 565 final ByteBuffer bb = ByteBuffer.allocate(content.length()); 566 final CharBuffer cb = CharBuffer.wrap(content); 567 while (cb.hasRemaining()) { 568 final char c = cb.get(); 569 if (c == '%' && cb.remaining() >= 2) { 570 final char uc = cb.get(); 571 final char lc = cb.get(); 572 final int u = Character.digit(uc, 16); 573 final int l = Character.digit(lc, 16); 574 if (u != -1 && l != -1) { 575 bb.put((byte) ((u << 4) + l)); 576 } else { 577 bb.put((byte) '%'); 578 bb.put((byte) uc); 579 bb.put((byte) lc); 580 } 581 } else if (plusAsBlank && c == '+') { 582 bb.put((byte) ' '); 583 } else { 584 bb.put((byte) c); 585 } 586 } 587 bb.flip(); 588 return charset.decode(bb).toString(); 589 } 590 591 /** 592 * Decode/unescape www-url-form-encoded content. 593 * 594 * @param content the content to decode, will decode '+' as space 595 * @param charset the charset to use 596 * @return encoded string 597 */ 598 private static String decodeFormFields (final String content, final String charset) { 599 if (content == null) { 600 return null; 601 } 602 return urlDecode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, true); 603 } 604 605 /** 606 * Decode/unescape www-url-form-encoded content. 607 * 608 * @param content the content to decode, will decode '+' as space 609 * @param charset the charset to use 610 * @return encoded string 611 */ 612 private static String decodeFormFields (final String content, final Charset charset) { 613 if (content == null) { 614 return null; 615 } 616 return urlDecode(content, charset != null ? charset : Consts.UTF_8, true); 617 } 618 619 /** 620 * Encode/escape www-url-form-encoded content. 621 * <p> 622 * Uses the {@link #URLENCODER} set of characters, rather than 623 * the {@link #UNRESERVED} set; this is for compatibilty with previous 624 * releases, URLEncoder.encode() and most browsers. 625 * 626 * @param content the content to encode, will convert space to '+' 627 * @param charset the charset to use 628 * @return encoded string 629 */ 630 private static String encodeFormFields(final String content, final String charset) { 631 if (content == null) { 632 return null; 633 } 634 return urlEncode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, URLENCODER, true); 635 } 636 637 /** 638 * Encode/escape www-url-form-encoded content. 639 * <p> 640 * Uses the {@link #URLENCODER} set of characters, rather than 641 * the {@link #UNRESERVED} set; this is for compatibilty with previous 642 * releases, URLEncoder.encode() and most browsers. 643 * 644 * @param content the content to encode, will convert space to '+' 645 * @param charset the charset to use 646 * @return encoded string 647 */ 648 private static String encodeFormFields (final String content, final Charset charset) { 649 if (content == null) { 650 return null; 651 } 652 return urlEncode(content, charset != null ? charset : Consts.UTF_8, URLENCODER, true); 653 } 654 655 /** 656 * Encode a String using the {@link #USERINFO} set of characters. 657 * <p> 658 * Used by URIBuilder to encode the userinfo segment. 659 * 660 * @param content the string to encode, does not convert space to '+' 661 * @param charset the charset to use 662 * @return the encoded string 663 */ 664 static String encUserInfo(final String content, final Charset charset) { 665 return urlEncode(content, charset, USERINFO, false); 666 } 667 668 /** 669 * Encode a String using the {@link #URIC} set of characters. 670 * <p> 671 * Used by URIBuilder to encode the query and fragment segments. 672 * 673 * @param content the string to encode, does not convert space to '+' 674 * @param charset the charset to use 675 * @return the encoded string 676 */ 677 static String encUric(final String content, final Charset charset) { 678 return urlEncode(content, charset, URIC, false); 679 } 680 681 /** 682 * Encode a String using the {@link #PATHSAFE} set of characters. 683 * <p> 684 * Used by URIBuilder to encode path segments. 685 * 686 * @param content the string to encode, does not convert space to '+' 687 * @param charset the charset to use 688 * @return the encoded string 689 */ 690 static String encPath(final String content, final Charset charset) { 691 return urlEncode(content, charset, PATHSAFE, false); 692 } 693 694}