/*
 * ====================================================================
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 *
 */
027
028package org.apache.http.client.utils;
029
030import java.io.IOException;
031import java.io.InputStream;
032import java.io.InputStreamReader;
033import java.io.Reader;
034import java.net.URI;
035import java.nio.ByteBuffer;
036import java.nio.CharBuffer;
037import java.nio.charset.Charset;
038import java.util.ArrayList;
039import java.util.BitSet;
040import java.util.Collections;
041import java.util.List;
042import java.util.Scanner;
043
044import org.apache.http.Consts;
045import org.apache.http.Header;
046import org.apache.http.HeaderElement;
047import org.apache.http.HttpEntity;
048import org.apache.http.NameValuePair;
049import org.apache.http.entity.ContentType;
050import org.apache.http.message.BasicNameValuePair;
051import org.apache.http.message.ParserCursor;
052import org.apache.http.message.TokenParser;
053import org.apache.http.protocol.HTTP;
054import org.apache.http.util.Args;
055import org.apache.http.util.CharArrayBuffer;
056
057/**
058 * A collection of utilities for encoding URLs.
059 *
060 * @since 4.0
061 */
062public class URLEncodedUtils {
063
064    /**
065     * The default HTML form content type.
066     */
067    public static final String CONTENT_TYPE = "application/x-www-form-urlencoded";
068
069    private static final char QP_SEP_A = '&';
070    private static final char QP_SEP_S = ';';
071    private static final String NAME_VALUE_SEPARATOR = "=";
072
073    /**
074     * @deprecated 4.5 Use {@link #parse(URI, Charset)}
075     */
076    public static List <NameValuePair> parse(final URI uri, final String charsetName) {
077        return parse(uri, charsetName != null ? Charset.forName(charsetName) : null);
078    }
079
080    /**
081     * Returns a list of {@link NameValuePair NameValuePairs} as built from the URI's query portion. For example, a URI
082     * of {@code http://example.org/path/to/file?a=1&b=2&c=3} would return a list of three NameValuePairs, one for a=1,
083     * one for b=2, and one for c=3. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
084     * <p>
085     * This is typically useful while parsing an HTTP PUT.
086     *
087     * This API is currently only used for testing.
088     *
089     * @param uri
090     *        URI to parse
091     * @param charset
092     *        Charset to use while parsing the query
093     * @return a list of {@link NameValuePair} as built from the URI's query portion.
094     *
095     * @since 4.5
096     */
097    public static List <NameValuePair> parse(final URI uri, final Charset charset) {
098        Args.notNull(uri, "URI");
099        final String query = uri.getRawQuery();
100        if (query != null && !query.isEmpty()) {
101            return parse(query, charset);
102        }
103        return Collections.emptyList();
104    }
105
106    /**
107     * Returns a list of {@link NameValuePair NameValuePairs} as parsed from an {@link HttpEntity}.
108     * The encoding is taken from the entity's Content-Encoding header.
109     * <p>
110     * This is typically used while parsing an HTTP POST.
111     *
112     * @param entity
113     *            The entity to parse
114     * @return a list of {@link NameValuePair} as built from the URI's query portion.
115     * @throws IOException
116     *             If there was an exception getting the entity's data.
117     */
118    public static List <NameValuePair> parse(
119            final HttpEntity entity) throws IOException {
120        Args.notNull(entity, "HTTP entity");
121        final ContentType contentType = ContentType.get(entity);
122        if (contentType == null || !contentType.getMimeType().equalsIgnoreCase(CONTENT_TYPE)) {
123            return Collections.emptyList();
124        }
125        final long len = entity.getContentLength();
126        Args.check(len <= Integer.MAX_VALUE, "HTTP entity is too large");
127        final Charset charset = contentType.getCharset() != null ? contentType.getCharset() : HTTP.DEF_CONTENT_CHARSET;
128        final InputStream instream = entity.getContent();
129        if (instream == null) {
130            return Collections.emptyList();
131        }
132        final CharArrayBuffer buf;
133        try {
134            buf = new CharArrayBuffer(len > 0 ? (int) len : 1024);
135            final Reader reader = new InputStreamReader(instream, charset);
136            final char[] tmp = new char[1024];
137            int l;
138            while((l = reader.read(tmp)) != -1) {
139                buf.append(tmp, 0, l);
140            }
141
142        } finally {
143            instream.close();
144        }
145        if (buf.length() == 0) {
146            return Collections.emptyList();
147        }
148        return parse(buf, charset, QP_SEP_A);
149    }
150
151    /**
152     * Returns true if the entity's Content-Type header is
153     * {@code application/x-www-form-urlencoded}.
154     */
155    public static boolean isEncoded(final HttpEntity entity) {
156        Args.notNull(entity, "HTTP entity");
157        final Header h = entity.getContentType();
158        if (h != null) {
159            final HeaderElement[] elems = h.getElements();
160            if (elems.length > 0) {
161                final String contentType = elems[0].getName();
162                return contentType.equalsIgnoreCase(CONTENT_TYPE);
163            }
164        }
165        return false;
166    }
167
168    /**
169     * Adds all parameters within the Scanner to the list of {@code parameters}, as encoded by
170     * {@code encoding}. For example, a scanner containing the string {@code a=1&b=2&c=3} would add the
171     * {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the list of parameters. By convention, {@code '&'} and
172     * {@code ';'} are accepted as parameter separators.
173     *
174     * @param parameters
175     *            List to add parameters to.
176     * @param scanner
177     *            Input that contains the parameters to parse.
178     * @param charset
179     *            Encoding to use when decoding the parameters.
180     *
181     * @deprecated (4.4) use {@link #parse(String, java.nio.charset.Charset)}
182     */
183    @Deprecated
184    public static void parse(
185            final List<NameValuePair> parameters,
186            final Scanner scanner,
187            final String charset) {
188        parse(parameters, scanner, "[" + QP_SEP_A + QP_SEP_S + "]", charset);
189    }
190
191    /**
192     * Adds all parameters within the Scanner to the list of
193     * {@code parameters}, as encoded by {@code encoding}. For
194     * example, a scanner containing the string {@code a=1&b=2&c=3} would
195     * add the {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the
196     * list of parameters.
197     *
198     * @param parameters
199     *            List to add parameters to.
200     * @param scanner
201     *            Input that contains the parameters to parse.
202     * @param parameterSepartorPattern
203     *            The Pattern string for parameter separators, by convention {@code "[&;]"}
204     * @param charset
205     *            Encoding to use when decoding the parameters.
206     *
207     * @deprecated (4.4) use {@link #parse(org.apache.http.util.CharArrayBuffer, java.nio.charset.Charset, char...)}
208     */
209    @Deprecated
210    public static void parse(
211            final List <NameValuePair> parameters,
212            final Scanner scanner,
213            final String parameterSepartorPattern,
214            final String charset) {
215        scanner.useDelimiter(parameterSepartorPattern);
216        while (scanner.hasNext()) {
217            final String name;
218            final String value;
219            final String token = scanner.next();
220            final int i = token.indexOf(NAME_VALUE_SEPARATOR);
221            if (i != -1) {
222                name = decodeFormFields(token.substring(0, i).trim(), charset);
223                value = decodeFormFields(token.substring(i + 1).trim(), charset);
224            } else {
225                name = decodeFormFields(token.trim(), charset);
226                value = null;
227            }
228            parameters.add(new BasicNameValuePair(name, value));
229        }
230    }
231
232    /**
233     * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
234     * encoding. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
235     *
236     * @param s
237     *            text to parse.
238     * @param charset
239     *            Encoding to use when decoding the parameters.
240     * @return a list of {@link NameValuePair} as built from the URI's query portion.
241     *
242     * @since 4.2
243     */
244    public static List<NameValuePair> parse(final String s, final Charset charset) {
245        if (s == null) {
246            return Collections.emptyList();
247        }
248        final CharArrayBuffer buffer = new CharArrayBuffer(s.length());
249        buffer.append(s);
250        return parse(buffer, charset, QP_SEP_A, QP_SEP_S);
251    }
252
253    /**
254     * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
255     * encoding.
256     *
257     * @param s
258     *            text to parse.
259     * @param charset
260     *            Encoding to use when decoding the parameters.
261     * @param separators
262     *            element separators.
263     * @return a list of {@link NameValuePair} as built from the URI's query portion.
264     *
265     * @since 4.3
266     */
267    public static List<NameValuePair> parse(final String s, final Charset charset, final char... separators) {
268        if (s == null) {
269            return Collections.emptyList();
270        }
271        final CharArrayBuffer buffer = new CharArrayBuffer(s.length());
272        buffer.append(s);
273        return parse(buffer, charset, separators);
274    }
275
276    /**
277     * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using
278     * the given character encoding.
279     *
280     * @param buf
281     *            text to parse.
282     * @param charset
283     *            Encoding to use when decoding the parameters.
284     * @param separators
285     *            element separators.
286     * @return a list of {@link NameValuePair} as built from the URI's query portion.
287     *
288     * @since 4.4
289     */
290    public static List<NameValuePair> parse(
291            final CharArrayBuffer buf, final Charset charset, final char... separators) {
292        Args.notNull(buf, "Char array buffer");
293        final TokenParser tokenParser = TokenParser.INSTANCE;
294        final BitSet delimSet = new BitSet();
295        for (final char separator: separators) {
296            delimSet.set(separator);
297        }
298        final ParserCursor cursor = new ParserCursor(0, buf.length());
299        final List<NameValuePair> list = new ArrayList<NameValuePair>();
300        while (!cursor.atEnd()) {
301            delimSet.set('=');
302            final String name = tokenParser.parseToken(buf, cursor, delimSet);
303            String value = null;
304            if (!cursor.atEnd()) {
305                final int delim = buf.charAt(cursor.getPos());
306                cursor.updatePos(cursor.getPos() + 1);
307                if (delim == '=') {
308                    delimSet.clear('=');
309                    value = tokenParser.parseValue(buf, cursor, delimSet);
310                    if (!cursor.atEnd()) {
311                        cursor.updatePos(cursor.getPos() + 1);
312                    }
313                }
314            }
315            if (!name.isEmpty()) {
316                list.add(new BasicNameValuePair(
317                        decodeFormFields(name, charset),
318                        decodeFormFields(value, charset)));
319            }
320        }
321        return list;
322    }
323
324    /**
325     * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
326     * list of parameters in an HTTP PUT or HTTP POST.
327     *
328     * @param parameters  The parameters to include.
329     * @param charset The encoding to use.
330     * @return An {@code application/x-www-form-urlencoded} string
331     */
332    public static String format(
333            final List <? extends NameValuePair> parameters,
334            final String charset) {
335        return format(parameters, QP_SEP_A, charset);
336    }
337
338    /**
339     * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
340     * list of parameters in an HTTP PUT or HTTP POST.
341     *
342     * @param parameters  The parameters to include.
343     * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
344     * @param charset The encoding to use.
345     * @return An {@code application/x-www-form-urlencoded} string
346     *
347     * @since 4.3
348     */
349    public static String format(
350            final List <? extends NameValuePair> parameters,
351            final char parameterSeparator,
352            final String charset) {
353        final StringBuilder result = new StringBuilder();
354        for (final NameValuePair parameter : parameters) {
355            final String encodedName = encodeFormFields(parameter.getName(), charset);
356            final String encodedValue = encodeFormFields(parameter.getValue(), charset);
357            if (result.length() > 0) {
358                result.append(parameterSeparator);
359            }
360            result.append(encodedName);
361            if (encodedValue != null) {
362                result.append(NAME_VALUE_SEPARATOR);
363                result.append(encodedValue);
364            }
365        }
366        return result.toString();
367    }
368
369    /**
370     * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
371     * list of parameters in an HTTP PUT or HTTP POST.
372     *
373     * @param parameters  The parameters to include.
374     * @param charset The encoding to use.
375     * @return An {@code application/x-www-form-urlencoded} string
376     *
377     * @since 4.2
378     */
379    public static String format(
380            final Iterable<? extends NameValuePair> parameters,
381            final Charset charset) {
382        return format(parameters, QP_SEP_A, charset);
383    }
384
385    /**
386     * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
387     * list of parameters in an HTTP PUT or HTTP POST.
388     *
389     * @param parameters  The parameters to include.
390     * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
391     * @param charset The encoding to use.
392     * @return An {@code application/x-www-form-urlencoded} string
393     *
394     * @since 4.3
395     */
396    public static String format(
397            final Iterable<? extends NameValuePair> parameters,
398            final char parameterSeparator,
399            final Charset charset) {
400        Args.notNull(parameters, "Parameters");
401        final StringBuilder result = new StringBuilder();
402        for (final NameValuePair parameter : parameters) {
403            final String encodedName = encodeFormFields(parameter.getName(), charset);
404            final String encodedValue = encodeFormFields(parameter.getValue(), charset);
405            if (result.length() > 0) {
406                result.append(parameterSeparator);
407            }
408            result.append(encodedName);
409            if (encodedValue != null) {
410                result.append(NAME_VALUE_SEPARATOR);
411                result.append(encodedValue);
412            }
413        }
414        return result.toString();
415    }
416
417    /**
418     * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *}
419     * <p>
420     *  This list is the same as the {@code unreserved} list in
421     *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
422     */
423    private static final BitSet UNRESERVED   = new BitSet(256);
424    /**
425     * Punctuation characters: , ; : $ & + =
426     * <p>
427     * These are the additional characters allowed by userinfo.
428     */
429    private static final BitSet PUNCT        = new BitSet(256);
430    /** Characters which are safe to use in userinfo,
431     * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */
432    private static final BitSet USERINFO     = new BitSet(256);
433    /** Characters which are safe to use in a path,
434     * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation plus / @ */
435    private static final BitSet PATHSAFE     = new BitSet(256);
436    /** Characters which are safe to use in a query or a fragment,
437     * i.e. {@link #RESERVED} plus {@link #UNRESERVED} */
438    private static final BitSet URIC     = new BitSet(256);
439
440    /**
441     * Reserved characters, i.e. {@code ;/?:@&=+$,[]}
442     * <p>
443     *  This list is the same as the {@code reserved} list in
444     *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
445     *  as augmented by
446     *  <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>
447     */
448    private static final BitSet RESERVED     = new BitSet(256);
449
450
451    /**
452     * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour,
453     * i.e. alphanumeric plus {@code "-", "_", ".", "*"}
454     */
455    private static final BitSet URLENCODER   = new BitSet(256);
456
457    static {
458        // unreserved chars
459        // alpha characters
460        for (int i = 'a'; i <= 'z'; i++) {
461            UNRESERVED.set(i);
462        }
463        for (int i = 'A'; i <= 'Z'; i++) {
464            UNRESERVED.set(i);
465        }
466        // numeric characters
467        for (int i = '0'; i <= '9'; i++) {
468            UNRESERVED.set(i);
469        }
470        UNRESERVED.set('_'); // these are the charactes of the "mark" list
471        UNRESERVED.set('-');
472        UNRESERVED.set('.');
473        UNRESERVED.set('*');
474        URLENCODER.or(UNRESERVED); // skip remaining unreserved characters
475        UNRESERVED.set('!');
476        UNRESERVED.set('~');
477        UNRESERVED.set('\'');
478        UNRESERVED.set('(');
479        UNRESERVED.set(')');
480        // punct chars
481        PUNCT.set(',');
482        PUNCT.set(';');
483        PUNCT.set(':');
484        PUNCT.set('$');
485        PUNCT.set('&');
486        PUNCT.set('+');
487        PUNCT.set('=');
488        // Safe for userinfo
489        USERINFO.or(UNRESERVED);
490        USERINFO.or(PUNCT);
491
492        // URL path safe
493        PATHSAFE.or(UNRESERVED);
494        PATHSAFE.set('/'); // segment separator
495        PATHSAFE.set(';'); // param separator
496        PATHSAFE.set(':'); // rest as per list in 2396, i.e. : @ & = + $ ,
497        PATHSAFE.set('@');
498        PATHSAFE.set('&');
499        PATHSAFE.set('=');
500        PATHSAFE.set('+');
501        PATHSAFE.set('$');
502        PATHSAFE.set(',');
503
504        RESERVED.set(';');
505        RESERVED.set('/');
506        RESERVED.set('?');
507        RESERVED.set(':');
508        RESERVED.set('@');
509        RESERVED.set('&');
510        RESERVED.set('=');
511        RESERVED.set('+');
512        RESERVED.set('$');
513        RESERVED.set(',');
514        RESERVED.set('['); // added by RFC 2732
515        RESERVED.set(']'); // added by RFC 2732
516
517        URIC.or(RESERVED);
518        URIC.or(UNRESERVED);
519    }
520
521    private static final int RADIX = 16;
522
523    private static String urlEncode(
524            final String content,
525            final Charset charset,
526            final BitSet safechars,
527            final boolean blankAsPlus) {
528        if (content == null) {
529            return null;
530        }
531        final StringBuilder buf = new StringBuilder();
532        final ByteBuffer bb = charset.encode(content);
533        while (bb.hasRemaining()) {
534            final int b = bb.get() & 0xff;
535            if (safechars.get(b)) {
536                buf.append((char) b);
537            } else if (blankAsPlus && b == ' ') {
538                buf.append('+');
539            } else {
540                buf.append("%");
541                final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
542                final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
543                buf.append(hex1);
544                buf.append(hex2);
545            }
546        }
547        return buf.toString();
548    }
549
550    /**
551     * Decode/unescape a portion of a URL, to use with the query part ensure {@code plusAsBlank} is true.
552     *
553     * @param content the portion to decode
554     * @param charset the charset to use
555     * @param plusAsBlank if {@code true}, then convert '+' to space (e.g. for www-url-form-encoded content), otherwise leave as is.
556     * @return encoded string
557     */
558    private static String urlDecode(
559            final String content,
560            final Charset charset,
561            final boolean plusAsBlank) {
562        if (content == null) {
563            return null;
564        }
565        final ByteBuffer bb = ByteBuffer.allocate(content.length());
566        final CharBuffer cb = CharBuffer.wrap(content);
567        while (cb.hasRemaining()) {
568            final char c = cb.get();
569            if (c == '%' && cb.remaining() >= 2) {
570                final char uc = cb.get();
571                final char lc = cb.get();
572                final int u = Character.digit(uc, 16);
573                final int l = Character.digit(lc, 16);
574                if (u != -1 && l != -1) {
575                    bb.put((byte) ((u << 4) + l));
576                } else {
577                    bb.put((byte) '%');
578                    bb.put((byte) uc);
579                    bb.put((byte) lc);
580                }
581            } else if (plusAsBlank && c == '+') {
582                bb.put((byte) ' ');
583            } else {
584                bb.put((byte) c);
585            }
586        }
587        bb.flip();
588        return charset.decode(bb).toString();
589    }
590
591    /**
592     * Decode/unescape www-url-form-encoded content.
593     *
594     * @param content the content to decode, will decode '+' as space
595     * @param charset the charset to use
596     * @return encoded string
597     */
598    private static String decodeFormFields (final String content, final String charset) {
599        if (content == null) {
600            return null;
601        }
602        return urlDecode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, true);
603    }
604
605    /**
606     * Decode/unescape www-url-form-encoded content.
607     *
608     * @param content the content to decode, will decode '+' as space
609     * @param charset the charset to use
610     * @return encoded string
611     */
612    private static String decodeFormFields (final String content, final Charset charset) {
613        if (content == null) {
614            return null;
615        }
616        return urlDecode(content, charset != null ? charset : Consts.UTF_8, true);
617    }
618
619    /**
620     * Encode/escape www-url-form-encoded content.
621     * <p>
622     * Uses the {@link #URLENCODER} set of characters, rather than
623     * the {@link #UNRESERVED} set; this is for compatibilty with previous
624     * releases, URLEncoder.encode() and most browsers.
625     *
626     * @param content the content to encode, will convert space to '+'
627     * @param charset the charset to use
628     * @return encoded string
629     */
630    private static String encodeFormFields(final String content, final String charset) {
631        if (content == null) {
632            return null;
633        }
634        return urlEncode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, URLENCODER, true);
635    }
636
637    /**
638     * Encode/escape www-url-form-encoded content.
639     * <p>
640     * Uses the {@link #URLENCODER} set of characters, rather than
641     * the {@link #UNRESERVED} set; this is for compatibilty with previous
642     * releases, URLEncoder.encode() and most browsers.
643     *
644     * @param content the content to encode, will convert space to '+'
645     * @param charset the charset to use
646     * @return encoded string
647     */
648    private static String encodeFormFields (final String content, final Charset charset) {
649        if (content == null) {
650            return null;
651        }
652        return urlEncode(content, charset != null ? charset : Consts.UTF_8, URLENCODER, true);
653    }
654
655    /**
656     * Encode a String using the {@link #USERINFO} set of characters.
657     * <p>
658     * Used by URIBuilder to encode the userinfo segment.
659     *
660     * @param content the string to encode, does not convert space to '+'
661     * @param charset the charset to use
662     * @return the encoded string
663     */
664    static String encUserInfo(final String content, final Charset charset) {
665        return urlEncode(content, charset, USERINFO, false);
666    }
667
668    /**
669     * Encode a String using the {@link #URIC} set of characters.
670     * <p>
671     * Used by URIBuilder to encode the query and fragment segments.
672     *
673     * @param content the string to encode, does not convert space to '+'
674     * @param charset the charset to use
675     * @return the encoded string
676     */
677    static String encUric(final String content, final Charset charset) {
678        return urlEncode(content, charset, URIC, false);
679    }
680
681    /**
682     * Encode a String using the {@link #PATHSAFE} set of characters.
683     * <p>
684     * Used by URIBuilder to encode path segments.
685     *
686     * @param content the string to encode, does not convert space to '+'
687     * @param charset the charset to use
688     * @return the encoded string
689     */
690    static String encPath(final String content, final Charset charset) {
691        return urlEncode(content, charset, PATHSAFE, false);
692    }
693
694}