001/*
002 * ====================================================================
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *   http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing,
014 * software distributed under the License is distributed on an
015 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
016 * KIND, either express or implied.  See the License for the
017 * specific language governing permissions and limitations
018 * under the License.
019 * ====================================================================
020 *
021 * This software consists of voluntary contributions made by many
022 * individuals on behalf of the Apache Software Foundation.  For more
023 * information on the Apache Software Foundation, please see
024 * <http://www.apache.org/>.
025 *
026 */
027
028package org.apache.http.message;
029
030import java.util.NoSuchElementException;
031
032import org.apache.http.HeaderIterator;
033import org.apache.http.ParseException;
034import org.apache.http.TokenIterator;
035import org.apache.http.util.Args;
036
037/**
038 * Basic implementation of a {@link TokenIterator}.
039 * This implementation parses {@code #token} sequences as
040 * defined by RFC 2616, section 2.
041 * It extends that definition somewhat beyond US-ASCII.
042 *
043 * @since 4.0
044 */
045public class BasicTokenIterator implements TokenIterator {
046
047    /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
048    // the order of the characters here is adjusted to put the
049    // most likely candidates at the beginning of the collection
050    public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
051
052
053    /** The iterator from which to obtain the next header. */
054    protected final HeaderIterator headerIt;
055
056    /**
057     * The value of the current header.
058     * This is the header value that includes {@link #currentToken}.
059     * Undefined if the iteration is over.
060     */
061    protected String currentHeader;
062
063    /**
064     * The token to be returned by the next call to {@link #nextToken()}.
065     * {@code null} if the iteration is over.
066     */
067    protected String currentToken;
068
069    /**
070     * The position after {@link #currentToken} in {@link #currentHeader}.
071     * Undefined if the iteration is over.
072     */
073    protected int searchPos;
074
075
076    /**
077     * Creates a new instance of {@link BasicTokenIterator}.
078     *
079     * @param headerIterator    the iterator for the headers to tokenize
080     */
081    public BasicTokenIterator(final HeaderIterator headerIterator) {
082        super();
083        this.headerIt = Args.notNull(headerIterator, "Header iterator");
084        this.searchPos = findNext(-1);
085    }
086
087
088    // non-javadoc, see interface TokenIterator
089    @Override
090    public boolean hasNext() {
091        return (this.currentToken != null);
092    }
093
094
095    /**
096     * Obtains the next token from this iteration.
097     *
098     * @return  the next token in this iteration
099     *
100     * @throws NoSuchElementException   if the iteration is already over
101     * @throws ParseException   if an invalid header value is encountered
102     */
103    @Override
104    public String nextToken()
105        throws NoSuchElementException, ParseException {
106
107        if (this.currentToken == null) {
108            throw new NoSuchElementException("Iteration already finished.");
109        }
110
111        final String result = this.currentToken;
112        // updates currentToken, may trigger ParseException:
113        this.searchPos = findNext(this.searchPos);
114
115        return result;
116    }
117
118
119    /**
120     * Returns the next token.
121     * Same as {@link #nextToken}, but with generic return type.
122     *
123     * @return  the next token in this iteration
124     *
125     * @throws NoSuchElementException   if there are no more tokens
126     * @throws ParseException   if an invalid header value is encountered
127     */
128    @Override
129    public final Object next()
130        throws NoSuchElementException, ParseException {
131        return nextToken();
132    }
133
134
135    /**
136     * Removing tokens is not supported.
137     *
138     * @throws UnsupportedOperationException    always
139     */
140    @Override
141    public final void remove()
142        throws UnsupportedOperationException {
143
144        throw new UnsupportedOperationException
145            ("Removing tokens is not supported.");
146    }
147
148
149    /**
150     * Determines the next token.
151     * If found, the token is stored in {@link #currentToken}.
152     * The return value indicates the position after the token
153     * in {@link #currentHeader}. If necessary, the next header
154     * will be obtained from {@link #headerIt}.
155     * If not found, {@link #currentToken} is set to {@code null}.
156     *
157     * @param pos       the position in the current header at which to
158     *                  start the search, -1 to search in the first header
159     *
160     * @return  the position after the found token in the current header, or
161     *          negative if there was no next token
162     *
163     * @throws ParseException   if an invalid header value is encountered
164     */
165    protected int findNext(final int pos) throws ParseException {
166        int from = pos;
167        if (from < 0) {
168            // called from the constructor, initialize the first header
169            if (!this.headerIt.hasNext()) {
170                return -1;
171            }
172            this.currentHeader = this.headerIt.nextHeader().getValue();
173            from = 0;
174        } else {
175            // called after a token, make sure there is a separator
176            from = findTokenSeparator(from);
177        }
178
179        final int start = findTokenStart(from);
180        if (start < 0) {
181            this.currentToken = null;
182            return -1; // nothing found
183        }
184
185        final int end = findTokenEnd(start);
186        this.currentToken = createToken(this.currentHeader, start, end);
187        return end;
188    }
189
190
191    /**
192     * Creates a new token to be returned.
193     * Called from {@link #findNext findNext} after the token is identified.
194     * The default implementation simply calls
195     * {@link java.lang.String#substring String.substring}.
196     * <p>
197     * If header values are significantly longer than tokens, and some
198     * tokens are permanently referenced by the application, there can
199     * be problems with garbage collection. A substring will hold a
200     * reference to the full characters of the original string and
201     * therefore occupies more memory than might be expected.
202     * To avoid this, override this method and create a new string
203     * instead of a substring.
204     * </p>
205     *
206     * @param value     the full header value from which to create a token
207     * @param start     the index of the first token character
208     * @param end       the index after the last token character
209     *
210     * @return  a string representing the token identified by the arguments
211     */
212    protected String createToken(final String value, final int start, final int end) {
213        return value.substring(start, end);
214    }
215
216
217    /**
218     * Determines the starting position of the next token.
219     * This method will iterate over headers if necessary.
220     *
221     * @param pos       the position in the current header at which to
222     *                  start the search
223     *
224     * @return  the position of the token start in the current header,
225     *          negative if no token start could be found
226     */
227    protected int findTokenStart(final int pos) {
228        int from = Args.notNegative(pos, "Search position");
229        boolean found = false;
230        while (!found && (this.currentHeader != null)) {
231
232            final int to = this.currentHeader.length();
233            while (!found && (from < to)) {
234
235                final char ch = this.currentHeader.charAt(from);
236                if (isTokenSeparator(ch) || isWhitespace(ch)) {
237                    // whitspace and token separators are skipped
238                    from++;
239                } else if (isTokenChar(this.currentHeader.charAt(from))) {
240                    // found the start of a token
241                    found = true;
242                } else {
243                    throw new ParseException
244                        ("Invalid character before token (pos " + from +
245                         "): " + this.currentHeader);
246                }
247            }
248            if (!found) {
249                if (this.headerIt.hasNext()) {
250                    this.currentHeader = this.headerIt.nextHeader().getValue();
251                    from = 0;
252                } else {
253                    this.currentHeader = null;
254                }
255            }
256        } // while headers
257
258        return found ? from : -1;
259    }
260
261
262    /**
263     * Determines the position of the next token separator.
264     * Because of multi-header joining rules, the end of a
265     * header value is a token separator. This method does
266     * therefore not need to iterate over headers.
267     *
268     * @param pos       the position in the current header at which to
269     *                  start the search
270     *
271     * @return  the position of a token separator in the current header,
272     *          or at the end
273     *
274     * @throws ParseException
275     *         if a new token is found before a token separator.
276     *         RFC 2616, section 2.1 explicitly requires a comma between
277     *         tokens for {@code #}.
278     */
279    protected int findTokenSeparator(final int pos) {
280        int from = Args.notNegative(pos, "Search position");
281        boolean found = false;
282        final int to = this.currentHeader.length();
283        while (!found && (from < to)) {
284            final char ch = this.currentHeader.charAt(from);
285            if (isTokenSeparator(ch)) {
286                found = true;
287            } else if (isWhitespace(ch)) {
288                from++;
289            } else if (isTokenChar(ch)) {
290                throw new ParseException
291                    ("Tokens without separator (pos " + from +
292                     "): " + this.currentHeader);
293            } else {
294                throw new ParseException
295                    ("Invalid character after token (pos " + from +
296                     "): " + this.currentHeader);
297            }
298        }
299
300        return from;
301    }
302
303
304    /**
305     * Determines the ending position of the current token.
306     * This method will not leave the current header value,
307     * since the end of the header value is a token boundary.
308     *
309     * @param from      the position of the first character of the token
310     *
311     * @return  the position after the last character of the token.
312     *          The behavior is undefined if {@code from} does not
313     *          point to a token character in the current header value.
314     */
315    protected int findTokenEnd(final int from) {
316        Args.notNegative(from, "Search position");
317        final int to = this.currentHeader.length();
318        int end = from+1;
319        while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
320            end++;
321        }
322
323        return end;
324    }
325
326
327    /**
328     * Checks whether a character is a token separator.
329     * RFC 2616, section 2.1 defines comma as the separator for
330     * {@code #token} sequences. The end of a header value will
331     * also separate tokens, but that is not a character check.
332     *
333     * @param ch        the character to check
334     *
335     * @return  {@code true} if the character is a token separator,
336     *          {@code false} otherwise
337     */
338    protected boolean isTokenSeparator(final char ch) {
339        return (ch == ',');
340    }
341
342
343    /**
344     * Checks whether a character is a whitespace character.
345     * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
346     * The optional preceeding line break is irrelevant, since header
347     * continuation is handled transparently when parsing messages.
348     *
349     * @param ch        the character to check
350     *
351     * @return  {@code true} if the character is whitespace,
352     *          {@code false} otherwise
353     */
354    protected boolean isWhitespace(final char ch) {
355
356        // we do not use Character.isWhitspace(ch) here, since that allows
357        // many control characters which are not whitespace as per RFC 2616
358        return ((ch == '\t') || Character.isSpaceChar(ch));
359    }
360
361
362    /**
363     * Checks whether a character is a valid token character.
364     * Whitespace, control characters, and HTTP separators are not
365     * valid token characters. The HTTP specification (RFC 2616, section 2.2)
366     * defines tokens only for the US-ASCII character set, this
367     * method extends the definition to other character sets.
368     *
369     * @param ch        the character to check
370     *
371     * @return  {@code true} if the character is a valid token start,
372     *          {@code false} otherwise
373     */
374    protected boolean isTokenChar(final char ch) {
375
376        // common sense extension of ALPHA + DIGIT
377        if (Character.isLetterOrDigit(ch)) {
378            return true;
379        }
380
381        // common sense extension of CTL
382        if (Character.isISOControl(ch)) {
383            return false;
384        }
385
386        // no common sense extension for this
387        if (isHttpSeparator(ch)) {
388            return false;
389        }
390
391        // RFC 2616, section 2.2 defines a token character as
392        // "any CHAR except CTLs or separators". The controls
393        // and separators are included in the checks above.
394        // This will yield unexpected results for Unicode format characters.
395        // If that is a problem, overwrite isHttpSeparator(char) to filter
396        // out the false positives.
397        return true;
398    }
399
400
401    /**
402     * Checks whether a character is an HTTP separator.
403     * The implementation in this class checks only for the HTTP separators
404     * defined in RFC 2616, section 2.2. If you need to detect other
405     * separators beyond the US-ASCII character set, override this method.
406     *
407     * @param ch        the character to check
408     *
409     * @return  {@code true} if the character is an HTTP separator
410     */
411    protected boolean isHttpSeparator(final char ch) {
412        return (HTTP_SEPARATORS.indexOf(ch) >= 0);
413    }
414
415
416} // class BasicTokenIterator
417