/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
017
018package org.apache.commons.codec.binary;
019
020/**
021 * Provides Base32 encoding and decoding as defined by <a href="http://www.ietf.org/rfc/rfc4648.txt">RFC 4648</a>.
022 *
023 * <p>
024 * The class can be parameterized in the following manner with various constructors:
025 * <ul>
026 * <li>Whether to use the "base32hex" variant instead of the default "base32"</li>
027 * <li>Line length: Default 76. Line length that aren't multiples of 8 will still essentially end up being multiples of
028 * 8 in the encoded data.
029 * <li>Line separator: Default is CRLF ("\r\n")</li>
030 * </ul>
031 * </p>
032 * <p>
033 * This class operates directly on byte streams, and not character streams.
034 * </p>
035 * <p>
036 * This class is thread-safe.
037 * </p>
038 *
039 * @see <a href="http://www.ietf.org/rfc/rfc4648.txt">RFC 4648</a>
040 *
041 * @since 1.5
042 * @version $Id: Base32.java 1488493 2013-06-01 08:43:58Z sebb $
043 */
044public class Base32 extends BaseNCodec {
045
046    /**
047     * BASE32 characters are 5 bits in length.
048     * They are formed by taking a block of five octets to form a 40-bit string,
049     * which is converted into eight BASE32 characters.
050     */
051    private static final int BITS_PER_ENCODED_BYTE = 5;
052    private static final int BYTES_PER_ENCODED_BLOCK = 8;
053    private static final int BYTES_PER_UNENCODED_BLOCK = 5;
054
055    /**
056     * Chunk separator per RFC 2045 section 2.1.
057     *
058     * @see <a href="http://www.ietf.org/rfc/rfc2045.txt">RFC 2045 section 2.1</a>
059     */
060    private static final byte[] CHUNK_SEPARATOR = {'\r', '\n'};
061
062    /**
063     * This array is a lookup table that translates Unicode characters drawn from the "Base32 Alphabet" (as specified
064     * in Table 3 of RFC 4648) into their 5-bit positive integer equivalents. Characters that are not in the Base32
065     * alphabet but fall within the bounds of the array are translated to -1.
066     */
067    private static final byte[] DECODE_TABLE = {
068         //  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
069            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f
070            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f
071            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 20-2f
072            -1, -1, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, // 30-3f 2-7
073            -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, // 40-4f A-N
074            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,                     // 50-5a O-Z
075    };
076
077    /**
078     * This array is a lookup table that translates 5-bit positive integer index values into their "Base32 Alphabet"
079     * equivalents as specified in Table 3 of RFC 4648.
080     */
081    private static final byte[] ENCODE_TABLE = {
082            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
083            'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
084            '2', '3', '4', '5', '6', '7',
085    };
086
087    /**
088     * This array is a lookup table that translates Unicode characters drawn from the "Base32 |Hex Alphabet" (as
089     * specified in Table 3 of RFC 4648) into their 5-bit positive integer equivalents. Characters that are not in the
090     * Base32 Hex alphabet but fall within the bounds of the array are translated to -1.
091     */
092    private static final byte[] HEX_DECODE_TABLE = {
093         //  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
094            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f
095            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f
096            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 20-2f
097             0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1, // 30-3f 2-7
098            -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, // 40-4f A-N
099            25, 26, 27, 28, 29, 30, 31, 32,                                 // 50-57 O-V
100    };
101
102    /**
103     * This array is a lookup table that translates 5-bit positive integer index values into their
104     * "Base32 Hex Alphabet" equivalents as specified in Table 3 of RFC 4648.
105     */
106    private static final byte[] HEX_ENCODE_TABLE = {
107            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
108            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
109            'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
110    };
111
112    /** Mask used to extract 5 bits, used when encoding Base32 bytes */
113    private static final int MASK_5BITS = 0x1f;
114
115    // The static final fields above are used for the original static byte[] methods on Base32.
116    // The private member fields below are used with the new streaming approach, which requires
117    // some state be preserved between calls of encode() and decode().
118
119    /**
120     * Place holder for the bytes we're dealing with for our based logic.
121     * Bitwise operations store and extract the encoding or decoding from this variable.
122     */
123
124    /**
125     * Convenience variable to help us determine when our buffer is going to run out of room and needs resizing.
126     * <code>decodeSize = {@link #BYTES_PER_ENCODED_BLOCK} - 1 + lineSeparator.length;</code>
127     */
128    private final int decodeSize;
129
130    /**
131     * Decode table to use.
132     */
133    private final byte[] decodeTable;
134
135    /**
136     * Convenience variable to help us determine when our buffer is going to run out of room and needs resizing.
137     * <code>encodeSize = {@link #BYTES_PER_ENCODED_BLOCK} + lineSeparator.length;</code>
138     */
139    private final int encodeSize;
140
141    /**
142     * Encode table to use.
143     */
144    private final byte[] encodeTable;
145
146    /**
147     * Line separator for encoding. Not used when decoding. Only used if lineLength > 0.
148     */
149    private final byte[] lineSeparator;
150
151    /**
152     * Creates a Base32 codec used for decoding and encoding.
153     * <p>
154     * When encoding the line length is 0 (no chunking).
155     * </p>
156     *
157     */
158    public Base32() {
159        this(false);
160    }
161
162    /**
163     * Creates a Base32 codec used for decoding and encoding.
164     * <p>
165     * When encoding the line length is 0 (no chunking).
166     * </p>
167     * @param useHex if {@code true} then use Base32 Hex alphabet
168     */
169    public Base32(final boolean useHex) {
170        this(0, null, useHex);
171    }
172
173    /**
174     * Creates a Base32 codec used for decoding and encoding.
175     * <p>
176     * When encoding the line length is given in the constructor, the line separator is CRLF.
177     * </p>
178     *
179     * @param lineLength
180     *            Each line of encoded data will be at most of the given length (rounded down to nearest multiple of
181     *            8). If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when
182     *            decoding.
183     */
184    public Base32(final int lineLength) {
185        this(lineLength, CHUNK_SEPARATOR);
186    }
187
188    /**
189     * Creates a Base32 codec used for decoding and encoding.
190     * <p>
191     * When encoding the line length and line separator are given in the constructor.
192     * </p>
193     * <p>
194     * Line lengths that aren't multiples of 8 will still essentially end up being multiples of 8 in the encoded data.
195     * </p>
196     *
197     * @param lineLength
198     *            Each line of encoded data will be at most of the given length (rounded down to nearest multiple of
199     *            8). If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when
200     *            decoding.
201     * @param lineSeparator
202     *            Each line of encoded data will end with this sequence of bytes.
203     * @throws IllegalArgumentException
204     *             The provided lineSeparator included some Base32 characters. That's not going to work!
205     */
206    public Base32(final int lineLength, final byte[] lineSeparator) {
207        this(lineLength, lineSeparator, false);
208    }
209
210    /**
211     * Creates a Base32 / Base32 Hex codec used for decoding and encoding.
212     * <p>
213     * When encoding the line length and line separator are given in the constructor.
214     * </p>
215     * <p>
216     * Line lengths that aren't multiples of 8 will still essentially end up being multiples of 8 in the encoded data.
217     * </p>
218     *
219     * @param lineLength
220     *            Each line of encoded data will be at most of the given length (rounded down to nearest multiple of
221     *            8). If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when
222     *            decoding.
223     * @param lineSeparator
224     *            Each line of encoded data will end with this sequence of bytes.
225     * @param useHex
226     *            if {@code true}, then use Base32 Hex alphabet, otherwise use Base32 alphabet
227     * @throws IllegalArgumentException
228     *             The provided lineSeparator included some Base32 characters. That's not going to work! Or the
229     *             lineLength > 0 and lineSeparator is null.
230     */
231    public Base32(final int lineLength, final byte[] lineSeparator, final boolean useHex) {
232        super(BYTES_PER_UNENCODED_BLOCK, BYTES_PER_ENCODED_BLOCK,
233                lineLength,
234                lineSeparator == null ? 0 : lineSeparator.length);
235        if (useHex){
236            this.encodeTable = HEX_ENCODE_TABLE;
237            this.decodeTable = HEX_DECODE_TABLE;
238        } else {
239            this.encodeTable = ENCODE_TABLE;
240            this.decodeTable = DECODE_TABLE;
241        }
242        if (lineLength > 0) {
243            if (lineSeparator == null) {
244                throw new IllegalArgumentException("lineLength "+lineLength+" > 0, but lineSeparator is null");
245            }
246            // Must be done after initializing the tables
247            if (containsAlphabetOrPad(lineSeparator)) {
248                final String sep = StringUtils.newStringUtf8(lineSeparator);
249                throw new IllegalArgumentException("lineSeparator must not contain Base32 characters: [" + sep + "]");
250            }
251            this.encodeSize = BYTES_PER_ENCODED_BLOCK + lineSeparator.length;
252            this.lineSeparator = new byte[lineSeparator.length];
253            System.arraycopy(lineSeparator, 0, this.lineSeparator, 0, lineSeparator.length);
254        } else {
255            this.encodeSize = BYTES_PER_ENCODED_BLOCK;
256            this.lineSeparator = null;
257        }
258        this.decodeSize = this.encodeSize - 1;
259    }
260
261    /**
262     * <p>
263     * Decodes all of the provided data, starting at inPos, for inAvail bytes. Should be called at least twice: once
264     * with the data to decode, and once with inAvail set to "-1" to alert decoder that EOF has been reached. The "-1"
265     * call is not necessary when decoding, but it doesn't hurt, either.
266     * </p>
267     * <p>
268     * Ignores all non-Base32 characters. This is how chunked (e.g. 76 character) data is handled, since CR and LF are
269     * silently ignored, but has implications for other bytes, too. This method subscribes to the garbage-in,
270     * garbage-out philosophy: it will not check the provided data for validity.
271     * </p>
272     *
273     * @param in
274     *            byte[] array of ascii data to Base32 decode.
275     * @param inPos
276     *            Position to start reading data from.
277     * @param inAvail
278     *            Amount of bytes available from input for encoding.
279     * @param context the context to be used
280     *
281     * Output is written to {@link Context#buffer} as 8-bit octets, using {@link Context#pos} as the buffer position
282     */
283    @Override
284    void decode(final byte[] in, int inPos, final int inAvail, final Context context) {
285        // package protected for access from I/O streams
286
287        if (context.eof) {
288            return;
289        }
290        if (inAvail < 0) {
291            context.eof = true;
292        }
293        for (int i = 0; i < inAvail; i++) {
294            final byte b = in[inPos++];
295            if (b == PAD) {
296                // We're done.
297                context.eof = true;
298                break;
299            } else {
300                final byte[] buffer = ensureBufferSize(decodeSize, context);
301                if (b >= 0 && b < this.decodeTable.length) {
302                    final int result = this.decodeTable[b];
303                    if (result >= 0) {
304                        context.modulus = (context.modulus+1) % BYTES_PER_ENCODED_BLOCK;
305                        // collect decoded bytes
306                        context.lbitWorkArea = (context.lbitWorkArea << BITS_PER_ENCODED_BYTE) + result;
307                        if (context.modulus == 0) { // we can output the 5 bytes
308                            buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 32) & MASK_8BITS);
309                            buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 24) & MASK_8BITS);
310                            buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 16) & MASK_8BITS);
311                            buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
312                            buffer[context.pos++] = (byte) (context.lbitWorkArea & MASK_8BITS);
313                        }
314                    }
315                }
316            }
317        }
318
319        // Two forms of EOF as far as Base32 decoder is concerned: actual
320        // EOF (-1) and first time '=' character is encountered in stream.
321        // This approach makes the '=' padding characters completely optional.
322        if (context.eof && context.modulus >= 2) { // if modulus < 2, nothing to do
323            final byte[] buffer = ensureBufferSize(decodeSize, context);
324
325            //  we ignore partial bytes, i.e. only multiples of 8 count
326            switch (context.modulus) {
327                case 2 : // 10 bits, drop 2 and output one byte
328                    buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 2) & MASK_8BITS);
329                    break;
330                case 3 : // 15 bits, drop 7 and output 1 byte
331                    buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 7) & MASK_8BITS);
332                    break;
333                case 4 : // 20 bits = 2*8 + 4
334                    context.lbitWorkArea = context.lbitWorkArea >> 4; // drop 4 bits
335                    buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
336                    buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
337                    break;
338                case 5 : // 25bits = 3*8 + 1
339                    context.lbitWorkArea = context.lbitWorkArea >> 1;
340                    buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 16) & MASK_8BITS);
341                    buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
342                    buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
343                    break;
344                case 6 : // 30bits = 3*8 + 6
345                    context.lbitWorkArea = context.lbitWorkArea >> 6;
346                    buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 16) & MASK_8BITS);
347                    buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
348                    buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
349                    break;
350                case 7 : // 35 = 4*8 +3
351                    context.lbitWorkArea = context.lbitWorkArea >> 3;
352                    buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 24) & MASK_8BITS);
353                    buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 16) & MASK_8BITS);
354                    buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
355                    buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
356                    break;
357                default:
358                    // modulus can be 0-7, and we excluded 0,1 already
359                    throw new IllegalStateException("Impossible modulus "+context.modulus);
360            }
361        }
362    }
363
364    /**
365     * <p>
366     * Encodes all of the provided data, starting at inPos, for inAvail bytes. Must be called at least twice: once with
367     * the data to encode, and once with inAvail set to "-1" to alert encoder that EOF has been reached, so flush last
368     * remaining bytes (if not multiple of 5).
369     * </p>
370     *
371     * @param in
372     *            byte[] array of binary data to Base32 encode.
373     * @param inPos
374     *            Position to start reading data from.
375     * @param inAvail
376     *            Amount of bytes available from input for encoding.
377     * @param context the context to be used
378     */
379    @Override
380    void encode(final byte[] in, int inPos, final int inAvail, final Context context) {
381        // package protected for access from I/O streams
382
383        if (context.eof) {
384            return;
385        }
386        // inAvail < 0 is how we're informed of EOF in the underlying data we're
387        // encoding.
388        if (inAvail < 0) {
389            context.eof = true;
390            if (0 == context.modulus && lineLength == 0) {
391                return; // no leftovers to process and not using chunking
392            }
393            final byte[] buffer = ensureBufferSize(encodeSize, context);
394            final int savedPos = context.pos;
395            switch (context.modulus) { // % 5
396                case 0 :
397                    break;
398                case 1 : // Only 1 octet; take top 5 bits then remainder
399                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 3) & MASK_5BITS]; // 8-1*5 = 3
400                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea << 2) & MASK_5BITS]; // 5-3=2
401                    buffer[context.pos++] = PAD;
402                    buffer[context.pos++] = PAD;
403                    buffer[context.pos++] = PAD;
404                    buffer[context.pos++] = PAD;
405                    buffer[context.pos++] = PAD;
406                    buffer[context.pos++] = PAD;
407                    break;
408                case 2 : // 2 octets = 16 bits to use
409                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 11) & MASK_5BITS]; // 16-1*5 = 11
410                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >>  6) & MASK_5BITS]; // 16-2*5 = 6
411                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >>  1) & MASK_5BITS]; // 16-3*5 = 1
412                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea <<  4) & MASK_5BITS]; // 5-1 = 4
413                    buffer[context.pos++] = PAD;
414                    buffer[context.pos++] = PAD;
415                    buffer[context.pos++] = PAD;
416                    buffer[context.pos++] = PAD;
417                    break;
418                case 3 : // 3 octets = 24 bits to use
419                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 19) & MASK_5BITS]; // 24-1*5 = 19
420                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 14) & MASK_5BITS]; // 24-2*5 = 14
421                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >>  9) & MASK_5BITS]; // 24-3*5 = 9
422                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >>  4) & MASK_5BITS]; // 24-4*5 = 4
423                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea <<  1) & MASK_5BITS]; // 5-4 = 1
424                    buffer[context.pos++] = PAD;
425                    buffer[context.pos++] = PAD;
426                    buffer[context.pos++] = PAD;
427                    break;
428                case 4 : // 4 octets = 32 bits to use
429                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 27) & MASK_5BITS]; // 32-1*5 = 27
430                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 22) & MASK_5BITS]; // 32-2*5 = 22
431                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 17) & MASK_5BITS]; // 32-3*5 = 17
432                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 12) & MASK_5BITS]; // 32-4*5 = 12
433                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >>  7) & MASK_5BITS]; // 32-5*5 =  7
434                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >>  2) & MASK_5BITS]; // 32-6*5 =  2
435                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea <<  3) & MASK_5BITS]; // 5-2 = 3
436                    buffer[context.pos++] = PAD;
437                    break;
438                default:
439                    throw new IllegalStateException("Impossible modulus "+context.modulus);
440            }
441            context.currentLinePos += context.pos - savedPos; // keep track of current line position
442            // if currentPos == 0 we are at the start of a line, so don't add CRLF
443            if (lineLength > 0 && context.currentLinePos > 0){ // add chunk separator if required
444                System.arraycopy(lineSeparator, 0, buffer, context.pos, lineSeparator.length);
445                context.pos += lineSeparator.length;
446            }
447        } else {
448            for (int i = 0; i < inAvail; i++) {
449                final byte[] buffer = ensureBufferSize(encodeSize, context);
450                context.modulus = (context.modulus+1) % BYTES_PER_UNENCODED_BLOCK;
451                int b = in[inPos++];
452                if (b < 0) {
453                    b += 256;
454                }
455                context.lbitWorkArea = (context.lbitWorkArea << 8) + b; // BITS_PER_BYTE
456                if (0 == context.modulus) { // we have enough bytes to create our output
457                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 35) & MASK_5BITS];
458                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 30) & MASK_5BITS];
459                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 25) & MASK_5BITS];
460                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 20) & MASK_5BITS];
461                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 15) & MASK_5BITS];
462                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 10) & MASK_5BITS];
463                    buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 5) & MASK_5BITS];
464                    buffer[context.pos++] = encodeTable[(int)context.lbitWorkArea & MASK_5BITS];
465                    context.currentLinePos += BYTES_PER_ENCODED_BLOCK;
466                    if (lineLength > 0 && lineLength <= context.currentLinePos) {
467                        System.arraycopy(lineSeparator, 0, buffer, context.pos, lineSeparator.length);
468                        context.pos += lineSeparator.length;
469                        context.currentLinePos = 0;
470                    }
471                }
472            }
473        }
474    }
475
476    /**
477     * Returns whether or not the <code>octet</code> is in the Base32 alphabet.
478     *
479     * @param octet
480     *            The value to test
481     * @return {@code true} if the value is defined in the the Base32 alphabet {@code false} otherwise.
482     */
483    @Override
484    public boolean isInAlphabet(final byte octet) {
485        return octet >= 0 && octet < decodeTable.length && decodeTable[octet] != -1;
486    }
487}