001/*
002 * $Id: XMLUtil.java 4784 2011-03-15 08:33:00Z blowagie $
003 *
004 * This file is part of the iText (R) project.
005 * Copyright (c) 1998-2011 1T3XT BVBA
006 * Authors: Bruno Lowagie, Paulo Soares, Balder Van Camp, et al.
007 *
008 * This program is free software; you can redistribute it and/or modify
009 * it under the terms of the GNU Affero General Public License version 3
010 * as published by the Free Software Foundation with the addition of the
011 * following permission added to Section 15 as permitted in Section 7(a):
012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
014 *
015 * This program is distributed in the hope that it will be useful, but
016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
017 * or FITNESS FOR A PARTICULAR PURPOSE.
018 * See the GNU Affero General Public License for more details.
019 * You should have received a copy of the GNU Affero General Public License
020 * along with this program; if not, see http://www.gnu.org/licenses or write to
021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
022 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
023 * http://itextpdf.com/terms-of-use/
024 *
025 * The interactive user interfaces in modified source and object code versions
026 * of this program must display Appropriate Legal Notices, as required under
027 * Section 5 of the GNU Affero General Public License.
028 *
029 * In accordance with Section 7(b) of the GNU Affero General Public License,
030 * a covered work must retain the producer line in every PDF that is created
031 * or manipulated using iText.
032 *
033 * You can be released from the requirements of the license by purchasing
034 * a commercial license. Buying such a license is mandatory as soon as you
035 * develop commercial activities involving the iText software without
036 * disclosing the source code of your own applications.
037 * These activities include: offering paid services to customers as an ASP,
038 * serving PDFs on the fly in a web application, shipping iText with a closed
039 * source product.
040 *
041 * For more information, please contact iText Software Corp. at this
042 * address: sales@itextpdf.com
043 */
044package com.itextpdf.text.xml;
045
/**
 * Contains utility methods for XML.
 * @author Balder
 * @since 5.0.6
 *
 */
public class XMLUtil {

    /**
     * Escapes a string with the appropriate XML codes.
     * <p>
     * The five predefined XML entities (<CODE>&lt;</CODE>, <CODE>&gt;</CODE>,
     * <CODE>&amp;</CODE>, <CODE>&quot;</CODE> and <CODE>'</CODE>) are replaced by
     * their entity references. Characters that are not allowed in an XML 1.0
     * document (most control characters, unpaired surrogates, U+FFFE and U+FFFF)
     * are dropped from the output. The string is processed per Unicode code
     * point, so characters outside the Basic Multilingual Plane (encoded as a
     * surrogate pair in the Java string) are kept rather than discarded.
     *
     * @param s the string to be escaped
     * @param onlyASCII codes above 127 will always be escaped with &amp;#nn; if <CODE>true</CODE>
     * @return the escaped string
     * @throws NullPointerException if <CODE>s</CODE> is <CODE>null</CODE>
     * @since 5.0.6
     */
    public static String escapeXML(final String s, final boolean onlyASCII) {
        final int len = s.length();
        final StringBuilder sb = new StringBuilder(len);
        int k = 0;
        while (k < len) {
            // Read a full code point so that a valid surrogate pair is seen as
            // one supplementary character instead of two illegal halves.
            final int c = s.codePointAt(k);
            k += Character.charCount(c);
            switch (c) {
                case '<':
                    sb.append("&lt;");
                    break;
                case '>':
                    sb.append("&gt;");
                    break;
                case '&':
                    sb.append("&amp;");
                    break;
                case '"':
                    sb.append("&quot;");
                    break;
                case '\'':
                    sb.append("&apos;");
                    break;
                default:
                    // Anything that is not a legal XML character is silently skipped.
                    if (isValidXMLCodePoint(c)) {
                        if (onlyASCII && c > 127) {
                            // c is an int, so this appends the decimal code point.
                            sb.append("&#").append(c).append(';');
                        } else {
                            sb.appendCodePoint(c);
                        }
                    }
            }
        }
        return sb.toString();
    }

    /**
     * Tells whether a code point lies in one of the ranges accepted by
     * {@link #escapeXML(String, boolean)}: tab, line feed, carriage return,
     * 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
     *
     * @param c the Unicode code point to test
     * @return <CODE>true</CODE> if the code point may be written to the output
     */
    private static boolean isValidXMLCodePoint(final int c) {
        return c == 0x9 || c == 0xA || c == 0xD
                || c >= 0x20 && c <= 0xD7FF
                || c >= 0xE000 && c <= 0xFFFD
                || c >= 0x10000 && c <= 0x10FFFF;
    }

    /**
     * Returns the IANA encoding name that is auto-detected from
     * the bytes specified, with the endian-ness of that encoding where appropriate.
     * (method found in org.apache.xerces.impl.XMLEntityManager, originally published
     * by the Apache Software Foundation under the Apache Software License; now being
     * used in iText under the MPL)
     * <p>
     * Only the first four bytes are inspected. If fewer bytes are supplied, the
     * checks that need the missing bytes are skipped and the default encoding
     * (<CODE>"UTF-8"</CODE>) is returned when nothing matched before that point.
     *
     * @param b4    The first four bytes of the input.
     * @return an IANA-encoding string
     * @throws NullPointerException if <CODE>b4</CODE> is <CODE>null</CODE>
     * @since 5.0.6
     */
    public static String getEncodingName(final byte[] b4) {
        final int count = b4.length;

        // Not enough bytes for any signature: fall back to the default.
        if (count < 2) {
            return "UTF-8";
        }

        // UTF-16, with BOM
        final int b0 = b4[0] & 0xFF;
        final int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return "UTF-16BE";
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return "UTF-16LE";
        }

        // The UTF-8 BOM needs three bytes.
        if (count < 3) {
            return "UTF-8";
        }

        // UTF-8 with a BOM
        final int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return "UTF-8";
        }

        // All remaining signatures need four bytes.
        if (count < 4) {
            return "UTF-8";
        }

        // other encodings
        final int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            // REVISIT: What should this be?
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octet order (3412)
            // REVISIT: What should this be?
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...
            // REVISIT: What should this be?
            return "UTF-16BE";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...
            return "UTF-16LE";
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return "CP037";
        }

        // default encoding
        return "UTF-8";
    }
}