001/* 002 * $Id: XMLUtil.java 4784 2011-03-15 08:33:00Z blowagie $ 003 * 004 * This file is part of the iText (R) project. 005 * Copyright (c) 1998-2011 1T3XT BVBA 006 * Authors: Bruno Lowagie, Paulo Soares, Balder Van Camp, et al. 007 * 008 * This program is free software; you can redistribute it and/or modify 009 * it under the terms of the GNU Affero General Public License version 3 010 * as published by the Free Software Foundation with the addition of the 011 * following permission added to Section 15 as permitted in Section 7(a): 012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT, 013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS. 014 * 015 * This program is distributed in the hope that it will be useful, but 016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 017 * or FITNESS FOR A PARTICULAR PURPOSE. 018 * See the GNU Affero General Public License for more details. 019 * You should have received a copy of the GNU Affero General Public License 020 * along with this program; if not, see http://www.gnu.org/licenses or write to 021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 022 * Boston, MA, 02110-1301 USA, or download the license from the following URL: 023 * http://itextpdf.com/terms-of-use/ 024 * 025 * The interactive user interfaces in modified source and object code versions 026 * of this program must display Appropriate Legal Notices, as required under 027 * Section 5 of the GNU Affero General Public License. 028 * 029 * In accordance with Section 7(b) of the GNU Affero General Public License, 030 * a covered work must retain the producer line in every PDF that is created 031 * or manipulated using iText. 032 * 033 * You can be released from the requirements of the license by purchasing 034 * a commercial license. Buying such a license is mandatory as soon as you 035 * develop commercial activities involving the iText software without 036 * disclosing the source code of your own applications. 037 * These activities include: offering paid services to customers as an ASP, 038 * serving PDFs on the fly in a web application, shipping iText with a closed 039 * source product. 040 * 041 * For more information, please contact iText Software Corp. at this 042 * address: sales@itextpdf.com 043 */ 044package com.itextpdf.text.xml; 045 046/** 047 * Contains utility methods for XML. 048 * @author Balder 049 * @since 5.0.6 050 * 051 */ 052public class XMLUtil { 053 054 /** 055 * Escapes a string with the appropriated XML codes. 056 * @param s the string to be escaped 057 * @param onlyASCII codes above 127 will always be escaped with &#nn; if <CODE>true</CODE> 058 * @return the escaped string 059 * @since 5.0.6 060 */ 061 public static String escapeXML(final String s, final boolean onlyASCII) { 062 char cc[] = s.toCharArray(); 063 int len = cc.length; 064 StringBuffer sb = new StringBuffer(); 065 for (int k = 0; k < len; ++k) { 066 int c = cc[k]; 067 switch (c) { 068 case '<': 069 sb.append("<"); 070 break; 071 case '>': 072 sb.append(">"); 073 break; 074 case '&': 075 sb.append("&"); 076 break; 077 case '"': 078 sb.append("""); 079 break; 080 case '\'': 081 sb.append("'"); 082 break; 083 default: 084 if (c == 0x9 || c == 0xA || c == 0xD 085 || c >= 0x20 && c <= 0xD7FF 086 || c >= 0xE000 && c <= 0xFFFD 087 || c >= 0x10000 && c <= 0x10FFFF) { 088 if (onlyASCII && c > 127) 089 sb.append("&#").append(c).append(';'); 090 else 091 sb.append((char)c); 092 } 093 } 094 } 095 return sb.toString(); 096 } 097 098 /** 099 * Returns the IANA encoding name that is auto-detected from 100 * the bytes specified, with the endian-ness of that encoding where appropriate. 101 * (method found in org.apache.xerces.impl.XMLEntityManager, originally published 102 * by the Apache Software Foundation under the Apache Software License; now being 103 * used in iText under the MPL) 104 * @param b4 The first four bytes of the input. 105 * @return an IANA-encoding string 106 * @since 5.0.6 107 */ 108 public static String getEncodingName(final byte[] b4) { 109 110 // UTF-16, with BOM 111 int b0 = b4[0] & 0xFF; 112 int b1 = b4[1] & 0xFF; 113 if (b0 == 0xFE && b1 == 0xFF) { 114 // UTF-16, big-endian 115 return "UTF-16BE"; 116 } 117 if (b0 == 0xFF && b1 == 0xFE) { 118 // UTF-16, little-endian 119 return "UTF-16LE"; 120 } 121 122 // UTF-8 with a BOM 123 int b2 = b4[2] & 0xFF; 124 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { 125 return "UTF-8"; 126 } 127 128 // other encodings 129 int b3 = b4[3] & 0xFF; 130 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { 131 // UCS-4, big endian (1234) 132 return "ISO-10646-UCS-4"; 133 } 134 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { 135 // UCS-4, little endian (4321) 136 return "ISO-10646-UCS-4"; 137 } 138 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { 139 // UCS-4, unusual octet order (2143) 140 // REVISIT: What should this be? 141 return "ISO-10646-UCS-4"; 142 } 143 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { 144 // UCS-4, unusual octet order (3412) 145 // REVISIT: What should this be? 146 return "ISO-10646-UCS-4"; 147 } 148 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 149 // UTF-16, big-endian, no BOM 150 // (or could turn out to be UCS-2... 151 // REVISIT: What should this be? 152 return "UTF-16BE"; 153 } 154 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 155 // UTF-16, little-endian, no BOM 156 // (or could turn out to be UCS-2... 157 return "UTF-16LE"; 158 } 159 if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { 160 // EBCDIC 161 // a la xerces1, return CP037 instead of EBCDIC here 162 return "CP037"; 163 } 164 165 // default encoding 166 return "UTF-8"; 167 } 168}