package org.jsoup.helper;

import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.StringWriter;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
 * for integration with toolsets that use the W3C DOM.
 */
public class W3CDom {
    protected DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

    /**
     * Convert a jsoup Document to a W3C Document. The builder factory is switched to namespace-aware mode before the
     * output document is created.
     * @param in jsoup doc
     * @return w3c doc
     * @throws IllegalStateException if a {@link DocumentBuilder} cannot be created from the factory
     */
    public Document fromJsoup(org.jsoup.nodes.Document in) {
        Validate.notNull(in);
        try {
            // set the factory to be namespace-aware
            factory.setNamespaceAware(true);
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document out = builder.newDocument();
            convert(in, out);
            return out;
        } catch (ParserConfigurationException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output document
     * before converting. When the input has a non-blank location, it is copied to the output's document URI. The
     * traversal starts at the first child element of {@code in}.
     * @param in jsoup doc
     * @param out w3c doc
     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Document)
     */
    public void convert(org.jsoup.nodes.Document in, Document out) {
        if (!StringUtil.isBlank(in.location()))
            out.setDocumentURI(in.location());

        org.jsoup.nodes.Element rootEl = in.child(0); // skip the #root node
        NodeTraversor.traverse(new W3CBuilder(out), rootEl);
    }

    /**
     * Implements the conversion by walking the input. Elements, text nodes, comments and data nodes are copied; any
     * other node type is skipped.
     */
    protected static class W3CBuilder implements NodeVisitor {
        private static final String xmlnsKey = "xmlns";
        private static final String xmlnsPrefix = "xmlns:";

        // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
        // Compiled once; the previous String.replaceAll / String.matches calls recompiled per attribute.
        private static final Pattern attrNameInvalidChars = Pattern.compile("[^-a-zA-Z0-9_:.]");
        private static final Pattern attrNameValid = Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");

        private final Document doc;
        // One map (prefix => urn) per open element, innermost on top. Each element starts with a copy of its
        // parent's declarations, so a declaration made on one element is not visible to its siblings.
        private final Deque<Map<String, String>> namespacesStack = new ArrayDeque<>();
        private Element dest;

        /**
         * Creates a builder that appends converted nodes to the supplied output document.
         * @param doc the W3C document to populate
         */
        public W3CBuilder(Document doc) {
            this.doc = doc;
            Map<String, String> rootScope = new HashMap<String, String>();
            namespacesStack.push(rootScope); // base scope, inherited by the first element visited
        }

        /**
         * Called when a node is first visited. Creates the matching W3C node and appends it to the current
         * destination element; for elements, the new element then becomes the destination.
         * @param source the jsoup node being visited
         * @param depth depth of the node in the traversal (unused)
         */
        @Override
        public void head(org.jsoup.nodes.Node source, int depth) {
            if (source instanceof org.jsoup.nodes.Element) {
                org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;

                // open a new scope that inherits everything declared on the ancestors
                Map<String, String> scope = new HashMap<String, String>(namespacesStack.peek());
                namespacesStack.push(scope);

                String prefix = updateNamespaces(sourceEl);
                String namespace = scope.get(prefix);
                String tagName = sourceEl.tagName();

                // A prefixed name with a null namespace URI is a NAMESPACE_ERR per the DOM spec, so a tag such as
                // <fb:like> with no matching xmlns:fb declaration would abort the conversion.
                // NOTE(review): passing "" relies on the DOM implementation accepting an empty namespace URI for a
                // prefixed name; if it does not, the outcome is the same exception as before.
                Element el = (namespace == null && tagName.contains(":"))
                    ? doc.createElementNS("", tagName)
                    : doc.createElementNS(namespace, tagName);
                copyAttributes(sourceEl, el);
                if (dest == null) { // sets up the root
                    doc.appendChild(el);
                } else {
                    dest.appendChild(el);
                }
                dest = el; // descend
            } else if (source instanceof org.jsoup.nodes.TextNode) {
                org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
                Text text = doc.createTextNode(sourceText.getWholeText());
                dest.appendChild(text);
            } else if (source instanceof org.jsoup.nodes.Comment) {
                org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
                Comment comment = doc.createComment(sourceComment.getData());
                dest.appendChild(comment);
            } else if (source instanceof org.jsoup.nodes.DataNode) {
                org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
                Text node = doc.createTextNode(sourceData.getWholeData());
                dest.appendChild(node);
            } else {
                // unhandled
            }
        }

        /**
         * Called when a node and all of its children have been visited. For elements, moves the destination back
         * up to the parent element (if there is one) and closes the element's namespace scope.
         * @param source the jsoup node being left
         * @param depth depth of the node in the traversal (unused)
         */
        @Override
        public void tail(org.jsoup.nodes.Node source, int depth) {
            if (source instanceof org.jsoup.nodes.Element) {
                if (dest.getParentNode() instanceof Element) {
                    dest = (Element) dest.getParentNode(); // undescend. cromulent.
                }
                namespacesStack.pop(); // matches the push made in head() for this element
            }
        }

        /**
         * Copies the source node's attributes onto the W3C element. Characters outside {@code [-a-zA-Z0-9_:.]} are
         * stripped from each key; the attribute is skipped if what remains is not a valid XML attribute name.
         */
        private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
            for (Attribute attribute : source.attributes()) {
                String key = attrNameInvalidChars.matcher(attribute.getKey()).replaceAll("");
                if (attrNameValid.matcher(key).matches())
                    el.setAttribute(key, attribute.getValue());
            }
        }

        /**
         * Finds any namespaces defined in this element and records them in the current (innermost) scope.
         * Returns any tag prefix, or an empty string when the tag name has none.
         */
        private String updateNamespaces(org.jsoup.nodes.Element el) {
            Map<String, String> scope = namespacesStack.peek();

            // scan the element for namespace declarations
            // like: xmlns="blah" or xmlns:prefix="blah"
            Attributes attributes = el.attributes();
            for (Attribute attr : attributes) {
                String key = attr.getKey();
                String prefix;
                if (key.equals(xmlnsKey)) {
                    prefix = "";
                } else if (key.startsWith(xmlnsPrefix)) {
                    prefix = key.substring(xmlnsPrefix.length());
                } else {
                    continue;
                }
                scope.put(prefix, attr.getValue());
            }

            // get the element prefix if any
            String tagName = el.tagName();
            int pos = tagName.indexOf(":");
            return pos > 0 ? tagName.substring(0, pos) : "";
        }

    }

    /**
     * Serialize a W3C document to a String, using a default {@link Transformer}.
     * @param doc Document
     * @return Document as string
     * @throws IllegalStateException if the transformer cannot be created or the transformation fails
     */
    public String asString(Document doc) {
        try {
            DOMSource domSource = new DOMSource(doc);
            StringWriter writer = new StringWriter();
            StreamResult result = new StreamResult(writer);
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer transformer = tf.newTransformer();
            transformer.transform(domSource, result);
            return writer.toString();
        } catch (TransformerException e) {
            throw new IllegalStateException(e);
        }
    }
}