001package org.jsoup.helper;
002
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.StringWriter;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
022
023/**
024 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
025 * for integration with toolsets that use the W3C DOM.
026 */
027public class W3CDom {
028    protected DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
029
030    /**
031     * Convert a jsoup Document to a W3C Document.
032     * @param in jsoup doc
033     * @return w3c doc
034     */
035    public Document fromJsoup(org.jsoup.nodes.Document in) {
036        Validate.notNull(in);
037        DocumentBuilder builder;
038        try {
039                //set the factory to be namespace-aware
040                factory.setNamespaceAware(true);
041            builder = factory.newDocumentBuilder();
042            Document out = builder.newDocument();
043            convert(in, out);
044            return out;
045        } catch (ParserConfigurationException e) {
046            throw new IllegalStateException(e);
047        }
048    }
049
050    /**
051     * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output document
052     * before converting.
053     * @param in jsoup doc
054     * @param out w3c doc
055     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Document)
056     */
057    public void convert(org.jsoup.nodes.Document in, Document out) {
058        if (!StringUtil.isBlank(in.location()))
059            out.setDocumentURI(in.location());
060
061        org.jsoup.nodes.Element rootEl = in.child(0); // skip the #root node
062        NodeTraversor.traverse(new W3CBuilder(out), rootEl);
063    }
064
065    /**
066     * Implements the conversion by walking the input.
067     */
068    protected static class W3CBuilder implements NodeVisitor {
069        private static final String xmlnsKey = "xmlns";
070        private static final String xmlnsPrefix = "xmlns:";
071
072        private final Document doc;
073        private final HashMap<String, String> namespaces = new HashMap<>(); // prefix => urn
074        private Element dest;
075
076        public W3CBuilder(Document doc) {
077            this.doc = doc;
078        }
079
080        public void head(org.jsoup.nodes.Node source, int depth) {
081            if (source instanceof org.jsoup.nodes.Element) {
082                org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
083
084                String prefix = updateNamespaces(sourceEl);
085                String namespace = namespaces.get(prefix);
086
087                Element el = doc.createElementNS(namespace, sourceEl.tagName());
088                copyAttributes(sourceEl, el);
089                if (dest == null) { // sets up the root
090                    doc.appendChild(el);
091                } else {
092                    dest.appendChild(el);
093                }
094                dest = el; // descend
095            } else if (source instanceof org.jsoup.nodes.TextNode) {
096                org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
097                Text text = doc.createTextNode(sourceText.getWholeText());
098                dest.appendChild(text);
099            } else if (source instanceof org.jsoup.nodes.Comment) {
100                org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
101                Comment comment = doc.createComment(sourceComment.getData());
102                dest.appendChild(comment);
103            } else if (source instanceof org.jsoup.nodes.DataNode) {
104                org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
105                Text node = doc.createTextNode(sourceData.getWholeData());
106                dest.appendChild(node);
107            } else {
108                // unhandled
109            }
110        }
111
112        public void tail(org.jsoup.nodes.Node source, int depth) {
113            if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) {
114                dest = (Element) dest.getParentNode(); // undescend. cromulent.
115            }
116        }
117
118        private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
119            for (Attribute attribute : source.attributes()) {
120                // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
121                String key = attribute.getKey().replaceAll("[^-a-zA-Z0-9_:.]", "");
122                if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*"))
123                    el.setAttribute(key, attribute.getValue());
124            }
125        }
126
127        /**
128         * Finds any namespaces defined in this element. Returns any tag prefix.
129         */
130        private String updateNamespaces(org.jsoup.nodes.Element el) {
131            // scan the element for namespace declarations
132            // like: xmlns="blah" or xmlns:prefix="blah"
133            Attributes attributes = el.attributes();
134            for (Attribute attr : attributes) {
135                String key = attr.getKey();
136                String prefix;
137                if (key.equals(xmlnsKey)) {
138                    prefix = "";
139                } else if (key.startsWith(xmlnsPrefix)) {
140                    prefix = key.substring(xmlnsPrefix.length());
141                } else {
142                    continue;
143                }
144                namespaces.put(prefix, attr.getValue());
145            }
146
147            // get the element prefix if any
148            int pos = el.tagName().indexOf(":");
149            return pos > 0 ? el.tagName().substring(0, pos) : "";
150        }
151
152    }
153
154    /**
155     * Serialize a W3C document to a String.
156     * @param doc Document
157     * @return Document as string
158     */
159    public String asString(Document doc) {
160        try {
161            DOMSource domSource = new DOMSource(doc);
162            StringWriter writer = new StringWriter();
163            StreamResult result = new StreamResult(writer);
164            TransformerFactory tf = TransformerFactory.newInstance();
165            Transformer transformer = tf.newTransformer();
166            transformer.transform(domSource, result);
167            return writer.toString();
168        } catch (TransformerException e) {
169            throw new IllegalStateException(e);
170        }
171    }
172}