001package org.jsoup.nodes;
002
003import org.jsoup.helper.StringUtil;
004import org.jsoup.helper.Validate;
005import org.jsoup.parser.ParseSettings;
006import org.jsoup.parser.Tag;
007import org.jsoup.select.Elements;
008
009import java.nio.charset.Charset;
010import java.nio.charset.CharsetEncoder;
011import java.util.ArrayList;
012import java.util.List;
013
014/**
015 A HTML Document.
016
017 @author Jonathan Hedley, jonathan@hedley.net */
018public class Document extends Element {
019    private OutputSettings outputSettings = new OutputSettings();
020    private QuirksMode quirksMode = QuirksMode.noQuirks;
021    private String location;
022    private boolean updateMetaCharset = false;
023
024    /**
025     Create a new, empty Document.
026     @param baseUri base URI of document
027     @see org.jsoup.Jsoup#parse
028     @see #createShell
029     */
030    public Document(String baseUri) {
031        super(Tag.valueOf("#root", ParseSettings.htmlDefault), baseUri);
032        this.location = baseUri;
033    }
034
035    /**
036     Create a valid, empty shell of a document, suitable for adding more elements to.
037     @param baseUri baseUri of document
038     @return document with html, head, and body elements.
039     */
040    public static Document createShell(String baseUri) {
041        Validate.notNull(baseUri);
042
043        Document doc = new Document(baseUri);
044        Element html = doc.appendElement("html");
045        html.appendElement("head");
046        html.appendElement("body");
047
048        return doc;
049    }
050
051    /**
052     * Get the URL this Document was parsed from. If the starting URL is a redirect,
053     * this will return the final URL from which the document was served from.
054     * @return location
055     */
056    public String location() {
057     return location;
058    }
059    
060    /**
061     Accessor to the document's {@code head} element.
062     @return {@code head}
063     */
064    public Element head() {
065        return findFirstElementByTagName("head", this);
066    }
067
068    /**
069     Accessor to the document's {@code body} element.
070     @return {@code body}
071     */
072    public Element body() {
073        return findFirstElementByTagName("body", this);
074    }
075
076    /**
077     Get the string contents of the document's {@code title} element.
078     @return Trimmed title, or empty string if none set.
079     */
080    public String title() {
081        // title is a preserve whitespace tag (for document output), but normalised here
082        Element titleEl = getElementsByTag("title").first();
083        return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : "";
084    }
085
086    /**
087     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
088     not present
089     @param title string to set as title
090     */
091    public void title(String title) {
092        Validate.notNull(title);
093        Element titleEl = getElementsByTag("title").first();
094        if (titleEl == null) { // add to head
095            head().appendElement("title").text(title);
096        } else {
097            titleEl.text(title);
098        }
099    }
100
101    /**
102     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
103     @param tagName element tag name (e.g. {@code a})
104     @return new element
105     */
106    public Element createElement(String tagName) {
107        return new Element(Tag.valueOf(tagName, ParseSettings.preserveCase), this.baseUri());
108    }
109
110    /**
111     Normalise the document. This happens after the parse phase so generally does not need to be called.
112     Moves any text content that is not in the body element into the body.
113     @return this document after normalisation
114     */
115    public Document normalise() {
116        Element htmlEl = findFirstElementByTagName("html", this);
117        if (htmlEl == null)
118            htmlEl = appendElement("html");
119        if (head() == null)
120            htmlEl.prependElement("head");
121        if (body() == null)
122            htmlEl.appendElement("body");
123
124        // pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care
125        // of. do in inverse order to maintain text order.
126        normaliseTextNodes(head());
127        normaliseTextNodes(htmlEl);
128        normaliseTextNodes(this);
129
130        normaliseStructure("head", htmlEl);
131        normaliseStructure("body", htmlEl);
132        
133        ensureMetaCharsetElement();
134        
135        return this;
136    }
137
138    // does not recurse.
139    private void normaliseTextNodes(Element element) {
140        List<Node> toMove = new ArrayList<>();
141        for (Node node: element.childNodes) {
142            if (node instanceof TextNode) {
143                TextNode tn = (TextNode) node;
144                if (!tn.isBlank())
145                    toMove.add(tn);
146            }
147        }
148
149        for (int i = toMove.size()-1; i >= 0; i--) {
150            Node node = toMove.get(i);
151            element.removeChild(node);
152            body().prependChild(new TextNode(" "));
153            body().prependChild(node);
154        }
155    }
156
157    // merge multiple <head> or <body> contents into one, delete the remainder, and ensure they are owned by <html>
158    private void normaliseStructure(String tag, Element htmlEl) {
159        Elements elements = this.getElementsByTag(tag);
160        Element master = elements.first(); // will always be available as created above if not existent
161        if (elements.size() > 1) { // dupes, move contents to master
162            List<Node> toMove = new ArrayList<>();
163            for (int i = 1; i < elements.size(); i++) {
164                Node dupe = elements.get(i);
165                toMove.addAll(dupe.ensureChildNodes());
166                dupe.remove();
167            }
168
169            for (Node dupe : toMove)
170                master.appendChild(dupe);
171        }
172        // ensure parented by <html>
173        if (!master.parent().equals(htmlEl)) {
174            htmlEl.appendChild(master); // includes remove()            
175        }
176    }
177
178    // fast method to get first by tag name, used for html, head, body finders
179    private Element findFirstElementByTagName(String tag, Node node) {
180        if (node.nodeName().equals(tag))
181            return (Element) node;
182        else {
183            int size = node.childNodeSize();
184            for (int i = 0; i < size; i++) {
185                Element found = findFirstElementByTagName(tag, node.childNode(i));
186                if (found != null)
187                    return found;
188            }
189        }
190        return null;
191    }
192
193    @Override
194    public String outerHtml() {
195        return super.html(); // no outer wrapper tag
196    }
197
198    /**
199     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
200     @param text unencoded text
201     @return this document
202     */
203    @Override
204    public Element text(String text) {
205        body().text(text); // overridden to not nuke doc structure
206        return this;
207    }
208
209    @Override
210    public String nodeName() {
211        return "#document";
212    }
213    
214    /**
215     * Sets the charset used in this document. This method is equivalent
216     * to {@link OutputSettings#charset(java.nio.charset.Charset)
217     * OutputSettings.charset(Charset)} but in addition it updates the
218     * charset / encoding element within the document.
219     * 
220     * <p>This enables
221     * {@link #updateMetaCharsetElement(boolean) meta charset update}.</p>
222     * 
223     * <p>If there's no element with charset / encoding information yet it will
224     * be created. Obsolete charset / encoding definitions are removed!</p>
225     * 
226     * <p><b>Elements used:</b></p>
227     * 
228     * <ul>
229     * <li><b>Html:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
230     * <li><b>Xml:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
231     * </ul>
232     * 
233     * @param charset Charset
234     * 
235     * @see #updateMetaCharsetElement(boolean) 
236     * @see OutputSettings#charset(java.nio.charset.Charset) 
237     */
238    public void charset(Charset charset) {
239        updateMetaCharsetElement(true);
240        outputSettings.charset(charset);
241        ensureMetaCharsetElement();
242    }
243    
244    /**
245     * Returns the charset used in this document. This method is equivalent
246     * to {@link OutputSettings#charset()}.
247     * 
248     * @return Current Charset
249     * 
250     * @see OutputSettings#charset() 
251     */
252    public Charset charset() {
253        return outputSettings.charset();
254    }
255    
256    /**
257     * Sets whether the element with charset information in this document is
258     * updated on changes through {@link #charset(java.nio.charset.Charset)
259     * Document.charset(Charset)} or not.
260     * 
261     * <p>If set to <tt>false</tt> <i>(default)</i> there are no elements
262     * modified.</p>
263     * 
264     * @param update If <tt>true</tt> the element updated on charset
265     * changes, <tt>false</tt> if not
266     * 
267     * @see #charset(java.nio.charset.Charset) 
268     */
269    public void updateMetaCharsetElement(boolean update) {
270        this.updateMetaCharset = update;
271    }
272    
273    /**
274     * Returns whether the element with charset information in this document is
275     * updated on changes through {@link #charset(java.nio.charset.Charset)
276     * Document.charset(Charset)} or not.
277     * 
278     * @return Returns <tt>true</tt> if the element is updated on charset
279     * changes, <tt>false</tt> if not
280     */
281    public boolean updateMetaCharsetElement() {
282        return updateMetaCharset;
283    }
284
285    @Override
286    public Document clone() {
287        Document clone = (Document) super.clone();
288        clone.outputSettings = this.outputSettings.clone();
289        return clone;
290    }
291    
292    /**
293     * Ensures a meta charset (html) or xml declaration (xml) with the current
294     * encoding used. This only applies with
295     * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to
296     * <tt>true</tt>, otherwise this method does nothing.
297     * 
298     * <ul>
299     * <li>An existing element gets updated with the current charset</li>
300     * <li>If there's no element yet it will be inserted</li>
301     * <li>Obsolete elements are removed</li>
302     * </ul>
303     * 
304     * <p><b>Elements used:</b></p>
305     * 
306     * <ul>
307     * <li><b>Html:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
308     * <li><b>Xml:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
309     * </ul>
310     */
311    private void ensureMetaCharsetElement() {
312        if (updateMetaCharset) {
313            OutputSettings.Syntax syntax = outputSettings().syntax();
314
315            if (syntax == OutputSettings.Syntax.html) {
316                Element metaCharset = select("meta[charset]").first();
317
318                if (metaCharset != null) {
319                    metaCharset.attr("charset", charset().displayName());
320                } else {
321                    Element head = head();
322
323                    if (head != null) {
324                        head.appendElement("meta").attr("charset", charset().displayName());
325                    }
326                }
327
328                // Remove obsolete elements
329                select("meta[name=charset]").remove();
330            } else if (syntax == OutputSettings.Syntax.xml) {
331                Node node = childNodes().get(0);
332
333                if (node instanceof XmlDeclaration) {
334                    XmlDeclaration decl = (XmlDeclaration) node;
335
336                    if (decl.name().equals("xml")) {
337                        decl.attr("encoding", charset().displayName());
338
339                        final String version = decl.attr("version");
340
341                        if (version != null) {
342                            decl.attr("version", "1.0");
343                        }
344                    } else {
345                        decl = new XmlDeclaration("xml", false);
346                        decl.attr("version", "1.0");
347                        decl.attr("encoding", charset().displayName());
348
349                        prependChild(decl);
350                    }
351                } else {
352                    XmlDeclaration decl = new XmlDeclaration("xml", false);
353                    decl.attr("version", "1.0");
354                    decl.attr("encoding", charset().displayName());
355
356                    prependChild(decl);
357                }
358            }
359        }
360    }
361    
362
363    /**
364     * A Document's output settings control the form of the text() and html() methods.
365     */
366    public static class OutputSettings implements Cloneable {
367        /**
368         * The output serialization syntax.
369         */
370        public enum Syntax {html, xml}
371
372        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
373        private Charset charset;
374        CharsetEncoder encoder; // initialized by start of OuterHtmlVisitor and cleared at end
375        Entities.CoreCharset coreCharset; // fast encoders for ascii and utf8
376
377        private boolean prettyPrint = true;
378        private boolean outline = false;
379        private int indentAmount = 1;
380        private Syntax syntax = Syntax.html;
381
382        public OutputSettings() {
383            charset(Charset.forName("UTF8"));
384        }
385        
386        /**
387         * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
388         * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
389         * which uses the complete set of HTML named entities.
390         * <p>
391         * The default escape mode is <code>base</code>.
392         * @return the document's current escape mode
393         */
394        public Entities.EscapeMode escapeMode() {
395            return escapeMode;
396        }
397
398        /**
399         * Set the document's escape mode, which determines how characters are escaped when the output character set
400         * does not support a given character:- using either a named or a numbered escape.
401         * @param escapeMode the new escape mode to use
402         * @return the document's output settings, for chaining
403         */
404        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
405            this.escapeMode = escapeMode;
406            return this;
407        }
408
409        /**
410         * Get the document's current output charset, which is used to control which characters are escaped when
411         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
412         * <p>
413         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
414         * input charset. Otherwise, it defaults to UTF-8.
415         * @return the document's current charset.
416         */
417        public Charset charset() {
418            return charset;
419        }
420
421        /**
422         * Update the document's output charset.
423         * @param charset the new charset to use.
424         * @return the document's output settings, for chaining
425         */
426        public OutputSettings charset(Charset charset) {
427            this.charset = charset;
428            return this;
429        }
430
431        /**
432         * Update the document's output charset.
433         * @param charset the new charset (by name) to use.
434         * @return the document's output settings, for chaining
435         */
436        public OutputSettings charset(String charset) {
437            charset(Charset.forName(charset));
438            return this;
439        }
440
441        CharsetEncoder prepareEncoder() {
442            encoder = charset.newEncoder(); // created at start of OuterHtmlVisitor so each pass has own encoder, so OutputSettings can be shared among threads
443            coreCharset = Entities.CoreCharset.byName(encoder.charset().name());
444            return encoder;
445        }
446
447        /**
448         * Get the document's current output syntax.
449         * @return current syntax
450         */
451        public Syntax syntax() {
452            return syntax;
453        }
454
455        /**
456         * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
457         * {@code xml}, with self-closing tags.
458         * @param syntax serialization syntax
459         * @return the document's output settings, for chaining
460         */
461        public OutputSettings syntax(Syntax syntax) {
462            this.syntax = syntax;
463            return this;
464        }
465
466        /**
467         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
468         * the output, and the output will generally look like the input.
469         * @return if pretty printing is enabled.
470         */
471        public boolean prettyPrint() {
472            return prettyPrint;
473        }
474
475        /**
476         * Enable or disable pretty printing.
477         * @param pretty new pretty print setting
478         * @return this, for chaining
479         */
480        public OutputSettings prettyPrint(boolean pretty) {
481            prettyPrint = pretty;
482            return this;
483        }
484        
485        /**
486         * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
487         * all tags as block.
488         * @return if outline mode is enabled.
489         */
490        public boolean outline() {
491            return outline;
492        }
493        
494        /**
495         * Enable or disable HTML outline mode.
496         * @param outlineMode new outline setting
497         * @return this, for chaining
498         */
499        public OutputSettings outline(boolean outlineMode) {
500            outline = outlineMode;
501            return this;
502        }
503
504        /**
505         * Get the current tag indent amount, used when pretty printing.
506         * @return the current indent amount
507         */
508        public int indentAmount() {
509            return indentAmount;
510        }
511
512        /**
513         * Set the indent amount for pretty printing
514         * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
515         * @return this, for chaining
516         */
517        public OutputSettings indentAmount(int indentAmount) {
518            Validate.isTrue(indentAmount >= 0);
519            this.indentAmount = indentAmount;
520            return this;
521        }
522
523        @Override
524        public OutputSettings clone() {
525            OutputSettings clone;
526            try {
527                clone = (OutputSettings) super.clone();
528            } catch (CloneNotSupportedException e) {
529                throw new RuntimeException(e);
530            }
531            clone.charset(charset.name()); // new charset and charset encoder
532            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
533            // indentAmount, prettyPrint are primitives so object.clone() will handle
534            return clone;
535        }
536    }
537
538    /**
539     * Get the document's current output settings.
540     * @return the document's current output settings.
541     */
542    public OutputSettings outputSettings() {
543        return outputSettings;
544    }
545
546    /**
547     * Set the document's output settings.
548     * @param outputSettings new output settings.
549     * @return this document, for chaining.
550     */
551    public Document outputSettings(OutputSettings outputSettings) {
552        Validate.notNull(outputSettings);
553        this.outputSettings = outputSettings;
554        return this;
555    }
556
557    public enum QuirksMode {
558        noQuirks, quirks, limitedQuirks
559    }
560
561    public QuirksMode quirksMode() {
562        return quirksMode;
563    }
564
565    public Document quirksMode(QuirksMode quirksMode) {
566        this.quirksMode = quirksMode;
567        return this;
568    }
569}