1   
2   
3   
4   
5   
6   
7   
8   
9   
10  
11  
12  
13  
14  
15  package org.htmlunit.html.parser.neko;
16  
17  import java.io.IOException;
18  import java.io.InputStream;
19  import java.io.StringReader;
20  import java.lang.reflect.InvocationTargetException;
21  import java.net.URL;
22  import java.nio.charset.Charset;
23  import java.util.ArrayList;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.concurrent.ConcurrentHashMap;
27  
28  import org.htmlunit.ObjectInstantiationException;
29  import org.htmlunit.Page;
30  import org.htmlunit.SgmlPage;
31  import org.htmlunit.WebAssert;
32  import org.htmlunit.WebClient;
33  import org.htmlunit.WebResponse;
34  import org.htmlunit.cyberneko.HTMLScanner;
35  import org.htmlunit.cyberneko.HTMLTagBalancer;
36  import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
37  import org.htmlunit.cyberneko.xerces.xni.QName;
38  import org.htmlunit.cyberneko.xerces.xni.XNIException;
39  import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
40  import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
41  import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
42  import org.htmlunit.html.DefaultElementFactory;
43  import org.htmlunit.html.DomNode;
44  import org.htmlunit.html.ElementFactory;
45  import org.htmlunit.html.Html;
46  import org.htmlunit.html.HtmlPage;
47  import org.htmlunit.html.UnknownElementFactory;
48  import org.htmlunit.html.parser.HTMLParser;
49  import org.htmlunit.html.parser.HTMLParserListener;
50  import org.htmlunit.svg.SvgElementFactory;
51  import org.htmlunit.util.StringUtils;
52  import org.w3c.dom.Node;
53  import org.xml.sax.SAXException;
54  
55  
56  
57  
58  
59  
60  
61  
62  
63  
64  
65  
66  
67  
68  
69  
70  public final class HtmlUnitNekoHtmlParser implements HTMLParser {
71  
72      
73  
74  
75      public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
76  
77      private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new ConcurrentHashMap<>();
78  
79      static {
80          final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
81          for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
82              ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
83          }
84      }
85  
86      
87  
88  
89      @Override
90      public void parseFragment(final WebClient webClient, final DomNode parent, final DomNode context,
91              final String source, final boolean createdByJavascript)
92          throws SAXException, IOException {
93          final Page page = parent.getPage();
94          if (!(page instanceof HtmlPage)) {
95              return;
96          }
97          final HtmlPage htmlPage = (HtmlPage) page;
98          final URL url = htmlPage.getUrl();
99  
100         final HtmlUnitNekoDOMBuilder domBuilder =
101                 new HtmlUnitNekoDOMBuilder(this, parent, url, source, createdByJavascript);
102         domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
103         
104         DomNode node = context;
105         final List<QName> ancestors = new ArrayList<>();
106         while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
107             ancestors.add(0, new QName(null, node.getNodeName(), null, null));
108             node = node.getParentNode();
109         }
110         if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
111             ancestors.add(new QName(null, "html", null, null));
112             ancestors.add(new QName(null, "body", null, null));
113         }
114         else if (ancestors.size() == 1
115                 || (!"body".equals(ancestors.get(1).getLocalpart())
116                         && !"head".equals(ancestors.get(1).getLocalpart()))) {
117             ancestors.add(new QName(null, "body", null, null));
118         }
119 
120         domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
121         domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
122 
123         final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
124 
125         htmlPage.registerParsingStart();
126         htmlPage.registerSnippetParsingStart();
127         try {
128             domBuilder.parse(in);
129         }
130         finally {
131             htmlPage.registerParsingEnd();
132             htmlPage.registerSnippetParsingEnd();
133         }
134     }
135 
136     
137 
138 
139     @Override
140     public void parse(final WebClient webClient, final WebResponse webResponse, final HtmlPage page,
141             final boolean xhtml, final boolean createdByJavascript) throws IOException {
142         final URL url = webResponse.getWebRequest().getUrl();
143         final HtmlUnitNekoDOMBuilder domBuilder =
144                 new HtmlUnitNekoDOMBuilder(this, page, url, null, createdByJavascript);
145 
146         final Charset charset = webResponse.getContentCharset();
147         try {
148             if (!webResponse.wasContentCharsetTentative()) {
149                 
150                 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
151             }
152 
153             
154             if (xhtml) {
155                 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
156                 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
157                 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
158                 domBuilder.setFeature(HTMLScanner.CDATA_SECTIONS, true);
159                 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
160             }
161 
162             if (webClient != null) {
163                 final int bufferSize = webClient.getOptions().getNekoReaderBufferSize();
164                 if (bufferSize > 0) {
165                     domBuilder.setProperty(HTMLScanner.READER_BUFFER_SIZE, bufferSize);
166                 }
167             }
168         }
169         catch (final Exception e) {
170             throw new ObjectInstantiationException("Error setting HTML parser feature", e);
171         }
172 
173         try (InputStream content = webResponse.getContentAsStream()) {
174             final String encoding = charset.name();
175             final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
176 
177             page.registerParsingStart();
178             try {
179                 domBuilder.parse(in);
180             }
181             catch (final XNIException e) {
182                 
183                 final Throwable origin = extractNestedException(e);
184                 throw new RuntimeException("Failed parsing content from " + url, origin);
185             }
186         }
187         finally {
188             page.registerParsingEnd();
189         }
190     }
191 
192     
193 
194 
195 
196 
197 
198 
199     static Throwable extractNestedException(final Throwable e) {
200         Throwable originalException = e;
201         Throwable cause = ((XNIException) e).getException();
202         while (cause != null) {
203             originalException = cause;
204             if (cause instanceof XNIException) {
205                 cause = ((XNIException) cause).getException();
206             }
207             else if (cause instanceof InvocationTargetException) {
208                 cause = cause.getCause();
209             }
210             else {
211                 cause = null;
212             }
213         }
214         return originalException;
215     }
216 
217     
218 
219 
220     @Override
221     public ElementFactory getSvgFactory() {
222         return SVG_FACTORY;
223     }
224 
225     
226 
227 
228     @Override
229     public ElementFactory getFactory(final String tagName) {
230         final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
231 
232         if (result != null) {
233             return result;
234         }
235         return UnknownElementFactory.INSTANCE;
236     }
237 
238     
239 
240 
241 
242 
243 
244 
245 
246 
247 
248 
249     @Override
250     public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
251             final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
252         if (insideSvg) {
253             return SVG_FACTORY;
254         }
255 
256         if (namespaceURI == null || namespaceURI.isEmpty()
257             || Html.XHTML_NAMESPACE.equals(namespaceURI)
258             || Html.SVG_NAMESPACE.equals(namespaceURI)
259             || !qualifiedName.contains(":")) {
260 
261             String tagName = qualifiedName;
262             final int index = tagName.indexOf(':');
263             if (index == -1) {
264                 tagName = StringUtils.toRootLowerCase(tagName);
265             }
266             else {
267                 tagName = tagName.substring(index + 1);
268             }
269             final ElementFactory factory;
270             if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
271                 factory = SVG_FACTORY;
272             }
273             else {
274                 factory = ELEMENT_FACTORIES.get(tagName);
275             }
276 
277             if (factory != null) {
278                 return factory;
279             }
280         }
281         return UnknownElementFactory.INSTANCE;
282     }
283 }
284 
285 
286 
287 
288 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
289     private final HTMLParserListener listener_;
290     private final URL url_;
291     private final String html_;
292 
293     HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
294         WebAssert.notNull("listener", listener);
295         WebAssert.notNull("url", url);
296         listener_ = listener;
297         url_ = url;
298         html_ = htmlContent;
299     }
300 
301     
302 
303 
304     @Override
305     public void error(final String domain, final String key,
306             final XMLParseException exception) throws XNIException {
307         listener_.error(exception.getMessage(),
308                 url_,
309                 html_,
310                 exception.getLineNumber(),
311                 exception.getColumnNumber(),
312                 key);
313     }
314 
315     
316 
317 
318     @Override
319     public void warning(final String domain, final String key,
320             final XMLParseException exception) throws XNIException {
321         listener_.warning(exception.getMessage(),
322                 url_,
323                 html_,
324                 exception.getLineNumber(),
325                 exception.getColumnNumber(),
326                 key);
327     }
328 
329     @Override
330     public void fatalError(final String domain, final String key,
331             final XMLParseException exception) throws XNIException {
332         listener_.error(exception.getMessage(),
333                 url_,
334                 html_,
335                 exception.getLineNumber(),
336                 exception.getColumnNumber(),
337                 key);
338     }
339 }