1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.parser.neko;
16
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.StringReader;
20 import java.net.URL;
21 import java.nio.charset.Charset;
22 import java.util.ArrayList;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.concurrent.ConcurrentHashMap;
26
27 import org.htmlunit.ObjectInstantiationException;
28 import org.htmlunit.Page;
29 import org.htmlunit.SgmlPage;
30 import org.htmlunit.WebAssert;
31 import org.htmlunit.WebClient;
32 import org.htmlunit.WebResponse;
33 import org.htmlunit.cyberneko.HTMLScanner;
34 import org.htmlunit.cyberneko.HTMLTagBalancer;
35 import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
36 import org.htmlunit.cyberneko.xerces.xni.QName;
37 import org.htmlunit.cyberneko.xerces.xni.XNIException;
38 import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
39 import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
40 import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
41 import org.htmlunit.html.DefaultElementFactory;
42 import org.htmlunit.html.DomNode;
43 import org.htmlunit.html.ElementFactory;
44 import org.htmlunit.html.Html;
45 import org.htmlunit.html.HtmlPage;
46 import org.htmlunit.html.UnknownElementFactory;
47 import org.htmlunit.html.parser.HTMLParser;
48 import org.htmlunit.html.parser.HTMLParserListener;
49 import org.htmlunit.svg.SvgElementFactory;
50 import org.htmlunit.util.StringUtils;
51 import org.w3c.dom.Node;
52 import org.xml.sax.SAXException;
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69 public final class HtmlUnitNekoHtmlParser implements HTMLParser {
70
71
72
73
74 public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
75
76 private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new ConcurrentHashMap<>();
77
78 static {
79 final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
80 for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
81 ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
82 }
83 }
84
85
86
87
88 @Override
89 public void parseFragment(final WebClient webClient, final DomNode parent, final DomNode context,
90 final String source, final boolean createdByJavascript)
91 throws SAXException, IOException {
92 final Page page = parent.getPage();
93 if (!(page instanceof HtmlPage)) {
94 return;
95 }
96 final HtmlPage htmlPage = (HtmlPage) page;
97 final URL url = htmlPage.getUrl();
98
99 final HtmlUnitNekoDOMBuilder domBuilder =
100 new HtmlUnitNekoDOMBuilder(this, parent, url, source, createdByJavascript);
101 domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
102
103 DomNode node = context;
104 final List<QName> ancestors = new ArrayList<>();
105 while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
106 ancestors.add(0, new QName(null, node.getNodeName(), null, null));
107 node = node.getParentNode();
108 }
109 if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
110 ancestors.add(new QName(null, "html", null, null));
111 ancestors.add(new QName(null, "body", null, null));
112 }
113 else if (ancestors.size() == 1
114 || (!"body".equals(ancestors.get(1).getLocalpart())
115 && !"head".equals(ancestors.get(1).getLocalpart()))) {
116 ancestors.add(new QName(null, "body", null, null));
117 }
118
119 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
120 domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
121
122 final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
123
124 htmlPage.registerParsingStart();
125 htmlPage.registerSnippetParsingStart();
126 try {
127 domBuilder.parse(in);
128 }
129 finally {
130 htmlPage.registerParsingEnd();
131 htmlPage.registerSnippetParsingEnd();
132 }
133 }
134
135
136
137
138 @Override
139 public void parse(final WebClient webClient, final WebResponse webResponse, final HtmlPage page,
140 final boolean xhtml, final boolean createdByJavascript) throws IOException {
141 final URL url = webResponse.getWebRequest().getUrl();
142 final HtmlUnitNekoDOMBuilder domBuilder =
143 new HtmlUnitNekoDOMBuilder(this, page, url, null, createdByJavascript);
144
145 final Charset charset = webResponse.getContentCharset();
146 try {
147 if (!webResponse.wasContentCharsetTentative()) {
148
149 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
150 }
151
152
153 if (xhtml) {
154 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
155 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
156 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
157 domBuilder.setFeature(HTMLScanner.CDATA_SECTIONS, true);
158 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
159 }
160
161 if (webClient != null) {
162 final int bufferSize = webClient.getOptions().getNekoReaderBufferSize();
163 if (bufferSize > 0) {
164 domBuilder.setProperty(HTMLScanner.READER_BUFFER_SIZE, bufferSize);
165 }
166 }
167 }
168 catch (final Exception e) {
169 throw new ObjectInstantiationException("Error setting HTML parser feature", e);
170 }
171
172 try (InputStream content = webResponse.getContentAsStream()) {
173 final String encoding = charset.name();
174 final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
175
176 page.registerParsingStart();
177 try {
178 domBuilder.parse(in);
179 }
180 catch (final XNIException e) {
181
182 final Throwable origin = extractNestedException(e);
183 throw new RuntimeException("Failed parsing content from " + url, origin);
184 }
185 }
186 finally {
187 page.registerParsingEnd();
188 }
189 }
190
191
192
193
194
195
196
197 static Throwable extractNestedException(final Throwable e) {
198 Throwable originalException;
199 Throwable cause = e;
200 do {
201 originalException = cause;
202
203 if (cause instanceof XNIException) {
204 cause = cause.getCause();
205 }
206 else {
207 cause = null;
208 }
209 }
210 while (cause != null);
211
212 return originalException;
213 }
214
215
216
217
218 @Override
219 public ElementFactory getSvgFactory() {
220 return SVG_FACTORY;
221 }
222
223
224
225
226 @Override
227 public ElementFactory getFactory(final String tagName) {
228 final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
229
230 if (result != null) {
231 return result;
232 }
233 return UnknownElementFactory.INSTANCE;
234 }
235
236
237
238
239
240
241
242
243
244
245
246
247 @Override
248 public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
249 final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
250 if (insideSvg) {
251 return SVG_FACTORY;
252 }
253
254 if (namespaceURI == null || namespaceURI.isEmpty()
255 || Html.XHTML_NAMESPACE.equals(namespaceURI)
256 || Html.SVG_NAMESPACE.equals(namespaceURI)
257 || !qualifiedName.contains(":")) {
258
259 String tagName = qualifiedName;
260 final int index = tagName.indexOf(':');
261 if (index == -1) {
262 tagName = StringUtils.toRootLowerCase(tagName);
263 }
264 else {
265 tagName = tagName.substring(index + 1);
266 }
267 final ElementFactory factory;
268 if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
269 factory = SVG_FACTORY;
270 }
271 else {
272 factory = ELEMENT_FACTORIES.get(tagName);
273 }
274
275 if (factory != null) {
276 return factory;
277 }
278 }
279 return UnknownElementFactory.INSTANCE;
280 }
281 }
282
283
284
285
286 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
287 private final HTMLParserListener listener_;
288 private final URL url_;
289 private final String html_;
290
291 HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
292 WebAssert.notNull("listener", listener);
293 WebAssert.notNull("url", url);
294 listener_ = listener;
295 url_ = url;
296 html_ = htmlContent;
297 }
298
299
300
301
302 @Override
303 public void error(final String domain, final String key,
304 final XMLParseException exception) throws XNIException {
305 listener_.error(exception.getMessage(),
306 url_,
307 html_,
308 exception.getLineNumber(),
309 exception.getColumnNumber(),
310 key);
311 }
312
313
314
315
316 @Override
317 public void warning(final String domain, final String key,
318 final XMLParseException exception) throws XNIException {
319 listener_.warning(exception.getMessage(),
320 url_,
321 html_,
322 exception.getLineNumber(),
323 exception.getColumnNumber(),
324 key);
325 }
326
327 @Override
328 public void fatalError(final String domain, final String key,
329 final XMLParseException exception) throws XNIException {
330 listener_.error(exception.getMessage(),
331 url_,
332 html_,
333 exception.getLineNumber(),
334 exception.getColumnNumber(),
335 key);
336 }
337 }