1 /*
2 * Copyright (c) 2002-2025 Gargoyle Software Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 * https://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15 package org.htmlunit;
16
17 import static java.nio.charset.StandardCharsets.UTF_16BE;
18 import static java.nio.charset.StandardCharsets.UTF_16LE;
19 import static java.nio.charset.StandardCharsets.UTF_8;
20
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Serializable;
24 import java.net.URL;
25 import java.nio.charset.Charset;
26 import java.util.List;
27
28 import org.apache.commons.io.ByteOrderMark;
29 import org.apache.commons.io.IOUtils;
30 import org.apache.commons.io.input.BOMInputStream;
31 import org.apache.commons.logging.Log;
32 import org.apache.commons.logging.LogFactory;
33 import org.htmlunit.http.HttpStatus;
34 import org.htmlunit.util.EncodingSniffer;
35 import org.htmlunit.util.MimeType;
36 import org.htmlunit.util.NameValuePair;
37 import org.htmlunit.util.StringUtils;
38
39 /**
40 * A response from a web server.
41 *
42 * @author Mike Bowler
43 * @author Brad Clarke
44 * @author Noboru Sinohara
45 * @author Marc Guillemot
46 * @author Ahmed Ashour
47 * @author Ronald Brill
48 * @author Lai Quang Duong
49 */
50 public class WebResponse implements Serializable {
51
52 private static final Log LOG = LogFactory.getLog(WebResponse.class);
53 private static final ByteOrderMark[] BOM_HEADERS = {
54 ByteOrderMark.UTF_8,
55 ByteOrderMark.UTF_16LE,
56 ByteOrderMark.UTF_16BE};
57
58 private final long loadTime_;
59 private final WebResponseData responseData_;
60 private final WebRequest request_;
61 private boolean wasContentCharsetTentative_;
62 private boolean wasBlocked_;
63 private String blockReason_;
64
65 /**
66 * Constructs with all data.
67 *
68 * @param responseData Data that was send back
69 * @param url Where this response came from
70 * @param requestMethod the method used to get this response
71 * @param loadTime How long the response took to be sent
72 */
73 public WebResponse(final WebResponseData responseData, final URL url,
74 final HttpMethod requestMethod, final long loadTime) {
75 this(responseData, new WebRequest(url, requestMethod), loadTime);
76 }
77
78 /**
79 * Constructs with all data.
80 *
81 * @param responseData Data that was send back
82 * @param request the request used to get this response
83 * @param loadTime How long the response took to be sent
84 */
85 public WebResponse(final WebResponseData responseData,
86 final WebRequest request, final long loadTime) {
87 responseData_ = responseData;
88 request_ = request;
89 loadTime_ = loadTime;
90 }
91
92 /**
93 * Returns the request used to load this response.
94 * @return the request used to load this response
95 */
96 public WebRequest getWebRequest() {
97 return request_;
98 }
99
100 /**
101 * Returns the response headers as a list of {@link NameValuePair}s.
102 * @return the response headers as a list of {@link NameValuePair}s
103 */
104 public List<NameValuePair> getResponseHeaders() {
105 return responseData_.getResponseHeaders();
106 }
107
108 /**
109 * Returns the value of the specified response header.
110 * @param headerName the name of the header whose value is to be returned
111 * @return the header value, {@code null} if no response header exists with this name
112 */
113 public String getResponseHeaderValue(final String headerName) {
114 for (final NameValuePair pair : responseData_.getResponseHeaders()) {
115 if (pair.getName().equalsIgnoreCase(headerName)) {
116 return pair.getValue();
117 }
118 }
119 return null;
120 }
121
122 /**
123 * Returns the status code that was returned by the server.
124 * @return the status code that was returned by the server
125 */
126 public int getStatusCode() {
127 return responseData_.getStatusCode();
128 }
129
130 /**
131 * Returns the status message that was returned from the server.
132 * @return the status message that was returned from the server
133 */
134 public String getStatusMessage() {
135 return responseData_.getStatusMessage();
136 }
137
138 /**
139 * Returns the content type returned from the server, e.g. "text/html".
140 * @return the content type returned from the server, e.g. "text/html"
141 */
142 public String getContentType() {
143 final String contentTypeHeader = getResponseHeaderValue(HttpHeader.CONTENT_TYPE_LC);
144 if (contentTypeHeader == null) {
145 // Not technically legal but some servers don't return a content-type
146 return "";
147 }
148 final int index = contentTypeHeader.indexOf(';');
149 if (index == -1) {
150 return contentTypeHeader;
151 }
152 return contentTypeHeader.substring(0, index);
153 }
154
155 /**
156 * Returns the content charset specified explicitly in the {@code Content-Type} header
157 * or {@code null} if none was specified.
158 * @return the content charset specified header or {@code null} if none was specified
159 */
160 public Charset getHeaderContentCharset() {
161 final String contentType = getResponseHeaderValue(HttpHeader.CONTENT_TYPE_LC);
162 if (contentType == null) {
163 return null;
164 }
165
166 final int index = contentType.indexOf(';');
167 if (index == -1 || index == 0) {
168 return null;
169 }
170 if (StringUtils.isBlank(contentType.substring(0, index))) {
171 return null;
172 }
173
174 return EncodingSniffer.extractEncodingFromContentType(contentType);
175 }
176
177 /**
178 * Returns the content charset for this response, even if no charset was specified explicitly.
179 * <p>
180 * This method always returns a valid charset. This method first checks the {@code Content-Type}
181 * header or in the content BOM for viable charset. If not found, it attempts to determine the
182 * charset based on the type of the content. As a last resort, this method returns the
183 * value of {@link org.htmlunit.WebRequest#getDefaultResponseContentCharset()} which is
184 * {@link java.nio.charset.StandardCharsets#UTF_8} by default.
185 * @return the content charset for this response
186 */
187 public Charset getContentCharset() {
188 wasContentCharsetTentative_ = false;
189
190 try (InputStream is = getContentAsStreamWithBomIfApplicable()) {
191 if (is instanceof BOMInputStream) {
192 final String bomCharsetName = ((BOMInputStream) is).getBOMCharsetName();
193 if (bomCharsetName != null) {
194 return Charset.forName(bomCharsetName);
195 }
196 }
197
198 Charset charset = getHeaderContentCharset();
199 if (charset != null) {
200 return charset;
201 }
202
203 final String contentType = getContentType();
204 switch (DefaultPageCreator.determinePageType(contentType)) {
205 case HTML:
206 charset = EncodingSniffer.sniffEncodingFromMetaTag(is);
207 wasContentCharsetTentative_ = true;
208 break;
209 case XML:
210 charset = EncodingSniffer.sniffEncodingFromXmlDeclaration(is);
211 if (charset == null) {
212 charset = UTF_8;
213 }
214 break;
215 default:
216 if (MimeType.TEXT_CSS.equals(contentType)) {
217 charset = EncodingSniffer.sniffEncodingFromCssDeclaration(is);
218 }
219 break;
220 }
221
222 if (charset != null) {
223 return charset;
224 }
225 }
226 catch (final IOException e) {
227 LOG.warn("Error trying to sniff encoding.", e);
228 wasContentCharsetTentative_ = true;
229 }
230 return getWebRequest().getDefaultResponseContentCharset();
231 }
232
233 /**
234 * Returns whether the charset of the previous call to {@link #getContentCharset()} was "tentative".
235 * <p>
236 * A charset is classed as "tentative" if its detection is prone to false positive/negatives.
237 * <p>
238 * For example, HTML meta-tag sniffing can be fooled by text that looks-like-a-meta-tag inside
239 * JavaScript code (false positive) or if the meta-tag is after the first 1024 bytes (false negative).
240 * @return {@code true} if the charset of the previous call to {@link #getContentCharset()} was
241 * "tentative".
242 * @see <a href="https://html.spec.whatwg.org/multipage/parsing.html#concept-encoding-confidence">
243 * https://html.spec.whatwg.org/multipage/parsing.html#concept-encoding-confidence</a>
244 */
245 public boolean wasContentCharsetTentative() {
246 return wasContentCharsetTentative_;
247 }
248
249 /**
250 * Returns the response content as a string, using the charset/encoding specified in the server response.
251 * @return the response content as a string, using the charset/encoding specified in the server response
252 * or null if the content retrieval was failing
253 */
254 public String getContentAsString() {
255 return getContentAsString(getContentCharset());
256 }
257
258 /**
259 * Returns the response content as a string, using the specified charset,
260 * rather than the charset/encoding specified in the server response.
261 * If there is a bom header the charset parameter will be overwritten by the bom.
262 * @param encoding the charset/encoding to use to convert the response content into a string
263 * @return the response content as a string or null if the content retrieval was failing
264 */
265 public String getContentAsString(final Charset encoding) {
266 if (responseData_ != null) {
267 try (InputStream in = responseData_.getInputStreamWithBomIfApplicable(BOM_HEADERS)) {
268 if (in instanceof BOMInputStream) {
269 try (BOMInputStream bomIn = (BOMInputStream) in) {
270 // there seems to be a bug in BOMInputStream
271 // we have to call this before hasBOM(ByteOrderMark)
272 if (bomIn.hasBOM()) {
273 if (bomIn.hasBOM(ByteOrderMark.UTF_8)) {
274 return IOUtils.toString(bomIn, UTF_8);
275 }
276 if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
277 return IOUtils.toString(bomIn, UTF_16BE);
278 }
279 if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
280 return IOUtils.toString(bomIn, UTF_16LE);
281 }
282 }
283 return IOUtils.toString(bomIn, encoding);
284 }
285 }
286
287 return IOUtils.toString(in, encoding);
288 }
289 catch (final IOException e) {
290 LOG.warn(e.getMessage(), e);
291 }
292 }
293 return null;
294 }
295
296 /**
297 * Returns length of the content data.
298 * @return the length
299 */
300 public long getContentLength() {
301 if (responseData_ == null) {
302 return 0;
303 }
304 return responseData_.getContentLength();
305 }
306
307 /**
308 * Returns the response content as an input stream.
309 * @return the response content as an input stream
310 * @throws IOException in case of IOProblems
311 */
312 public InputStream getContentAsStream() throws IOException {
313 return responseData_.getInputStream();
314 }
315
316 /**
317 * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
318 *
319 * @return the associated InputStream wrapped with a bom input stream if applicable
320 * @throws IOException in case of IO problems
321 */
322 public InputStream getContentAsStreamWithBomIfApplicable() throws IOException {
323 if (responseData_ != null) {
324 return responseData_.getInputStreamWithBomIfApplicable(BOM_HEADERS);
325 }
326 return null;
327 }
328
329 /**
330 * Returns the time it took to load this web response, in milliseconds.
331 * @return the time it took to load this web response, in milliseconds
332 */
333 public long getLoadTime() {
334 return loadTime_;
335 }
336
337 /**
338 * Clean up the response data.
339 */
340 public void cleanUp() {
341 if (responseData_ != null) {
342 responseData_.cleanUp();
343 }
344 }
345
346 /**
347 * @return true if the 2xx
348 */
349 public boolean isSuccess() {
350 final int statusCode = getStatusCode();
351 return statusCode >= HttpStatus.OK_200 && statusCode < HttpStatus.MULTIPLE_CHOICES_300;
352 }
353
354 /**
355 * @return true if the 2xx or 305
356 */
357 public boolean isSuccessOrUseProxy() {
358 final int statusCode = getStatusCode();
359 return (statusCode >= HttpStatus.OK_200 && statusCode < HttpStatus.MULTIPLE_CHOICES_300)
360 || statusCode == HttpStatus.USE_PROXY_305;
361 }
362
363 /**
364 * @return true if the 2xx or 305
365 */
366 public boolean isSuccessOrUseProxyOrNotModified() {
367 final int statusCode = getStatusCode();
368 return (statusCode >= HttpStatus.OK_200 && statusCode < HttpStatus.MULTIPLE_CHOICES_300)
369 || statusCode == HttpStatus.USE_PROXY_305
370 || statusCode == HttpStatus.NOT_MODIFIED_304;
371 }
372
373 /**
374 * @return true if the request was blocked
375 */
376 public boolean wasBlocked() {
377 return wasBlocked_;
378 }
379
380 /**
381 * @return the reason for blocking or null
382 */
383 public String getBlockReason() {
384 return blockReason_;
385 }
386
387 /**
388 * Sets the wasBlocked state to true.
389 *
390 * @param blockReason the reason
391 */
392 public void markAsBlocked(final String blockReason) {
393 wasBlocked_ = true;
394 blockReason_ = blockReason;
395 }
396 }