1 files changed, 43 insertions, 4 deletions
diff --git a/background/stream_filter.js b/background/stream_filter.js
index 2dce811..96b6132 100644
--- a/background/stream_filter.js
+++ b/background/stream_filter.js
@@ -12,6 +12,7 @@
 /*
  * IMPORTS_START
  * IMPORT browser
+ * IMPORT is_csp_header_name
  * IMPORTS_END
  */
 
@@ -110,6 +111,35 @@ function create_decoder(properties, data)
     return new TextDecoder(charset || "latin1");
 }
 
+function may_define_csp_rules(html)
+{
+    const doc = new DOMParser().parseFromString(html, "text/html");
+    /* Purpose: tell if this HTML chunk has, or may yet get, a CSP `<meta>'. */
+    for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) {
+	if (is_csp_header_name(meta.getAttribute("http-equiv"), true) &&
+	    meta.content)
+	    return true;
+    }
+
+    /*
+     * Even if no naughty `<meta>' tags were found, a subsequent chunk of HTML
+     * data could add some. Before we return `false' we need to be sure we
+     * reached the start of `<body>' where `<meta>' tags are no longer valid.
+     */
+    /* Nodes after `<html>'/`<body>' or 2+ body children: body was reached. */
+    if (doc.documentElement.nextSibling || doc.body.nextSibling ||
+	doc.body.childNodes.length > 1)
+	return false;
+    /* Empty `<body>': the parser saw no body content yet, stay cautious. */
+    if (!doc.body.firstChild)
+	return true;
+    /* A non-text first child means real body content has begun. */
+    if (doc.body.firstChild.nodeName !== "#text")
+	return false;
+    /* Lone `</', `&#' or 1-char text is presumably a cut-off token: unsure. */
+    return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText);
+}
+
 function filter_data(properties, event)
 {
     const data = new Uint8Array(event.data);
@@ -118,13 +148,15 @@ function filter_data(properties, event)
 	first_chunk = true;
 	properties.decoder = create_decoder(properties, data);
 	properties.encoder = new TextEncoder();
-	/* Force UTF-8, this is the only encoding we can produce. */
-	properties.filter.write(new Uint8Array(UTF8_BOM));
     }
 
     let decoded = properties.decoder.decode(data);
 
-    if (first_chunk) {
+    /* Force UTF-8, this is the only encoding we can produce. */
+    if (first_chunk)
+	properties.filter.write(new Uint8Array(UTF8_BOM));
+
+    if (first_chunk && may_define_csp_rules(decoded)) {
 	/*
 	 * HAX! Our content scripts that execute at `document_start' will always
 	 * run before the first script in the document, but under Mozilla some
@@ -133,7 +165,14 @@ function filter_data(properties, event)
 	 * will force `document_start' to happen earlier. This way our content
 	 * scripts will be able to sanitize `http-equiv' tags with CSP rules
 	 * that would otherwise stop our injected scripts from executing.
+	 *
+	 * As we want to only process HTML files that happen to have naughty
+	 * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
+	 * `may_define_csp_rules()'. We don't do any additional MIME sniffing
+	 * as it is too unreliable (and our heuristic will likely mark non-HTML
+	 * files as harmless anyway).
 	 */
+
 	const dummy_script =
 	      `<script data-hachette-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
 	const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
@@ -149,7 +188,7 @@ function filter_data(properties, event)
 
 function apply_stream_filter(details, headers, policy)
 {
-    if (policy.allow)
+    if (!policy.has_payload)
 	return headers;
 
     const properties = properties_from_headers(headers);