background/stream_filter.js


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215

/**
 * Hachette modifying a web page using the StreamFilter API
 *
 * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

/*
 * IMPORTS_START
 * IMPORT browser
 * IMPORT is_csp_header_name
 * IMPORTS_END
 */

/*
 * Check whether `charset' is an encoding label that TextDecoder supports.
 * Returns `charset' unchanged if it is and `undefined' otherwise.
 */
function validate_encoding(charset)
{
    try {
	/* The constructor throws for labels it does not recognize. */
	new TextDecoder(charset);
	return charset;
    } catch(e) {
	return undefined;
    }
}

/*
 * Tell whether `header' (an object with a `name' property) is a Content-Type
 * header. The comparison ignores case and surrounding whitespace.
 */
function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

/*
 * Matches the `charset' parameter inside a Content-Type value. The first
 * capture group holds the encoding label.
 */
const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;

/*
 * Gather facts about the response from its headers. The returned object may
 * have the following properties set:
 *   - `detected_charset': the first supported encoding label found in
 *     a Content-Type header,
 *   - `html': `true' if any Content-Type header value mentions "html".
 */
function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
	/* `match' is null when the header carries no charset parameter. */
	const match = charset_reg.exec(header.value);
	if (!properties.detected_charset && match &&
	    validate_encoding(match[1]))
	    properties.detected_charset = match[1];

	if (/html/i.test(header.value))
	    properties.html = true;
    }

    return properties;
}

/* Bytes of the UTF-8 byte order mark. */
const UTF8_BOM = [0xef, 0xbb, 0xbf];

/* Recognized byte order marks, each paired with the encoding it denotes. */
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

/*
 * Return the encoding indicated by the byte order mark `data' starts with or
 * an empty string if `data' does not start with any of the known marks.
 */
function charset_from_BOM(data)
{
    const starts_with = mark => mark.every((byte, idx) => data[idx] === byte);
    const hit = BOMs.find(([mark]) => starts_with(mark));

    return hit ? hit[1] : "";
}

/* Attribute selectors of `<meta>' tags that are able to declare a charset. */
const charset_attrs =
      ['charset', 'http-equiv="content-type"', 'content*="charset"'];

/* Selector matching every such `<meta>' that is a direct child of `<head>'. */
const charset_meta_selector =
      charset_attrs.map(attr => "head>meta[" + attr + "]").join(", ");

/*
 * Look through the `<meta>' tags of `doc' for an encoding declaration. For
 * each tag its `charset' attribute is consulted first and the charset
 * parameter of its `content' attribute second. The first label accepted by
 * `validate_encoding()' is returned; `undefined' is returned if there is none.
 */
function charset_from_meta_tags(doc)
{
    const usable = label => Boolean(label) && Boolean(validate_encoding(label));

    for (const tag of doc.querySelectorAll(charset_meta_selector)) {
	const direct = tag.getAttribute("charset");
	if (usable(direct))
	    return direct;

	const parsed = charset_reg.exec(tag.getAttribute("content"));
	const embedded = parsed === null ? undefined : parsed[1];
	if (usable(embedded))
	    return embedded;
    }

    return undefined;
}

/*
 * Construct a TextDecoder for the encoding label `charset'. Returns
 * `undefined' if no label was given or TextDecoder does not support it.
 */
function decoder_for_label(charset)
{
    if (!charset)
	return undefined;

    try {
	return new TextDecoder(charset);
    } catch(e) {
	/* Unsupported label, let the caller fall back to something else. */
	return undefined;
    }
}

/*
 * Choose a TextDecoder for a response body based on its first chunk `data'
 * (a Uint8Array) and on `properties' gathered from the headers. Sources are
 * consulted in order of decreasing authority:
 *   1. byte order mark at the start of `data',
 *   2. `properties.detected_charset' (taken from HTTP headers),
 *   3. zero bytes in `data' (treated as a sign of UTF-16),
 *   4. `<meta>' tags, if the content seems to be HTML,
 *   5. latin1 as the last resort.
 * Content sniffing (steps 3-5) is only performed when steps 1-2 gave no usable
 * encoding. In that case `properties.html' gets set if the data mentions
 * "html".
 */
function create_decoder(properties, data)
{
    /*
     * An encoding known from the BOM or the headers must not be overridden
     * by whatever the `<meta>' tags say (or fail to say).
     */
    const declared_decoder =
	  decoder_for_label(charset_from_BOM(data) ||
			    properties.detected_charset);
    if (declared_decoder)
	return declared_decoder;

    if (data.indexOf(0) !== -1) {
	console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
		      properties);
	return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    const text = new TextDecoder("latin1").decode(data);
    properties.html = properties.html || /html/i.test(text);

    let charset;
    if (properties.html) {
	const tmp_doc = new DOMParser().parseFromString(text, "text/html");
	charset = charset_from_meta_tags(tmp_doc);
    }

    return decoder_for_label(charset) || new TextDecoder("latin1");
}

/*
 * Heuristic telling whether the (possibly incomplete) HTML text `html' either
 * contains a `<meta http-equiv>' tag with CSP rules in its `<head>' or could
 * still have one appended by data that has not arrived yet.
 */
function may_define_csp_rules(html)
{
    const parsed = new DOMParser().parseFromString(html, "text/html");
    const http_equiv_metas = parsed.querySelectorAll("head>meta[http-equiv]");

    const carries_csp = tag =>
	  is_csp_header_name(tag.getAttribute("http-equiv"), true) &&
	  tag.content;

    if (Array.from(http_equiv_metas).some(carries_csp))
	return true;

    /*
     * No offending `<meta>' so far, but the next chunk of HTML might still
     * bring one. `false' may only be returned once it is certain that
     * `<body>' has started, because `<meta>' tags are not valid past that
     * point.
     */
    const body = parsed.body;
    const body_has_started =
	  parsed.documentElement.nextSibling || body.nextSibling ||
	  body.childNodes.length > 1;

    if (body_has_started)
	return false;

    const only_child = body.firstChild;

    if (!only_child)
	return true;

    if (only_child.nodeName !== "#text")
	return false;

    return /^(<\/|&#|.)$/.test(only_child.wholeText);
}

/*
 * Return the number of bytes at the end of `data' (a Uint8Array) that begin
 * a UTF-8 multi-byte sequence whose remaining bytes are not in `data'. Returns
 * 0 if `data' ends on a character boundary (or in malformed input).
 */
function incomplete_utf8_tail_length(data)
{
    /* A sequence is at most 4 bytes long, so at most 3 can be left hanging. */
    const max_lookback = Math.min(3, data.length);

    for (let back = 1; back <= max_lookback; back++) {
	const byte = data[data.length - back];

	/* Continuation byte (10xxxxxx), the lead byte is further back. */
	if ((byte & 0xc0) === 0x80)
	    continue;

	let sequence_length = 1;
	if (byte >= 0xf8)
	    sequence_length = 1; /* not a valid lead byte */
	else if (byte >= 0xf0)
	    sequence_length = 4;
	else if (byte >= 0xe0)
	    sequence_length = 3;
	else if (byte >= 0xc0)
	    sequence_length = 2;

	return sequence_length > back ? back : 0;
    }

    return 0;
}

/*
 * Handle one chunk of response data delivered through `event.data'. The chunk
 * is decoded using `properties.decoder' (created upon the first chunk) and
 * written to `properties.filter' re-encoded as UTF-8. The first chunk is
 * additionally prefixed with a UTF-8 BOM and, if it may define CSP rules,
 * gets a dummy `<script>' injected.
 *
 * If the data is UTF-8 already, the filter gets disconnected after the first
 * call so that the remaining data is not processed here at all.
 */
function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
	first_chunk = true;
	properties.decoder = create_decoder(properties, data);
	properties.encoder = new TextEncoder();
    }

    const is_utf8 = properties.decoder.encoding === "utf-8";

    /*
     * A chunk may end in the middle of a multi-byte character. For UTF-8 we
     * disconnect below and the rest of such character is never seen here, so
     * its initial bytes are held back and later written out unchanged. Other
     * encodings get decoded in streaming mode, which makes the decoder carry
     * incomplete characters over to the next chunk.
     */
    const tail_length = is_utf8 ? incomplete_utf8_tail_length(data) : 0;
    const complete = data.subarray(0, data.length - tail_length);

    let decoded = properties.decoder.decode(complete, {stream: !is_utf8});

    /* Force UTF-8, this is the only encoding we can produce. */
    if (first_chunk)
	properties.filter.write(new Uint8Array(UTF8_BOM));

    if (first_chunk && may_define_csp_rules(decoded)) {
	/*
	 * HAX! Our content scripts that execute at `document_start' will always
	 * run before the first script in the document, but under Mozilla some
	 * `<meta>' tags might already be loaded at that point. Here we inject a
	 * dummy `<script>' at the beginning (before any `<meta>' tags) that
	 * will force `document_start' to happen earlier. This way our content
	 * scripts will be able to sanitize `http-equiv' tags with CSP rules
	 * that would otherwise stop our injected scripts from executing.
	 *
	 * As we want to only process HTML files that happen to have naughty
	 * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
	 * `may_define_csp_rules()'. We don't do any additional MIME sniffing
	 * as it is too unreliable (and our heuristic will likely mark non-HTML
	 * files as harmless anyway).
	 */

	const dummy_script =
	      `<script data-hachette-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
	/* The script has to go after the doctype declaration, if any. */
	const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
	decoded = doctype_decl + dummy_script +
	    decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    if (is_utf8) {
	/* Pass the held back bytes through, their continuation follows. */
	if (tail_length > 0)
	    properties.filter.write(data.slice(data.length - tail_length));

	/* The remaining data needs no re-encoding. */
	properties.filter.disconnect();
    }
}

/*
 * Install a response data filter for the request described by `details' if
 * `policy' has a payload. Data passing through the filter gets processed by
 * `filter_data()' and the filter is closed once the response ends.
 *
 * `headers' are returned unmodified in either case.
 */
function apply_stream_filter(details, headers, policy)
{
    if (policy.has_payload) {
	const properties = properties_from_headers(headers);
	const filter = browser.webRequest.filterResponseData(details.requestId);

	Object.assign(properties, {policy, filter});

	filter.ondata = chunk_event => filter_data(properties, chunk_event);
	filter.onstop = () => filter.close();
    }

    /*
     * Headers specifying the encoding are left as they are, at least for
     * now. The UTF-8 BOM that gets prepended to the data should be enough
     * to have it interpreted properly.
     */
    return headers;
}

/*
 * EXPORTS_START
 * EXPORT apply_stream_filter
 * EXPORTS_END
 */