/*
 * common/sanitize_JSON.js, as introduced by commit
 * 64afd5b9415d62c1f178ca78a8358bd3503d5855 (Wojtek Kosior, 2021-07-26):
 * "provide a facility to sanitize externally-obtained JSON".
 */

/**
 * part of Hachette
 * Powerful, full-blown format enforcer for externally-obtained JSON
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/*
 * Schema language understood by this file (a schema is a plain JS value):
 *
 *   "number", "boolean"      item must be of that primitive type
 *   "string"                 item must be a string
 *   "string:<regex>"         item must be a string matching <regex>
 *   <RegExp object>          item must be a string matching the regex
 *   null                     item must be null
 *   "anything"               item is taken as-is
 *   "discard"                item is accepted but dropped from its container
 *   { key: schema, ... }     item must be an object; only the listed keys are
 *                            kept; an entry schema may be written as
 *                            ["optional", schema..., "default", value]
 *   [obj_schema, "matchentry", ("minentries", N,)? ("maxentries", M,)?
 *    (key_regex,)? entry_schema, ...]
 *                            "generic object": keys not listed in obj_schema
 *                            must match one of the key regexes
 *   [s1, s2, ...]            array with exactly these items; may end with
 *                            "repeat"/"repeatfull" (optionally followed by a
 *                            group length), then "minlen", N, then "maxlen", M
 *   [s1, "or", s2, ..., "ordefault", value]
 *                            alternatives, tried in order
 */

/*
 * Module state. Both variables are only meaningful while
 * parse_json_with_schema() is running.
 */

/* Segments (like `.key' or `[3]') leading to the item being processed. */
let error_path;
/* Set once the schema itself (as opposed to the data) is found faulty. */
let invalid_schema;

const string_spec_regex = /^string(:(.*))?$/;

/*
 * Parse `json_string' and enforce `schema' on the result.
 *
 * Returns the sanitized value. Throws a *string* of the form
 * "Invalid JSON: <reason>." or "Invalid JSON schema: <reason>." - callers
 * rely on a string being thrown, hence no Error object is used here.
 */
function parse_json_with_schema(schema, json_string)
{
    error_path = [];
    invalid_schema = false;

    try {
        return sanitize_unknown(schema, JSON.parse(json_string));
    } catch (e) {
        throw `Invalid JSON${invalid_schema ? " schema" : ""}: ${e}.`;
    } finally {
        /* Allow garbage collection. */
        error_path = undefined;
    }
}

/* Prefix `cause' with the path of the item currently being processed. */
function error_message(cause)
{
    return `object${error_path.join("")} ${cause}`;
}

/* Like error_message() but also records that the schema is to blame. */
function schema_error(cause)
{
    invalid_schema = true;
    return error_message(cause);
}

/*
 * Run `action' with `segment' appended to the error path. The path is
 * restored even when `action' throws, so a failed alternative cannot leave
 * stale segments behind. Messages are built before being thrown, so popping
 * in `finally' does not affect them.
 */
function with_error_path_segment(segment, action)
{
    error_path.push(segment);
    try {
        return action();
    } finally {
        error_path.pop();
    }
}

/* Own-property check that cannot be shadowed by a key in the data. */
function has_own(object, key)
{
    return Object.prototype.hasOwnProperty.call(object, key);
}

/*
 * Define `key' as an ordinary own property. Plain assignment is not used
 * because `result["__proto__"] = value' would replace the prototype instead
 * of creating a property, and keys come from untrusted JSON.
 */
function set_own(object, key, value)
{
    Object.defineProperty(object, key, {
        value,
        writable: true,
        enumerable: true,
        configurable: true
    });
}

/* Compile a regex given in the schema; a bad one is a schema error. */
function compile_schema_regex(source)
{
    try {
        return new RegExp(source);
    } catch (e) {
        throw schema_error(`contains invalid regex ${JSON.stringify(source)}`);
    }
}

/* Stateless match: global/sticky regexes remember `lastIndex' otherwise. */
function regex_matches(regex, string)
{
    if (regex.lastIndex !== 0)
        regex.lastIndex = 0;

    return regex.test(string);
}

/*
 * Sanitize `item' against `schema', which may be a list of alternatives
 * ([s1, "or", s2, ..., "ordefault", value]) or any single schema.
 */
function sanitize_unknown(schema, item)
{
    /* The keyword that marks alternatives sits at index 1 of the schema. */
    if (!Array.isArray(schema) || schema.length < 2 ||
        schema[1] === "matchentry" ||
        !["ordefault", "or"].includes(schema[1]))
        return sanitize_unknown_no_alternatives(schema, item);

    if ((schema.length & 1) !== 1)
        throw schema_error("was not understood");

    const schema_options = [];
    let has_default = false;
    let _default = undefined;

    /* Even indexes hold option schemas, odd ones hold keywords. */
    for (let i = 0; i < schema.length; i++) {
        if ((i & 1) !== 1) {
            schema_options.push(schema[i]);
            continue;
        }

        if (schema[i] === "or")
            continue;

        /* "ordefault" is only legal as the last keyword. */
        if (schema[i] === "ordefault" && schema.length === i + 2) {
            has_default = true;
            _default = schema[i + 1];
            break;
        }

        throw schema_error("was not understood");
    }

    const failures = [];

    for (const schema_option of schema_options) {
        try {
            return sanitize_unknown_no_alternatives(schema_option, item);
        } catch (e) {
            /* A broken schema is fatal, other options cannot fix it. */
            if (invalid_schema)
                throw e;

            failures.push(e);
        }
    }

    if (has_default)
        return _default;

    throw failures.join(", or ");
}

/* Dispatch to the sanitizer of the first schema kind that `schema' is. */
function sanitize_unknown_no_alternatives(schema, item)
{
    for (const [schema_check, item_check, sanitizer, type_name] of checks) {
        if (!schema_check(schema))
            continue;

        if (item_check(item))
            return sanitizer(schema, item);

        throw error_message(`should be ${type_name} but is not`);
    }

    throw schema_error("was not understood");
}

/* Render `key' as `.key' when it is identifier-like, as `["key"]' if not. */
function key_error_path_segment(key)
{
    return /^[a-zA-Z_][a-zA-Z_0-9]*$/.test(key) ?
        `.${key}` : `[${JSON.stringify(key)}]`;
}

/*
 * Consume an optional `name, count' pair found at index 2 of a generic
 * object schema (which gets modified in place). Returns the count or
 * `fallback' if the option is absent.
 */
function take_entry_count_option(schema, name, fallback)
{
    if (schema[2] !== name)
        return fallback;

    if (schema.length < 4)
        throw schema_error("was not understood");

    const count = schema[3];
    schema.splice(2, 2);

    if (typeof count !== "number" || !(count >= 0))
        throw schema_error(`specifies invalid "${name}" (should be a non-negative number)`);

    return count;
}

/*
 * Generic object - one that can contain arbitrary keys (in addition to ones
 * specified explicitly in the schema).
 */
function sanitize_genobj(schema, object)
{
    const entry_schemas = [];
    let matched_entries = 0;

    /* Work on a copy, the options get spliced out as they are parsed. */
    schema = [...schema];

    const explicit_schema = schema[0];
    if (!is_plain_object(explicit_schema))
        throw schema_error("was not understood");

    const min_matched_entries =
          take_entry_count_option(schema, "minentries", 0);
    const max_matched_entries =
          take_entry_count_option(schema, "maxentries", Infinity);

    /*
     * What remains past index 1 are `key_regex, entry_schema' pairs; a
     * trailing lone entry schema applies to every non-empty key.
     */
    while (schema.length > 2) {
        let regex = /.+/;

        if (schema.length > 3) {
            regex = schema[2];
            schema.splice(2, 1);
        }

        if (typeof regex === "string")
            regex = compile_schema_regex(regex);
        if (!is_regex(regex))
            throw schema_error("was not understood");

        entry_schemas.push([regex, schema[2]]);
        schema.splice(2, 1);
    }

    const result = sanitize_object(explicit_schema, object);

    for (const key of Object.keys(object)) {
        /*
         * Explicitly listed keys have already been handled above (also
         * when their value got discarded).
         */
        if (has_own(explicit_schema, key))
            continue;

        matched_entries += 1;
        if (matched_entries > max_matched_entries)
            throw error_message(`has more than ${max_matched_entries} matched entr${max_matched_entries === 1 ? "y" : "ies"}`);

        with_error_path_segment(key_error_path_segment(key), () => {
            const matching = entry_schemas.find(
                ([key_regex]) => regex_matches(key_regex, key)
            );

            if (matching === undefined) {
                const regex_list = entry_schemas.map(i => i[0]).join(", ");
                throw error_message(`does not match any of key regexes: [${regex_list}]`);
            }

            sanitize_object_entry(result, key, matching[1], object);
        });
    }

    if (matched_entries < min_matched_entries)
        throw error_message(`has less than ${min_matched_entries} matched entr${min_matched_entries === 1 ? "y" : "ies"}`);

    return result;
}

/*
 * Sanitize an array. Trailing schema modifiers are stripped from the end
 * in this order: "maxlen", N; "minlen", N; "repeat"/"repeatfull" with an
 * optional group length (1 by default). The last `group length' item
 * schemas form a group that is repeated for as long as there are items;
 * "repeatfull" additionally forbids a partial trailing group.
 */
function sanitize_array(schema, array)
{
    let min_length = 0;
    let max_length = Infinity;
    let repeat_length = 1;
    let repeat_directive = undefined;
    let repeat = undefined;
    const result = [];

    schema = [...schema];

    if (schema[schema.length - 2] === "maxlen") {
        max_length = schema[schema.length - 1];
        schema.splice(schema.length - 2);
    }

    if (schema[schema.length - 2] === "minlen") {
        min_length = schema[schema.length - 1];
        schema.splice(schema.length - 2);
    }

    if (["repeat", "repeatfull"].includes(schema[schema.length - 2])) {
        repeat_length = schema.pop();

        if (!Number.isInteger(repeat_length) || repeat_length < 1)
            throw schema_error(`specifies invalid "${schema[schema.length - 1]}" (should be a positive integer)`);
    }

    if (["repeat", "repeatfull"].includes(schema[schema.length - 1])) {
        repeat_directive = schema.pop();

        /* There must be enough item schemas to form the repeated group. */
        if (repeat_length > schema.length)
            throw schema_error("was not understood");

        repeat = schema.splice(schema.length - repeat_length);
    } else if (schema.length !== array.length) {
        throw error_message(`does not have exactly ${schema.length} items`);
    }

    if (repeat_directive === "repeatfull" &&
        (array.length - schema.length) % repeat_length !== 0)
        throw error_message("does not contain a full number of item group repetitions");

    if (array.length < min_length)
        throw error_message(`has less than ${min_length} element${min_length === 1 ? "" : "s"}`);

    if (array.length > max_length)
        throw error_message(`has more than ${max_length} element${max_length === 1 ? "" : "s"}`);

    /* `i' walks the current schema group, `index' walks the array. */
    let i = 0;
    for (const [index, item] of array.entries()) {
        if (i >= schema.length) {
            i = 0;
            schema = repeat;
        }

        const item_schema = schema[i];
        const sanitized = with_error_path_segment(
            `[${index}]`, () => sanitize_unknown(item_schema, item)
        );
        if (sanitized !== discard)
            result.push(sanitized);

        i++;
    }

    return result;
}

/* Accept `string' if it matches the regex object `schema'. */
function sanitize_regex(schema, string)
{
    if (regex_matches(schema, string))
        return string;

    throw error_message(`does not match regex ${schema}`);
}

/* Handle "string" and "string:<regex>" schemas. */
function sanitize_string(schema, string)
{
    const regex = string_spec_regex.exec(schema)[2];

    if (regex === undefined)
        return string;

    return sanitize_regex(compile_schema_regex(regex), string);
}

/* Build a new object holding only the entries listed in `schema'. */
function sanitize_object(schema, object)
{
    const result = {};

    for (const [key, entry_schema] of Object.entries(schema)) {
        with_error_path_segment(key_error_path_segment(key), () => {
            sanitize_object_entry(result, key, entry_schema, object);
        });
    }

    return result;
}

/*
 * Sanitize `object[key]' against `entry_schema' and store the outcome in
 * `result'. `entry_schema' may have the form
 * ["optional", schema..., "default", value], in which case a missing entry
 * is not an error and (if "default" is present) gets the default value.
 */
function sanitize_object_entry(result, key, entry_schema, object)
{
    let optional = false;
    let has_default = false;
    let _default = undefined;

    if (Array.isArray(entry_schema) && entry_schema.length > 1 &&
        entry_schema[0] === "optional") {
        optional = true;
        entry_schema = entry_schema.slice(1);

        /* Index where the "default" keyword, if any, is expected. */
        const idx_def = entry_schema.length - (entry_schema.length & 1) - 1;
        if (entry_schema[idx_def] === "default") {
            has_default = true;
            _default = entry_schema[idx_def + 1];
            entry_schema.splice(idx_def);
        } else if ((entry_schema.length & 1) !== 1) {
            throw schema_error("was not understood");
        }

        /* A single remaining element is the schema itself, unwrap it. */
        if (entry_schema.length < 2)
            entry_schema = entry_schema[0];
    }

    /* Only own properties count - `toString' & co. must not leak in. */
    const unsanitized_value = has_own(object, key) ? object[key] : undefined;
    if (unsanitized_value === undefined) {
        if (!optional)
            throw error_message("is missing");

        if (has_default)
            set_own(result, key, _default);

        return;
    }

    const sanitized = sanitize_unknown(entry_schema, unsanitized_value);
    if (sanitized !== discard)
        set_own(result, key, sanitized);
}

/* Sanitizer for items that need no processing beyond the type check. */
function take_literal(schema, item)
{
    return item;
}

/*
 * This function is used like a symbol. Other parts of code do sth like
 * `item === discard` to check if item was returned by this function.
 */
function discard(schema, item)
{
    return discard;
}

/*
 * The following are some helper functions to categorize various
 * schema item specifiers (used in the array below).
 */

function is_genobj_spec(item)
{
    return Array.isArray(item) && item[1] === "matchentry";
}

function is_regex(item)
{
    return typeof item === "object" && item !== null &&
        typeof item.test === "function";
}

function is_string_spec(item)
{
    return typeof item === "string" && string_spec_regex.test(item);
}

/* `typeof' alone would also let null and arrays through. */
function is_plain_object(item)
{
    return typeof item === "object" && item !== null && !Array.isArray(item);
}

function eq(what)
{
    return i => i === what;
}

/*
 * Rows of: schema check, item check, sanitizer, type name for messages.
 * The first row whose schema check passes is used, so the order matters:
 * array, null and regex checks must go before the object check.
 */
const checks = [
    [is_genobj_spec,  is_plain_object,            sanitize_genobj, "an object"],
    [Array.isArray,   Array.isArray,              sanitize_array,  "an array"],
    [eq(null),        i => i === null,            take_literal,    "null"],
    [is_regex,        i => typeof i === "string", sanitize_regex,  "a string"],
    [is_string_spec,  i => typeof i === "string", sanitize_string, "a string"],
    [is_plain_object, is_plain_object,            sanitize_object, "an object"],
    [eq("number"),    i => typeof i === "number", take_literal,    "a number"],
    [eq("boolean"),   i => typeof i === "boolean", take_literal,   "a boolean"],
    [eq("anything"),  i => true,                  take_literal,    "dummy"],
    [eq("discard"),   i => true,                  discard,         "dummy"]
];

/*
 * EXPORTS_START
 * EXPORT parse_json_with_schema
 * EXPORTS_END
 */