summaryrefslogtreecommitdiff
path: root/common/sanitize_JSON.js
diff options
context:
space:
mode:
Diffstat (limited to 'common/sanitize_JSON.js')
-rw-r--r--common/sanitize_JSON.js412
1 files changed, 412 insertions, 0 deletions
diff --git a/common/sanitize_JSON.js b/common/sanitize_JSON.js
new file mode 100644
index 0000000..3fc5007
--- /dev/null
+++ b/common/sanitize_JSON.js
@@ -0,0 +1,412 @@
+/**
+ * part of Hachette
+ * Powerful, full-blown format enforcer for externally-obtained JSON
+ *
+ * Copyright (C) 2021 Wojtek Kosior
+ * Redistribution terms are gathered in the `copyright' file.
+ */
+
/*
 * Module-wide state shared by all sanitizers in this file. Both variables are
 * (re)initialized by parse_json_with_schema() at the start of every call.
 */

/* Set to true when a failure is blamed on the schema rather than on the data. */
var invalid_schema;
/* Stack of path segments (".key", "[0]", ...) locating the item being checked. */
var error_path;
+
/**
 * Parse `json_string` and sanitize the parsed value against `schema`.
 *
 * Resets the module-level `error_path` and `invalid_schema` state before
 * running, so the function is not reentrant.
 *
 * @param schema - schema description understood by sanitize_unknown()
 * @param {string} json_string - externally obtained JSON text
 * @returns the value produced by sanitize_unknown()
 * @throws {string} "Invalid JSON: <cause>." or, when the schema itself was
 *                  flagged as malformed, "Invalid JSON schema: <cause>."
 */
function parse_json_with_schema(schema, json_string)
{
    invalid_schema = false;
    error_path = [];

    try {
        return sanitize_unknown(schema, JSON.parse(json_string));
    } catch (cause) {
        /* invalid_schema is set by the sanitizers just before they throw. */
        const schema_suffix = invalid_schema ? " schema" : "";
        throw `Invalid JSON${schema_suffix}: ${cause}.`;
    } finally {
        /* Drop the path array so it can be garbage-collected. */
        error_path = undefined;
    }
}
+
/**
 * Build an error description of the form "object<path> <cause>", where
 * <path> is the concatenation of the segments currently in `error_path`.
 *
 * @param cause - text describing what is wrong with the current item
 * @returns {string} the assembled message
 */
function error_message(cause)
{
    const location = error_path.join("");

    return "object".concat(location, " ", cause);
}
+
/**
 * Sanitize `item` against `schema`, handling lists of alternatives.
 *
 * A schema with alternatives is an array of odd length whose odd-indexed
 * elements are the string "or", optionally ending with `"ordefault", value`,
 * e.g. `["number", "or", "string", "ordefault", null]`. Alternatives are
 * tried in order; the first one that accepts `item` wins. When all fail, the
 * default is returned if one was given, otherwise the collected failure
 * messages are thrown joined with ", or ".
 *
 * Any other schema is passed straight to sanitize_unknown_no_alternatives().
 *
 * @param schema - schema (possibly with alternatives)
 * @param item - value to sanitize
 * @returns the sanitized value or the "ordefault" value
 * @throws {string} message from error_message() on mismatch or bad schema
 */
function sanitize_unknown(schema, item)
{
    /*
     * The separator keyword lives at index 1. The array itself must not be
     * passed to includes(): that test is never true and would disable
     * alternatives altogether.
     */
    const has_alternatives = Array.isArray(schema) && schema.length >= 2 &&
          ["ordefault", "or"].includes(schema[1]);

    if (!has_alternatives)
        return sanitize_unknown_no_alternatives(schema, item);

    if ((schema.length & 1) !== 1) {
        invalid_schema = true;
        throw error_message("was not understood");
    }

    const schema_options = [];
    let has_default = false;
    let _default = undefined;

    for (let i = 0; i < schema.length; i++) {
        /* Even positions hold the alternative schemas themselves. */
        if ((i & 1) !== 1) {
            schema_options.push(schema[i]);
            continue;
        }

        if (schema[i] === "or")
            continue;

        /* "ordefault" is only valid as the last separator. */
        if (schema[i] === "ordefault" && schema.length === i + 2) {
            has_default = true;
            _default = schema[i + 1];
            break;
        }

        invalid_schema = true;
        throw error_message("was not understood");
    }

    /*
     * Nested sanitizers push onto error_path and may throw before popping.
     * Remember the depth so each alternative starts from a clean path.
     */
    const path_depth = error_path.length;
    let error_msg = undefined;

    for (const schema_option of schema_options) {
        try {
            return sanitize_unknown_no_alternatives(schema_option, item);
        } catch (e) {
            /* A broken schema is fatal; trying other options is pointless. */
            if (invalid_schema)
                throw e;

            error_path.length = path_depth;

            if (has_default)
                continue;

            error_msg = error_msg === undefined ? e : `${error_msg}, or ${e}`;
        }
    }

    if (has_default)
        return _default;

    throw error_msg;
}
+
/**
 * Sanitize `item` against a single schema (no "or" alternatives).
 *
 * Walks the `checks` table in order; the first row whose schema predicate
 * accepts `schema` decides the outcome: its sanitizer is run when the item
 * predicate accepts `item`, otherwise a type-mismatch message is thrown.
 * When no row recognizes the schema, `invalid_schema` is set and a
 * "was not understood" message is thrown.
 */
function sanitize_unknown_no_alternatives(schema, item)
{
    console.log(`sanitize_unknown_no_alternatives ${JSON.stringify(schema)}`);

    for (const check of checks) {
        const [accepts_schema, accepts_item, sanitize, type_name] = check;
        console.log(`checking ${type_name}`);

        if (!accepts_schema(schema))
            continue;

        if (!accepts_item(item))
            throw error_message(`should be ${type_name} but is not`);

        return sanitize(schema, item);
    }

    invalid_schema = true;
    throw error_message("was not understood");
}
+
/**
 * Render an object key as an error-path segment: `.key` when the key
 * consists only of ASCII letters, digits and underscores and does not start
 * with a digit, `["key"]` (JSON-quoted) otherwise.
 *
 * @param key - object key
 * @returns {string} path segment
 */
function key_error_path_segment(key)
{
    const plain_identifier = /^[a-zA-Z_][a-zA-Z_0-9]*$/;

    if (plain_identifier.test(key))
        return `.${key}`;

    return `[${JSON.stringify(key)}]`;
}
+
/*
 * Generic object - one that can contain arbitrary keys (in addition to ones
 * specified explicitly in the schema).
 *
 * Schema layout, as consumed below:
 *   [object_schema, "matchentry",
 *    ("minentries", N)?, ("maxentries", N)?,
 *    (key_regex, entry_schema)*, entry_schema?]
 * A trailing entry schema without its own regex is matched against /.+/.
 * Key regexes may be RegExp objects or pattern strings.
 *
 * Keys already produced by the explicit object schema are skipped; every
 * other key of `object` must match one of the key regexes and is sanitized
 * with the entry schema of the first regex that matches.
 *
 * Throws a string built by error_message() on mismatch or invalid schema.
 */
function sanitize_genobj(schema, object)
{
    const has_own = Object.prototype.hasOwnProperty;
    const spec = [...schema];
    const entry_schemas = [];
    let min_matched_entries = 0;
    let max_matched_entries = Infinity;
    let matched_entries = 0;

    if (spec[2] === "minentries") {
        if (spec.length < 4) {
            invalid_schema = true;
            throw error_message("was not understood");
        }

        min_matched_entries = spec[3];
        spec.splice(2, 2);
    }

    /* `!(x >= 0)` also rejects NaN, which a plain `x < 0` lets through. */
    if (typeof min_matched_entries !== "number" ||
        !(min_matched_entries >= 0)) {
        invalid_schema = true;
        throw error_message('specifies invalid "minentries" (should be a non-negative number)');
    }

    if (spec[2] === "maxentries") {
        if (spec.length < 4) {
            invalid_schema = true;
            throw error_message("was not understood");
        }

        max_matched_entries = spec[3];
        spec.splice(2, 2);
    }

    if (typeof max_matched_entries !== "number" ||
        !(max_matched_entries >= 0)) {
        invalid_schema = true;
        throw error_message('specifies invalid "maxentries" (should be a non-negative number)');
    }

    while (spec.length > 2) {
        let regex = /.+/;

        /* Two or more elements left: the first of them is a key regex. */
        if (spec.length > 3)
            regex = spec.splice(2, 1)[0];

        if (typeof regex === "string")
            regex = new RegExp(regex);

        if (regex === null || typeof regex !== "object" ||
            typeof regex.exec !== "function") {
            invalid_schema = true;
            throw error_message("was not understood");
        }

        entry_schemas.push([regex, spec.splice(2, 1)[0]]);
    }

    const result = sanitize_object(spec[0], object);

    for (const key of Object.keys(object)) {
        /*
         * Call hasOwnProperty through the prototype: `result` may by now
         * hold an untrusted key named "hasOwnProperty".
         */
        if (has_own.call(result, key))
            continue;

        matched_entries += 1;
        if (matched_entries > max_matched_entries)
            throw error_message(`has more than ${max_matched_entries} matched entr${max_matched_entries === 1 ? "y" : "ies"}`);

        error_path.push(key_error_path_segment(key));

        try {
            const matching = entry_schemas.find(([key_regex]) => {
                /* Global/sticky regexes keep state between calls. */
                key_regex.lastIndex = 0;
                return key_regex.exec(key) !== null;
            });

            if (matching === undefined) {
                const regex_list = entry_schemas.map(i => i[0]).join(", ");
                throw error_message(`does not match any of key regexes: [${regex_list}]`);
            }

            sanitize_object_entry(result, key, matching[1], object);
        } finally {
            /* Keep the path balanced even when sanitizing throws. */
            error_path.pop();
        }
    }

    if (matched_entries < min_matched_entries)
        throw error_message(`has less than ${min_matched_entries} matched entr${min_matched_entries === 1 ? "y" : "ies"}`);

    return result;
}
+
/**
 * Sanitize an array against an array schema.
 *
 * Schema layout, as consumed below (options are stripped from the end):
 *   [item_schema..., ("repeat" | "repeatfull", count?)?,
 *    ("minlen", N)?, ("maxlen", N)?]
 * Without a repeat directive the array must have exactly as many items as
 * there are item schemas. With one, the last `count` (default 1) item
 * schemas form a group that is reused cyclically for all remaining items;
 * "repeatfull" additionally requires the remaining items to form whole
 * groups. Items sanitized to `discard` are left out of the result.
 *
 * @param {Array} schema - array schema
 * @param {Array} array - value to sanitize
 * @returns {Array} sanitized copy
 * @throws {string} message from error_message() on mismatch or bad schema
 */
function sanitize_array(schema, array)
{
    const repeat_directives = ["repeat", "repeatfull"];
    const head = [...schema];
    const result = [];
    let min_length = 0;
    let max_length = Infinity;
    let repeat_length = 1;
    let repeat_directive = undefined;
    let repeat = undefined;

    if (head[head.length - 2] === "maxlen") {
        max_length = head[head.length - 1];
        head.splice(head.length - 2);
    }

    if (head[head.length - 2] === "minlen") {
        min_length = head[head.length - 1];
        head.splice(head.length - 2);
    }

    if (repeat_directives.includes(head[head.length - 2])) {
        repeat_length = head.pop();

        /* After the pop the directive keyword is the last element. */
        if (!Number.isInteger(repeat_length) || repeat_length < 1) {
            invalid_schema = true;
            throw error_message(`specifies invalid "${head[head.length - 1]}" (should be an integer not less than 1)`);
        }
    }

    if (repeat_directives.includes(head[head.length - 1])) {
        repeat_directive = head.pop();
        repeat = head.splice(head.length - repeat_length);
    } else if (head.length !== array.length) {
        throw error_message(`does not have exactly ${head.length} items`);
    }

    if (repeat_directive === "repeatfull" &&
        (array.length - head.length) % repeat_length !== 0)
        throw error_message(`does not contain a full number of item group repetitions`);

    if (array.length < min_length)
        throw error_message(`has less than ${min_length} element${min_length === 1 ? "" : "s"}`);

    if (array.length > max_length)
        throw error_message(`has more than ${max_length} element${max_length === 1 ? "" : "s"}`);

    /*
     * `current` is the list of item schemas in use and `pos` the position
     * within it. The error path uses the real array index, which keeps
     * growing after `pos` wraps around inside the repeated group.
     */
    let current = head;
    let pos = 0;

    for (const [index, item] of array.entries()) {
        if (pos >= current.length) {
            pos = 0;
            current = repeat;
        }

        error_path.push(`[${index}]`);
        try {
            const sanitized = sanitize_unknown(current[pos], item);
            if (sanitized !== discard)
                result.push(sanitized);
        } finally {
            /* Keep the path balanced even when sanitizing throws. */
            error_path.pop();
        }

        pos++;
    }

    return result;
}
+
/**
 * Accept `string` when it matches the regex `schema`.
 *
 * @param schema - object with a RegExp-like `test` method
 * @param string - value to check
 * @returns `string`, unchanged, when it matches
 * @throws {string} message from error_message() when it does not match
 */
function sanitize_regex(schema, string)
{
    /*
     * A regex with the "g" or "y" flag resumes from lastIndex, so reusing
     * one schema object would make results depend on earlier calls.
     */
    schema.lastIndex = 0;

    if (schema.test(string))
        return string;

    throw error_message(`does not match regex ${schema}`);
}
+
/* Matches the "string" and "string:<pattern>" specifiers; group 2 is <pattern>. */
const string_spec_regex = /^string(:(.*))?$/;

/**
 * Sanitize a string against a "string" or "string:<pattern>" specifier.
 * A bare "string" accepts any string; otherwise <pattern> is compiled into
 * a RegExp and the check is delegated to sanitize_regex().
 */
function sanitize_string(schema, string)
{
    console.log(`sanitize_string ${JSON.stringify(schema)}`);

    const pattern = string_spec_regex.exec(schema)[2];

    return pattern === undefined ?
        string : sanitize_regex(new RegExp(pattern), string);
}
+
/**
 * Sanitize `object` against an object schema: a fresh object is built by
 * running sanitize_object_entry() for every key listed in `schema`. Keys of
 * `object` that the schema does not list are not copied.
 *
 * @param schema - mapping of keys to entry schemas
 * @param object - value to sanitize
 * @returns the newly built object
 */
function sanitize_object(schema, object)
{
    console.log(`sanitize_object ${JSON.stringify(schema)}`);

    return Object.entries(schema).reduce((sanitized, [key, entry_schema]) => {
        error_path.push(key_error_path_segment(key));
        sanitize_object_entry(sanitized, key, entry_schema, object);
        error_path.pop();

        return sanitized;
    }, {});
}
+
/**
 * Sanitize the entry `key` of `object` and store the outcome in `result`.
 *
 * `entry_schema` may be wrapped as `["optional", schema..., "default", value]`
 * ("default" and its value are optional). A missing optional entry is
 * skipped or, when a default is given, set to that default. A missing
 * non-optional entry is an error. Values sanitized to `discard` are not
 * stored.
 *
 * @param result - object receiving the sanitized entry
 * @param key - name of the entry
 * @param entry_schema - schema for the entry's value
 * @param object - object being sanitized
 * @throws {string} message from error_message() on mismatch or bad schema
 */
function sanitize_object_entry(result, key, entry_schema, object)
{
    let value_schema = entry_schema;
    let optional = false;
    let has_default = false;
    let _default = undefined;

    /*
     * Define the property explicitly: plain assignment with the key
     * "__proto__" would replace the prototype of `result` instead of
     * creating an entry.
     */
    const store = value => Object.defineProperty(result, key, {
        value,
        writable: true,
        enumerable: true,
        configurable: true
    });

    if (Array.isArray(value_schema) && value_schema.length > 1 &&
        value_schema[0] === "optional") {
        optional = true;
        value_schema = value_schema.slice(1);

        /* Index of the last odd position, where "default" has to sit. */
        const idx_def = value_schema.length - (value_schema.length & 1) - 1;
        if (value_schema[idx_def] === "default") {
            has_default = true;
            _default = value_schema[idx_def + 1];
            value_schema.splice(idx_def);
        } else if ((value_schema.length & 1) !== 1) {
            invalid_schema = true;
            throw error_message("was not understood");
        }

        /* A single remaining element is the schema itself, not a list. */
        if (value_schema.length < 2)
            value_schema = value_schema[0];
    }

    /*
     * Only own properties count as present; otherwise inherited members
     * such as "toString" or "constructor" would pass for supplied data.
     */
    const is_present = Object.prototype.hasOwnProperty.call(object, key) &&
          object[key] !== undefined;

    if (!is_present) {
        if (!optional)
            throw error_message("is missing");

        if (has_default)
            store(_default);

        return;
    }

    const sanitized = sanitize_unknown(value_schema, object[key]);
    if (sanitized !== discard)
        store(sanitized);
}
+
/**
 * Sanitizer for schemas that accept the item as-is: logs the schema and
 * returns `item` untouched.
 */
function take_literal(schema, item)
{
    const schema_text = JSON.stringify(schema);
    console.log(`take_literal ${schema_text}`);

    return item;
}
+
/*
 * Sentinel sanitizer: it always returns itself. Callers compare a
 * sanitizer's result with `discard` (`item === discard`) to find out that
 * the item is meant to be dropped from the output.
 */
function discard(schema, item)
{
    const schema_text = JSON.stringify(schema);
    console.log(`discard ${schema_text}`);

    return discard;
}
+
+/*
+ * The following are some helper functions to categorize various
+ * schema item specifiers (used in the array below).
+ */
+
/**
 * Tell whether `item` is a generic-object specifier, i.e. an array whose
 * second element is the string "matchentry".
 */
function is_genobj_spec(item)
{
    if (!Array.isArray(item))
        return false;

    return item[1] === "matchentry";
}
+
/**
 * Tell whether `item` looks like a regular expression: a non-null object
 * that has a `test` method.
 *
 * @param item - schema element to classify
 * @returns {boolean}
 */
function is_regex(item)
{
    /* typeof null is "object", so null must be ruled out before `.test`. */
    return item !== null && typeof item === "object" &&
        typeof item.test === "function";
}
+
/**
 * Tell whether `item` is a string specifier, i.e. a string accepted by
 * `string_spec_regex`.
 */
function is_string_spec(item)
{
    if (typeof item !== "string")
        return false;

    return string_spec_regex.test(item);
}
+
/**
 * Tell whether `item` is a plain (non-null, non-array) object.
 *
 * The function is used both to classify schemas and to check data items.
 * `typeof` alone reports "object" for null and for arrays as well, which
 * would let such items reach the object sanitizers.
 *
 * @param item - schema element or data item
 * @returns {boolean}
 */
function is_object(item)
{
    return item !== null && typeof item === "object" && !Array.isArray(item);
}
+
/**
 * Make a predicate that tells whether its argument is strictly equal
 * (`===`) to `what`.
 */
function eq(what)
{
    const equals_what = candidate => candidate === what;

    return equals_what;
}
+
/*
 * Dispatch table used by sanitize_unknown_no_alternatives(). Each row is
 * [schema predicate, item predicate, sanitizer, type name for messages].
 * Rows are tried in order and the first row whose schema predicate matches
 * is used.
 *
 * Array and null checks must go before object check.
 */
const checks = (() => {
    const is_string = i => typeof i === "string";
    const is_number = i => typeof i === "number";
    const is_boolean = i => typeof i === "boolean";
    const accept_any = i => true;

    return [
        [is_genobj_spec, is_object, sanitize_genobj, "an object"],
        [Array.isArray, Array.isArray, sanitize_array, "an array"],
        [eq(null), eq(null), take_literal, "null"],
        [is_regex, is_string, sanitize_regex, "a string"],
        [is_string_spec, is_string, sanitize_string, "a string"],
        [is_object, is_object, sanitize_object, "an object"],
        [eq("number"), is_number, take_literal, "a number"],
        [eq("boolean"), is_boolean, take_literal, "a boolean"],
        [eq("anything"), accept_any, take_literal, "dummy"],
        [eq("discard"), accept_any, discard, "dummy"]
    ];
})();
+
+/*
+ * EXPORTS_START
+ * EXPORT parse_json_with_schema
+ * EXPORTS_END
+ */