aboutsummaryrefslogtreecommitdiff
path: root/libxml2-2.9.10/genUnicode.py
diff options
context:
space:
mode:
Diffstat (limited to 'libxml2-2.9.10/genUnicode.py')
-rwxr-xr-x libxml2-2.9.10/genUnicode.py 478
1 files changed, 478 insertions, 0 deletions
diff --git a/libxml2-2.9.10/genUnicode.py b/libxml2-2.9.10/genUnicode.py
new file mode 100755
index 0000000..4487eeb
--- /dev/null
+++ b/libxml2-2.9.10/genUnicode.py
@@ -0,0 +1,478 @@
+#!/usr/bin/python -u
+#
+# Original script modified in November 2003 to take advantage of
+# the character-validation range routines, and updated to the
+# current Unicode information (Version 4.0.1)
+#
+# NOTE: there is an 'alias' facility for blocks which are not present in
+# the current release, but are needed for ABI compatibility. This
+# must be accomplished MANUALLY! Please see the comments below under
+# 'blockAliases'
+#
+import sys
+import string
+import time
+
+webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
+sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
+
+#
+# blockAliases is a small hack - it is used for mapping block names which
+# were used in the 3.1 release, but are missing or changed in the current
+# release. The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
+blockAliases = []
+blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
+blockAliases.append("Greek:GreekandCoptic")
+blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
+ "SupplementaryPrivateUseArea-B")
+
+# minTableSize is the threshold for generating a range table: a table is
+# produced only when a category has more than this number of ranges. With
+# this many ranges or fewer, inline comparisons are generated instead
+minTableSize = 8
+
+(blockfile, catfile) = string.split(sources)
+
+
+#
+# Now process the "blocks" file, reducing it to a dictionary
+# indexed by blockname, containing a tuple with the applicable
+# block range
+#
+BlockNames = {}
+try:
+ blocks = open(blockfile, "r")
+except:
+ print "Missing %s, aborting ..." % blockfile
+ sys.exit(1)
+
+for line in blocks.readlines():
+ if line[0] == '#':
+ continue
+ line = string.strip(line)
+ if line == '':
+ continue
+ try:
+ fields = string.split(line, ';')
+ range = string.strip(fields[0])
+ (start, end) = string.split(range, "..")
+ name = string.strip(fields[1])
+ name = string.replace(name, ' ', '')
+ except:
+ print "Failed to process line: %s" % (line)
+ continue
+ start = "0x" + start
+ end = "0x" + end
+ try:
+ BlockNames[name].append((start, end))
+ except:
+ BlockNames[name] = [(start, end)]
+blocks.close()
+print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))
+
+for block in blockAliases:
+ alias = string.split(block,':')
+ alist = string.split(alias[1],',')
+ for comp in alist:
+ if BlockNames.has_key(comp):
+ if alias[0] not in BlockNames:
+ BlockNames[alias[0]] = []
+ for r in BlockNames[comp]:
+ BlockNames[alias[0]].append(r)
+ else:
+ print "Alias %s: %s not in Blocks" % (alias[0], comp)
+ continue
+
+#
+# Next process the Categories file. This is more complex, since
+# the file is in code sequence, and we need to invert it. We use
+# a dictionary with index category-name, with each entry containing
+# all the ranges (codepoints) of that category. Note that category
+# names comprise two parts - the general category, and the "subclass"
+# within that category. Therefore, both "general category" (which is
+# the first character of the 2-character category-name) and the full
+# (2-character) name are entered into this dictionary.
+#
+try:
+ data = open(catfile, "r")
+except:
+ print "Missing %s, aborting ..." % catfile
+ sys.exit(1)
+
+nbchar = 0;
+Categories = {}
+for line in data.readlines():
+ if line[0] == '#':
+ continue
+ line = string.strip(line)
+ if line == '':
+ continue
+ try:
+ fields = string.split(line, ';')
+ point = string.strip(fields[0])
+ value = 0
+ while point != '':
+ value = value * 16
+ if point[0] >= '0' and point[0] <= '9':
+ value = value + ord(point[0]) - ord('0')
+ elif point[0] >= 'A' and point[0] <= 'F':
+ value = value + 10 + ord(point[0]) - ord('A')
+ elif point[0] >= 'a' and point[0] <= 'f':
+ value = value + 10 + ord(point[0]) - ord('a')
+ point = point[1:]
+ name = fields[2]
+ except:
+ print "Failed to process line: %s" % (line)
+ continue
+
+ nbchar = nbchar + 1
+ # update entry for "full name"
+ try:
+ Categories[name].append(value)
+ except:
+ try:
+ Categories[name] = [value]
+ except:
+ print "Failed to process line: %s" % (line)
+ # update "general category" name
+ try:
+ Categories[name[0]].append(value)
+ except:
+ try:
+ Categories[name[0]] = [value]
+ except:
+ print "Failed to process line: %s" % (line)
+
+data.close()  # close the categories file; "blocks" was already closed after it was parsed
+print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
+
+#
+# The data is now all read. Time to process it into a more useful form.
+#
+# reduce the number list into ranges
+for cat in Categories.keys():
+ list = Categories[cat]
+ start = -1
+ prev = -1
+ end = -1
+ ranges = []
+ for val in list:
+ if start == -1:
+ start = val
+ prev = val
+ continue
+ elif val == prev + 1:
+ prev = val
+ continue
+ elif prev == start:
+ ranges.append((prev, prev))
+ start = val
+ prev = val
+ continue
+ else:
+ ranges.append((start, prev))
+ start = val
+ prev = val
+ continue
+ if prev == start:
+ ranges.append((prev, prev))
+ else:
+ ranges.append((start, prev))
+ Categories[cat] = ranges
+
+#
+# Ensure all names are in alphabetic order, since we will be doing binary
+# searches on the tables.
+#
+bkeys = BlockNames.keys()
+bkeys.sort()
+
+ckeys = Categories.keys()
+ckeys.sort()
+
+#
+# Generate the resulting files
+#
+try:
+ header = open("include/libxml/xmlunicode.h", "w")
+except:
+ print "Failed to open include/libxml/xmlunicode.h"
+ sys.exit(1)
+
+try:
+ output = open("xmlunicode.c", "w")
+except:
+ print "Failed to open xmlunicode.c"
+ sys.exit(1)
+
+date = time.asctime(time.localtime(time.time()))
+
+header.write(
+"""/*
+ * Summary: Unicode character APIs
+ * Description: API for the Unicode character APIs
+ *
+ * This file is automatically generated from the
+ * UCS description files of the Unicode Character Database
+ * %s
+ * using the genUnicode.py Python script.
+ *
+ * Generation date: %s
+ * Sources: %s
+ * Author: Daniel Veillard
+ */
+
+#ifndef __XML_UNICODE_H__
+#define __XML_UNICODE_H__
+
+#include <libxml/xmlversion.h>
+
+#ifdef LIBXML_UNICODE_ENABLED
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+""" % (webpage, date, sources));
+
+output.write(
+"""/*
+ * xmlunicode.c: this module implements the Unicode character APIs
+ *
+ * This file is automatically generated from the
+ * UCS description files of the Unicode Character Database
+ * %s
+ * using the genUnicode.py Python script.
+ *
+ * Generation date: %s
+ * Sources: %s
+ * Daniel Veillard <veillard@redhat.com>
+ */
+
+#define IN_LIBXML
+#include "libxml.h"
+
+#ifdef LIBXML_UNICODE_ENABLED
+
+#include <string.h>
+#include <libxml/xmlversion.h>
+#include <libxml/xmlunicode.h>
+#include <libxml/chvalid.h>
+
+typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */
+
+typedef struct {
+ const char *rangename;
+ xmlIntFunc *func;
+} xmlUnicodeRange;
+
+typedef struct {
+ const xmlUnicodeRange *table;
+ int numentries;
+} xmlUnicodeNameTable;
+
+
+static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);
+
+static const xmlUnicodeRange xmlUnicodeBlocks[] = {
+""" % (webpage, date, sources));
+
+flag = 0
+for block in bkeys:
+ name = string.replace(block, '-', '')
+ if flag:
+ output.write(',\n')
+ else:
+ flag = 1
+ output.write(' {"%s", xmlUCSIs%s}' % (block, name))
+output.write('};\n\n')
+
+output.write('static const xmlUnicodeRange xmlUnicodeCats[] = {\n')  # const, like xmlUnicodeBlocks
+flag = 0  # set to 1 after the first entry so later entries are preceded by ",\n"
+for name in ckeys:
+ if flag:
+ output.write(',\n')
+ else:
+ flag = 1
+ output.write(' {"%s", xmlUCSIsCat%s}' % (name, name))
+output.write('};\n\n')
+
+#
+# For any categories with more than minTableSize ranges we generate
+# a range table suitable for xmlCharInRange
+#
+for name in ckeys:
+ if len(Categories[name]) > minTableSize:
+ numshort = 0
+ numlong = 0
+ ranges = Categories[name]
+ sptr = "NULL"
+ lptr = "NULL"
+ for range in ranges:
+ (low, high) = range
+ if high < 0x10000:
+ if numshort == 0:
+ pline = "static const xmlChSRange xml%sS[] = {" % name
+ sptr = "xml%sS" % name
+ else:
+ pline += ", "
+ numshort += 1
+ else:
+ if numlong == 0:
+ if numshort > 0:
+ output.write(pline + " };\n")
+ pline = "static const xmlChLRange xml%sL[] = {" % name
+ lptr = "xml%sL" % name
+ else:
+ pline += ", "
+ numlong += 1
+ if len(pline) > 60:
+ output.write(pline + "\n")
+ pline = " "
+ pline += "{%s, %s}" % (hex(low), hex(high))
+ output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
+ % (name, numshort, numlong, sptr, lptr))
+
+
+output.write(
+"""static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
+static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
+
+/**
+ * xmlUnicodeLookup:
+ * @tptr: pointer to the name table
+ * @tname: name to be found
+ *
+ * binary table lookup for user-supplied name
+ *
+ * Returns pointer to range function if found, otherwise NULL
+ */
+static xmlIntFunc
+*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
+ int low, high, mid, cmp;
+ const xmlUnicodeRange *sptr;
+
+ if ((tptr == NULL) || (tname == NULL)) return(NULL);
+
+ low = 0;
+ high = tptr->numentries - 1;
+ sptr = tptr->table;
+ while (low <= high) {
+ mid = (low + high) / 2;
+ if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
+ return (sptr[mid].func);
+ if (cmp < 0)
+ high = mid - 1;
+ else
+ low = mid + 1;
+ }
+ return (NULL);
+}
+
+""" % (len(BlockNames), len(Categories)) )  # entry counts of the block and category name tables
+
+for block in bkeys:
+ name = string.replace(block, '-', '')
+ header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
+ output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
+ output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
+ (block))
+ output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
+ output.write("int\nxmlUCSIs%s(int code) {\n return(" % name)
+ flag = 0
+ for (start, end) in BlockNames[block]:
+ if flag:
+ output.write(" ||\n ")
+ else:
+ flag = 1
+ output.write("((code >= %s) && (code <= %s))" % (start, end))
+ output.write(");\n}\n\n")
+
+header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
+output.write(
+"""/**
+ * xmlUCSIsBlock:
+ * @code: UCS code point
+ * @block: UCS block name
+ *
+ * Check whether the character is part of the UCS Block
+ *
+ * Returns 1 if true, 0 if false and -1 on unknown block
+ */
+int
+xmlUCSIsBlock(int code, const char *block) {
+ xmlIntFunc *func;
+
+ func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
+ if (func == NULL)
+ return (-1);
+ return (func(code));
+}
+
+""")
+
+for name in ckeys:
+ ranges = Categories[name]
+ header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
+ output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
+ output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
+ (name))
+ output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
+ output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
+ if len(Categories[name]) > minTableSize:
+ output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
+ % name)
+ else:
+ start = 1
+ for range in ranges:
+ (begin, end) = range;
+ if start:
+ output.write(" return(");
+ start = 0
+ else:
+ output.write(" ||\n ");
+ if (begin == end):
+ output.write("(code == %s)" % (hex(begin)))
+ else:
+ output.write("((code >= %s) && (code <= %s))" % (
+ hex(begin), hex(end)))
+ output.write(");\n}\n\n")
+
+header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
+output.write(
+"""/**
+ * xmlUCSIsCat:
+ * @code: UCS code point
+ * @cat: UCS Category name
+ *
+ * Check whether the character is part of the UCS Category
+ *
+ * Returns 1 if true, 0 if false and -1 on unknown category
+ */
+int
+xmlUCSIsCat(int code, const char *cat) {
+ xmlIntFunc *func;
+
+ func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
+ if (func == NULL)
+ return (-1);
+ return (func(code));
+}
+
+#define bottom_xmlunicode
+#include "elfgcchack.h"
+#endif /* LIBXML_UNICODE_ENABLED */
+""")
+
+header.write("""
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIBXML_UNICODE_ENABLED */
+
+#endif /* __XML_UNICODE_H__ */
+""");
+
+header.close()
+output.close()