#!/usr/bin/python -u
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#       the current release, but are needed for ABI compatibility.  This
#       must be accomplished MANUALLY!  Please see the comments below under
#       'blockAliases'
#
"""Generate the libxml2 Unicode block/category tables.

Reads the two Unicode Character Database files named in ``sources`` from the
current directory and writes ``include/libxml/xmlunicode.h`` and
``xmlunicode.c``.  The script runs on import/execution; it defines no
functions.  It exits with status 1 if an input file is missing or an output
file cannot be created.
"""
import sys
import string
import time

webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
    "SupplementaryPrivateUseArea-B",
]

# minTableSize gives the minimum number of ranges which must be present
# before a range table is produced.  If there are fewer than this
# number, inline comparisons are generated
minTableSize = 8

(blockfile, catfile) = sources.split()


#
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname, containing a list of (start, end) tuples with
# the applicable block ranges (kept as "0x..." strings, since they are
# only ever pasted into the generated C source)
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except IOError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

with blocks:
    for line in blocks:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        try:
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # too few ';' fields, or the range is not of the form "A..B"
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
print("Parsed %d blocks descriptions" % (len(BlockNames)))

# Each alias receives the concatenated ranges of the blocks it maps to.
for block in blockAliases:
    alias = block.split(':')
    for comp in alias[1].split(','):
        if comp in BlockNames:
            BlockNames.setdefault(alias[0], []).extend(BlockNames[comp])
        else:
            print("Alias %s: %s not in Blocks" % (alias[0], comp))

#
# Next process the Categories file.  This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the ranges (codepoints) of that category.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
try:
    data = open(catfile, "r")
except IOError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
with data:
    for line in data:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        try:
            fields = line.split(';')
            value = int(fields[0].strip(), 16)
            name = fields[2]
        except (IndexError, ValueError):
            # too few ';' fields, or the code point is not hexadecimal
            print("Failed to process line: %s" % (line))
            continue
        if not name:
            # an empty category cannot be turned into a C identifier
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        Categories.setdefault(name[0], []).append(value)

print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))

#
# The data is now all read.  Time to process it into a more useful form.
#
# reduce the number list into ranges: every run of consecutive code
# points becomes one inclusive (first, last) tuple
for cat in list(Categories):
    ranges = []
    start = None
    prev = None
    for val in Categories[cat]:
        if start is None:
            start = prev = val
        elif val == prev + 1:
            prev = val
        else:
            ranges.append((start, prev))
            start = prev = val
    if start is not None:
        ranges.append((start, prev))
    Categories[cat] = ranges

#
# Assure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
#
bkeys = sorted(BlockNames)
ckeys = sorted(Categories)

#
# Generate the resulting files
#
try:
    header = open("include/libxml/xmlunicode.h", "w")
except IOError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except IOError:
    print("Failed to open xmlunicode.c")
    sys.exit(1)

date = time.asctime()

header.write(
"""/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

""" % (webpage, date, sources))

# The generated C file uses strcmp (string.h) and xmlCharInRange with the
# xmlChSRange/xmlChLRange/xmlChRangeGroup types (libxml/chvalid.h).
output.write(
"""/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int);  /* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    const xmlUnicodeRange *table;
    int             numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = {
""" % (webpage, date, sources))

# One {"name", function} entry per block; '-' is not valid in a C
# identifier, so it is dropped from the function name only.
output.write(',\n'.join(
    '  {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
    for block in bkeys))
output.write('};\n\n')

output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
output.write(',\n'.join(
    '  {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys))
output.write('};\n\n')

#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
for name in ckeys:
    if len(Categories[name]) > minTableSize:
        numshort = 0
        numlong = 0
        sptr = "NULL"
        lptr = "NULL"
        for (low, high) in Categories[name]:
            if high < 0x10000:
                # fits the "short" table
                if numshort == 0:
                    pline = "static const xmlChSRange xml%sS[] = {" % name
                    sptr = "xml%sS" % name
                else:
                    pline += ", "
                numshort += 1
            else:
                if numlong == 0:
                    # first "long" range: finish the short table (if any)
                    # before starting the long one
                    if numshort > 0:
                        output.write(pline + " };\n")
                    pline = "static const xmlChLRange xml%sL[] = {" % name
                    lptr = "xml%sL" % name
                else:
                    pline += ", "
                numlong += 1
            # wrap the generated initializer once the line gets long
            if len(pline) > 60:
                output.write(pline + "\n")
                pline = "    "
            pline += "{%s, %s}" % (hex(low), hex(high))
        output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                     % (name, numshort, numlong, sptr, lptr))


output.write(
"""static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @name: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
        mid = (low + high) / 2;
        if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
            return (sptr[mid].func);
        if (cmp < 0)
            high = mid - 1;
        else
            low = mid + 1;
    }
    return (NULL);
}

""" % (len(BlockNames), len(Categories)))

# One prototype (header) and one inline range-test function (C file)
# per block.
for block in bkeys:
    name = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n" %
                 (block))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
    output.write(" ||\n           ".join(
        "((code >= %s) && (code <= %s))" % (start, end)
        for (start, end) in BlockNames[block]))
    output.write(");\n}\n\n")

header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
output.write(
"""/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
        return (-1);
    return (func(code));
}

""")

# One prototype and one test function per category: a table lookup for
# the categories that got a range table above, inline comparisons otherwise.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n" %
                 (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        terms = []
        for (begin, end) in ranges:
            if begin == end:
                terms.append("(code == %s)" % (hex(begin)))
            else:
                terms.append("((code >= %s) && (code <= %s))" % (
                    hex(begin), hex(end)))
        if terms:
            output.write("    return(" + " ||\n           ".join(terms))
    output.write(");\n}\n\n")

header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
output.write(
"""/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
        return (-1);
    return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
""")

header.write("""
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
""")

header.close()
output.close()