From 35a201cc8ef0c3f5b2df88d2e528aabee1048348 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Fri, 30 Apr 2021 18:47:09 +0200 Subject: Initial/Final commit --- libxml2-2.9.10/os400/iconv/bldcsndfa/bldcsndfa.c | 1953 ++++++++++++++++++++++ 1 file changed, 1953 insertions(+) create mode 100644 libxml2-2.9.10/os400/iconv/bldcsndfa/bldcsndfa.c (limited to 'libxml2-2.9.10/os400/iconv/bldcsndfa/bldcsndfa.c') diff --git a/libxml2-2.9.10/os400/iconv/bldcsndfa/bldcsndfa.c b/libxml2-2.9.10/os400/iconv/bldcsndfa/bldcsndfa.c new file mode 100644 index 0000000..48afd54 --- /dev/null +++ b/libxml2-2.9.10/os400/iconv/bldcsndfa/bldcsndfa.c @@ -0,0 +1,1953 @@ +/** +*** Build a deterministic finite automaton to associate CCSIDs with +*** character set names. +*** +*** Compile on OS/400 with options SYSIFCOPT(*IFSIO). +*** +*** See Copyright for the status of this software. +*** +*** Author: Patrick Monnerat , DATASPHERE S.A. +**/ + +#include +#include +#include +#include +#include +#include + +#include + + +#ifdef OLDXML +#include "xml.h" +#else +#include +#include +#include +#include +#endif + + +#ifdef __OS400__ +#define iconv_open_error(cd) ((cd).return_value == -1) +#define set_iconv_open_error(cd) ((cd).return_value = -1) +#else +#define iconv_open_error(cd) ((cd) == (iconv_t) -1) +#define set_iconv_open_error(cd) ((cd) = (iconv_t) -1) +#endif + + +#define C_SOURCE_CCSID 500 +#define C_UTF8_CCSID 1208 + + +#define UTF8_SPACE 0x20 +#define UTF8_HT 0x09 +#define UTF8_0 0x30 +#define UTF8_9 0x39 +#define UTF8_A 0x41 +#define UTF8_Z 0x5A +#define UTF8_a 0x61 +#define UTF8_z 0x7A + + +#define GRANULE 128 /* Memory allocation granule. */ + +#define EPSILON 0x100 /* Token for empty transition. */ + + +#ifndef OFFSETOF +#define OFFSETOF(t, f) ((unsigned int) ((char *) &((t *) 0)->f - (char *) 0)) +#endif + +#ifndef OFFSETBY +#define OFFSETBY(t, p, o) ((t *) ((char *) (p) + (unsigned int) (o))) +#endif + + +typedef struct t_transition t_transition; /* NFA/DFA transition. 
*/ +typedef struct t_state t_state; /* NFA/DFA state node. */ +typedef struct t_symlist t_symlist; /* Symbol (i.e.: name) list. */ +typedef struct t_chset t_chset; /* Character set. */ +typedef struct t_stategroup t_stategroup; /* Optimization group. */ +typedef unsigned char utf8char; /* UTF-8 character byte. */ +typedef unsigned char byte; /* Untyped data byte. */ + + +typedef struct { /* Set of pointers. */ + unsigned int p_size; /* Current allocated size. */ + unsigned int p_card; /* Current element count. */ + void * p_set[1]; /* Element array; allocated with p_size + 1 slots. */ +} t_powerset; + + +struct t_transition { + t_transition * t_forwprev; /* Previous transition in t_from's forward list. */ + t_transition * t_forwnext; /* Next transition in t_from's forward list. */ + t_transition * t_backprev; /* Previous transition in t_to's backward list. */ + t_transition * t_backnext; /* Next transition in t_to's backward list. */ + t_state * t_from; /* Source state. */ + t_state * t_to; /* Destination state. */ + unsigned short t_token; /* Transition token (byte value or EPSILON). */ + unsigned int t_index; /* Transition array index. */ +}; + + +struct t_state { + t_state * s_next; /* Next state (for DFA construction). */ + t_state * s_stack; /* Work stack link; also chains group members. */ + t_transition * s_forward; /* Head of forward (outgoing) transition list. */ + t_transition * s_backward; /* Head of backward (incoming) transition list. */ + t_chset * s_final; /* Recognized character set (NULL if not final). */ + t_powerset * s_nfastates; /* Corresponding NFA states (DFA states only). */ + unsigned int s_index; /* State index. */ +}; + + +struct t_symlist { + t_symlist * l_next; /* Next name in list. */ + utf8char l_symbol[1]; /* Name bytes (allocated larger than declared). */ +}; + + +struct t_chset { + t_chset * c_next; /* Next character set. */ + t_symlist * c_names; /* Character set name list. */ + iconv_t c_fromUTF8; /* Conversion from UTF-8. */ + unsigned int c_ccsid; /* IBM character set code. */ + unsigned int c_mibenum; /* IANA character code. */ +}; + + +struct t_stategroup { + t_stategroup * g_next; /* Next group. 
*/ + t_state * g_member; /* Group member (s_stack) list. */ + unsigned int g_id; /* Group ident. */ +}; + + + +t_chset * chset_list; /* Character set list. */ +t_state * initial_state; /* Initial NFA state. */ +iconv_t job2utf8; /* Job CCSID to UTF-8 conversion. */ +iconv_t utf82job; /* UTF-8 to job CCSID conversion. */ +t_state * dfa_states; /* List of DFA states. */ +unsigned int groupid; /* Group ident counter. */ + + +/** +*** UTF-8 strings. +**/ + +#pragma convert(819) + +static const utf8char utf8_MIBenum[] = "MIBenum"; +static const utf8char utf8_mibenum[] = "mibenum"; +static const utf8char utf8_ibm_[] = "ibm-"; +static const utf8char utf8_IBMCCSID[] = "IBMCCSID"; +static const utf8char utf8_iana_[] = "iana-"; +static const utf8char utf8_Name[] = "Name"; +static const utf8char utf8_Pref_MIME_Name[] = "Preferred MIME Name"; +static const utf8char utf8_Aliases[] = "Aliases"; +static const utf8char utf8_html[] = "html"; +static const utf8char utf8_htmluri[] = "http://www.w3.org/1999/xhtml"; +static const utf8char utf8_A[] = "A"; +static const utf8char utf8_C[] = "C"; +static const utf8char utf8_M[] = "M"; +static const utf8char utf8_N[] = "N"; +static const utf8char utf8_P[] = "P"; +static const utf8char utf8_T[] = "T"; +static const utf8char utf8_ccsid[] = "ccsid"; +static const utf8char utf8_EBCDIC[] = "EBCDIC"; +static const utf8char utf8_ASCII[] = "ASCII"; +static const utf8char utf8_assocnodes[] = "/ccsid_mibenum/assoc[@ccsid]"; +static const utf8char utf8_aliastext[] = + "/ccsid_mibenum/assoc[@ccsid=$C]/alias/text()"; +#ifdef OLDXML +static const utf8char utf8_tablerows[] = + "//table[@id='table-character-sets-1']/*/tr"; +static const utf8char utf8_headerpos[] = + "count(th[text()=$T]/preceding-sibling::th)+1"; +static const utf8char utf8_getmibenum[] = "number(td[$M])"; +static const utf8char utf8_getprefname[] = "string(td[$P])"; +static const utf8char utf8_getname[] = "string(td[$N])"; +static const utf8char utf8_getaliases[] = "td[$A]/text()"; +#else 
+static const utf8char utf8_tablerows[] = + "//html:table[@id='table-character-sets-1']/*/html:tr"; +static const utf8char utf8_headerpos[] = + "count(html:th[text()=$T]/preceding-sibling::html:th)+1"; +static const utf8char utf8_getmibenum[] = "number(html:td[$M])"; +static const utf8char utf8_getprefname[] = "string(html:td[$P])"; +static const utf8char utf8_getname[] = "string(html:td[$N])"; +static const utf8char utf8_getaliases[] = "html:td[$A]/text()"; +#endif + +#pragma convert(0) + + +/** +*** UTF-8 character length table. +*** +*** Index is first character byte, value is the character byte count. +**/ + +static signed char utf8_chlen[] = { +/* 00-07 */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 08-0F */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 10-17 */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 18-1F */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 20-27 */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 28-2F */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 30-37 */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 38-3F */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 40-47 */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 48-4F */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 50-57 */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 58-5F */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 60-67 */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 68-6F */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 70-77 */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 78-7F */ 1, 1, 1, 1, 1, 1, 1, 1, +/* 80-87 */ -1, -1, -1, -1, -1, -1, -1, -1, +/* 88-8F */ -1, -1, -1, -1, -1, -1, -1, -1, +/* 90-97 */ -1, -1, -1, -1, -1, -1, -1, -1, +/* 98-9F */ -1, -1, -1, -1, -1, -1, -1, -1, +/* A0-A7 */ -1, -1, -1, -1, -1, -1, -1, -1, +/* A8-AF */ -1, -1, -1, -1, -1, -1, -1, -1, +/* B0-B7 */ -1, -1, -1, -1, -1, -1, -1, -1, +/* B8-BF */ -1, -1, -1, -1, -1, -1, -1, -1, +/* C0-C7 */ 2, 2, 2, 2, 2, 2, 2, 2, +/* C8-CF */ 2, 2, 2, 2, 2, 2, 2, 2, +/* D0-D7 */ 2, 2, 2, 2, 2, 2, 2, 2, +/* D8-DF */ 2, 2, 2, 2, 2, 2, 2, 2, +/* E0-E7 */ 3, 3, 3, 3, 3, 3, 3, 3, +/* E8-EF */ 3, 3, 3, 3, 3, 3, 3, 3, +/* F0-F7 */ 4, 4, 4, 4, 4, 4, 4, 4, +/* F8-FF */ 5, 5, 5, 5, 6, 6, -1, -1 +}; + + + +void +chknull(void * p) + +{ + if (p) + return; + + fprintf(stderr, "Not enough 
memory\n"); + exit(1); +} + + +void +makecode(char * buf, unsigned int ccsid) + +{ + ccsid &= 0xFFFF; + memset(buf, 0, 32); + sprintf(buf, "IBMCCSID%05u0000000", ccsid); +} + + +iconv_t +iconv_open_ccsid(unsigned int ccsidout, + unsigned int ccsidin, unsigned int nullflag) + +{ + char fromcode[33]; + char tocode[33]; + + makecode(fromcode, ccsidin); + makecode(tocode, ccsidout); + memset(tocode + 13, 0, sizeof tocode - 13); + + if (nullflag) + fromcode[18] = '1'; + + return iconv_open(tocode, fromcode); +} + + +unsigned int +getnum(char * * cpp) + +{ + unsigned int n; + char * cp; + + cp = *cpp; + n = 0; + + while (isdigit(*cp)) + n = 10 * n + *cp++ - '0'; + + *cpp = cp; + return n; +} + + +const utf8char * +hashBinaryKey(const byte * bytes, unsigned int len) + +{ + const byte * bp; + utf8char * key; + utf8char * cp; + unsigned int n; + unsigned int n4; + unsigned int i; + + /** + *** Encode binary data in character form to be used as hash + *** table key. + **/ + + n = (4 * len + 2) / 3; + key = (utf8char *) malloc(n + 1); + chknull(key); + bp = bytes; + cp = key; + + for (n4 = n >> 2; n4; n4--) { + i = (bp[0] << 16) | (bp[1] << 8) | bp[2]; + *cp++ = 0x21 + ((i >> 18) & 0x3F); + *cp++ = 0x21 + ((i >> 12) & 0x3F); + *cp++ = 0x21 + ((i >> 6) & 0x3F); + *cp++ = 0x21 + (i & 0x3F); + bp += 3; + } + + switch (n & 0x3) { + + case 2: + *cp++ = 0x21 + ((*bp >> 2) & 0x3F); + *cp++ = 0x21 + ((*bp << 4) & 0x3F); + break; + + case 3: + i = (bp[0] << 8) | bp[1]; + *cp++ = 0x21 + ((i >> 10) & 0x3F); + *cp++ = 0x21 + ((i >> 4) & 0x3F); + *cp++ = 0x21 + ((i << 2) & 0x3F); + break; + } + + *cp = '\0'; + return key; +} + + +void * +hash_get(xmlHashTablePtr h, const void * binkey, unsigned int len) + +{ + const utf8char * key; + void * result; + + key = hashBinaryKey((const byte *) binkey, len); + result = xmlHashLookup(h, key); + free((char *) key); + return result; +} + + +int +hash_add(xmlHashTablePtr h, const void * binkey, unsigned int len, void * data) + +{ + const utf8char * 
key; + int result; + + key = hashBinaryKey((const byte *) binkey, len); + result = xmlHashAddEntry(h, key, data); + free((char *) key); + return result; +} + + +xmlDocPtr +loadXMLFile(const char * filename) + +{ + struct stat sbuf; + byte * databuf; + int fd; + int i; + xmlDocPtr doc; + + if (stat(filename, &sbuf)) + return (xmlDocPtr) NULL; + + databuf = malloc(sbuf.st_size + 4); + + if (!databuf) + return (xmlDocPtr) NULL; + + fd = open(filename, O_RDONLY +#ifdef O_BINARY + | O_BINARY +#endif + ); + + if (fd < 0) { + free((char *) databuf); + return (xmlDocPtr) NULL; + } + + i = read(fd, (char *) databuf, sbuf.st_size); + close(fd); + + if (i != sbuf.st_size) { + free((char *) databuf); + return (xmlDocPtr) NULL; + } + + databuf[i] = databuf[i + 1] = databuf[i + 2] = databuf[i + 3] = 0; + doc = xmlParseMemory((xmlChar *) databuf, i); + free((char *) databuf); + return doc; +} + + +int +match(char * * cpp, char * s) + +{ + char * cp; + int c1; + int c2; + + cp = *cpp; + + for (cp = *cpp; c2 = *s++; cp++) { + c1 = *cp; + + if (c1 != c2) { + if (isupper(c1)) + c1 = tolower(c1); + + if (isupper(c2)) + c2 = tolower(c2); + } + + if (c1 != c2) + return 0; + } + + c1 = *cp; + + while (c1 == ' ' || c1 == '\t') + c1 = *++cp; + + *cpp = cp; + return 1; +} + + +t_state * +newstate(void) + +{ + t_state * s; + + s = (t_state *) malloc(sizeof *s); + chknull(s); + memset((char *) s, 0, sizeof *s); + return s; +} + + +void +unlink_transition(t_transition * t) + +{ + if (t->t_backnext) + t->t_backnext->t_backprev = t->t_backprev; + + if (t->t_backprev) + t->t_backprev->t_backnext = t->t_backnext; + else if (t->t_to) + t->t_to->s_backward = t->t_backnext; + + if (t->t_forwnext) + t->t_forwnext->t_forwprev = t->t_forwprev; + + if (t->t_forwprev) + t->t_forwprev->t_forwnext = t->t_forwnext; + else if (t->t_from) + t->t_from->s_forward = t->t_forwnext; + + t->t_backprev = (t_transition *) NULL; + t->t_backnext = (t_transition *) NULL; + t->t_forwprev = (t_transition *) NULL; + 
t->t_forwnext = (t_transition *) NULL; + t->t_from = (t_state *) NULL; + t->t_to = (t_state *) NULL; +} + + +void +link_transition(t_transition * t, t_state * from, t_state * to) + +{ + if (!from) + from = t->t_from; + + if (!to) + to = t->t_to; + + unlink_transition(t); + + if ((t->t_from = from)) { + if ((t->t_forwnext = from->s_forward)) + t->t_forwnext->t_forwprev = t; + + from->s_forward = t; + } + + if ((t->t_to = to)) { + if ((t->t_backnext = to->s_backward)) + t->t_backnext->t_backprev = t; + + to->s_backward = t; + } +} + + +t_transition * +newtransition(unsigned int token, t_state * from, t_state * to) + +{ + t_transition * t; + + t = (t_transition *) malloc(sizeof *t); + chknull(t); + memset((char *) t, 0, sizeof *t); + t->t_token = token; + link_transition(t, from, to); + return t; +} + + +t_transition * +uniquetransition(unsigned int token, t_state * from, t_state * to) + +{ + t_transition * t; + + for (t = from->s_forward; t; t = t->t_forwnext) + if (t->t_token == token && (t->t_to == to || !to)) + return t; + + return to? newtransition(token, from, to): (t_transition *) NULL; +} + + +int +set_position(t_powerset * s, void * e) + +{ + unsigned int l; + unsigned int h; + unsigned int m; + int i; + + l = 0; + h = s->p_card; + + while (l < h) { + m = (l + h) >> 1; + + /** + *** If both pointers belong to different allocation arenas, + *** native comparison may find them neither + *** equal, nor greater, nor smaller. + *** We thus compare using memcmp() to get an orthogonal + *** result. 
+ **/ + + i = memcmp(&e, s->p_set + m, sizeof e); + + if (i < 0) + h = m; + else if (!i) + return m; + else + l = m + 1; + } + + return l; +} + + +t_powerset * +set_include(t_powerset * s, void * e) + +{ + unsigned int pos; + unsigned int n; + + if (!s) { + s = (t_powerset *) malloc(sizeof *s + + GRANULE * sizeof s->p_set); + chknull(s); + s->p_size = GRANULE; + s->p_set[GRANULE] = (t_state *) NULL; + s->p_set[0] = e; + s->p_card = 1; + return s; + } + + pos = set_position(s, e); + + if (pos < s->p_card && s->p_set[pos] == e) + return s; + + if (s->p_card >= s->p_size) { + s->p_size += GRANULE; + s = (t_powerset *) realloc(s, + sizeof *s + s->p_size * sizeof s->p_set); + chknull(s); + s->p_set[s->p_size] = (t_state *) NULL; + } + + n = s->p_card - pos; + + if (n) + memmove((char *) (s->p_set + pos + 1), + (char *) (s->p_set + pos), n * sizeof s->p_set[0]); + + s->p_set[pos] = e; + s->p_card++; + return s; +} + + +t_state * +nfatransition(t_state * to, byte token) + +{ + t_state * from; + + from = newstate(); + newtransition(token, from, to); + return from; +} + + +static t_state * nfadevelop(t_state * from, t_state * final, iconv_t icc, + const utf8char * name, unsigned int len); + + +void +nfaslice(t_state * * from, t_state * * to, iconv_t icc, + const utf8char * chr, unsigned int chlen, + const utf8char * name, unsigned int len, t_state * final) + +{ + char * srcp; + char * dstp; + size_t srcc; + size_t dstc; + unsigned int cnt; + t_state * f; + t_state * t; + t_transition * tp; + byte bytebuf[8]; + + srcp = (char *) chr; + srcc = chlen; + dstp = (char *) bytebuf; + dstc = sizeof bytebuf; + iconv(icc, &srcp, &srcc, &dstp, &dstc); + dstp = (char *) bytebuf; + cnt = sizeof bytebuf - dstc; + t = *to; + f = *from; + + /** + *** Check for end of string. 
+ **/ + + if (!len) + if (t && t != final) + uniquetransition(EPSILON, t, final); + else + t = final; + + if (f) + while (cnt) { + tp = uniquetransition(*dstp, f, (t_state *) NULL); + + if (!tp) + break; + + f = tp->t_to; + dstp++; + cnt--; + } + + if (!cnt) { + if (!t) + t = nfadevelop(f, final, icc, name, len); + + *to = t; + return; + } + + if (!t) { + t = nfadevelop((t_state *) NULL, final, icc, name, len); + *to = t; + } + + if (!f) + *from = f = newstate(); + + while (cnt > 1) + t = nfatransition(t, dstp[--cnt]); + + newtransition(*dstp, f, t); +} + + +t_state * +nfadevelop(t_state * from, t_state * final, iconv_t icc, + const utf8char * name, unsigned int len) + +{ + int chlen; + int i; + t_state * to; + int uccnt; + int lccnt; + utf8char chr; + + chlen = utf8_chlen[*name]; + + for (i = 1; i < chlen; i++) + if ((name[i] & 0xC0) != 0x80) + break; + + if (i != chlen) { + fprintf(stderr, + "Invalid UTF8 character in character set name\n"); + return (t_state *) NULL; + } + + to = (t_state *) NULL; + nfaslice(&from, &to, + icc, name, chlen, name + chlen, len - chlen, final); + + if (*name >= UTF8_a && *name <= UTF8_z) + chr = *name - UTF8_a + UTF8_A; + else if (*name >= UTF8_A && *name <= UTF8_Z) + chr = *name - UTF8_A + UTF8_a; + else + return from; + + nfaslice(&from, &to, icc, &chr, 1, name + chlen, len - chlen, final); + return from; +} + + + +void +nfaenter(const utf8char * name, int len, t_chset * charset) + +{ + t_chset * s; + t_state * final; + t_state * sp; + t_symlist * lp; + + /** + *** Enter case-insensitive `name' in NFA in all known + *** character codes. + *** Redundant shift state changes as well as shift state + *** differences between uppercase and lowercase are + *** not handled. + **/ + + if (len < 0) + len = strlen(name) + 1; + + for (lp = charset->c_names; lp; lp = lp->l_next) + if (!memcmp(name, lp->l_symbol, len)) + return; /* Already entered. 
*/ + + lp = (t_symlist *) malloc(sizeof *lp + len); + chknull(lp); + memcpy(lp->l_symbol, name, len); + lp->l_symbol[len] = '\0'; + lp->l_next = charset->c_names; + charset->c_names = lp; + final = newstate(); + final->s_final = charset; + + for (s = chset_list; s; s = s->c_next) + if (!iconv_open_error(s->c_fromUTF8)) + sp = nfadevelop(initial_state, final, + s->c_fromUTF8, name, len); +} + + +unsigned int +utf8_utostr(utf8char * s, unsigned int v) + +{ + unsigned int d; + unsigned int i; + + d = v / 10; + v -= d * 10; + i = d? utf8_utostr(s, d): 0; + s[i++] = v + UTF8_0; + s[i] = '\0'; + return i; +} + + +unsigned int +utf8_utostrpad(utf8char * s, unsigned int v, int digits) + +{ + unsigned int i = utf8_utostr(s, v); + utf8char pad = UTF8_SPACE; + + if (digits < 0) { + pad = UTF8_0; + digits = -digits; + } + + if (i >= digits) + return i; + + memmove(s + digits - i, s, i + 1); + memset(s, pad, digits - i); + return digits; +} + + +unsigned int +utf8_strtou(const utf8char * s) + +{ + unsigned int v; + + while (*s == UTF8_SPACE || *s == UTF8_HT) + s++; + + for (v = 0; *s >= UTF8_0 && *s <= UTF8_9;) + v = 10 * v + *s++ - UTF8_0; + + return v; +} + + +unsigned int +getNumAttr(xmlNodePtr node, const xmlChar * name) + +{ + const xmlChar * s; + unsigned int val; + + s = xmlGetProp(node, name); + + if (!s) + return 0; + + val = utf8_strtou(s); + xmlFree((xmlChar *) s); + return val; +} + + +void +read_assocs(const char * filename) + +{ + xmlDocPtr doc; + xmlXPathContextPtr ctxt; + xmlXPathObjectPtr obj; + xmlNodePtr node; + t_chset * sp; + int i; + unsigned int ccsid; + unsigned int mibenum; + utf8char symbuf[32]; + + doc = loadXMLFile(filename); + + if (!doc) { + fprintf(stderr, "Cannot load file %s\n", filename); + exit(1); + } + + ctxt = xmlXPathNewContext(doc); + obj = xmlXPathEval(utf8_assocnodes, ctxt); + + if (!obj || obj->type != XPATH_NODESET || !obj->nodesetval || + !obj->nodesetval->nodeTab || !obj->nodesetval->nodeNr) { + fprintf(stderr, "No association found 
in %s\n", filename); + exit(1); + } + + for (i = 0; i < obj->nodesetval->nodeNr; i++) { + node = obj->nodesetval->nodeTab[i]; + ccsid = getNumAttr(node, utf8_ccsid); + mibenum = getNumAttr(node, utf8_mibenum); + + /** + *** Check for duplicate. + **/ + + for (sp = chset_list; sp; sp = sp->c_next) + if (ccsid && ccsid == sp->c_ccsid || + mibenum && mibenum == sp->c_mibenum) { + fprintf(stderr, "Duplicate character set: "); + fprintf(stderr, "CCSID = %u/%u, ", + ccsid, sp->c_ccsid); + fprintf(stderr, "MIBenum = %u/%u\n", + mibenum, sp->c_mibenum); + break; + } + + if (sp) + continue; + + /** + *** Allocate the new character set. + **/ + + sp = (t_chset *) malloc(sizeof *sp); + chknull(sp); + memset(sp, 0, sizeof *sp); + + if (!ccsid) /* Do not attempt with current job CCSID. */ + set_iconv_open_error(sp->c_fromUTF8); + else { + sp->c_fromUTF8 = + iconv_open_ccsid(ccsid, C_UTF8_CCSID, 0); + + if (iconv_open_error(sp->c_fromUTF8) == -1) + fprintf(stderr, + "Cannot convert into CCSID %u: ignored\n", + ccsid); + } + + sp->c_ccsid = ccsid; + sp->c_mibenum = mibenum; + sp->c_next = chset_list; + chset_list = sp; + } + + xmlXPathFreeObject(obj); + + /** + *** Enter aliases. + **/ + + for (sp = chset_list; sp; sp = sp->c_next) { + strcpy(symbuf, utf8_ibm_); + utf8_utostr(symbuf + 4, sp->c_ccsid); + nfaenter(symbuf, -1, sp); + strcpy(symbuf, utf8_IBMCCSID); + utf8_utostrpad(symbuf + 8, sp->c_ccsid, -5); + nfaenter(symbuf, 13, sp); /* Not null-terminated. 
*/ + + if (sp->c_mibenum) { + strcpy(symbuf, utf8_iana_); + utf8_utostr(symbuf + 5, sp->c_mibenum); + nfaenter(symbuf, -1, sp); + } + + xmlXPathRegisterVariable(ctxt, utf8_C, + xmlXPathNewFloat((double) sp->c_ccsid)); + obj = xmlXPathEval(utf8_aliastext, ctxt); + + if (!obj || obj->type != XPATH_NODESET) { + fprintf(stderr, "getAlias failed in %s\n", filename); + exit(1); + } + + if (obj->nodesetval && + obj->nodesetval->nodeTab && obj->nodesetval->nodeNr) { + for (i = 0; i < obj->nodesetval->nodeNr; i++) { + node = obj->nodesetval->nodeTab[i]; + nfaenter(node->content, -1, sp); + } + } + + xmlXPathFreeObject(obj); + } + + xmlXPathFreeContext(ctxt); + xmlFreeDoc(doc); +} + + +unsigned int +columnPosition(xmlXPathContextPtr ctxt, const xmlChar * header) + +{ + xmlXPathObjectPtr obj; + unsigned int res = 0; + + xmlXPathRegisterVariable(ctxt, utf8_T, xmlXPathNewString(header)); + obj = xmlXPathEval(utf8_headerpos, ctxt); + + if (obj) { + if (obj->type == XPATH_NUMBER) + res = (unsigned int) obj->floatval; + + xmlXPathFreeObject(obj); + } + + return res; +} + + +void +read_iana(const char * filename) + +{ + xmlDocPtr doc; + xmlXPathContextPtr ctxt; + xmlXPathObjectPtr obj1; + xmlXPathObjectPtr obj2; + xmlNodePtr node; + int prefnamecol; + int namecol; + int mibenumcol; + int aliascol; + int mibenum; + t_chset * sp; + int n; + int i; + + doc = loadXMLFile(filename); + + if (!doc) { + fprintf(stderr, "Cannot load file %s\n", filename); + exit(1); + } + + ctxt = xmlXPathNewContext(doc); + +#ifndef OLDXML + xmlXPathRegisterNs(ctxt, utf8_html, utf8_htmluri); +#endif + + obj1 = xmlXPathEval(utf8_tablerows, ctxt); + + if (!obj1 || obj1->type != XPATH_NODESET || !obj1->nodesetval || + !obj1->nodesetval->nodeTab || obj1->nodesetval->nodeNr <= 1) { + fprintf(stderr, "No data in %s\n", filename); + exit(1); + } + + /** + *** Identify columns. 
+ **/ + + xmlXPathSetContextNode(obj1->nodesetval->nodeTab[0], ctxt); + prefnamecol = columnPosition(ctxt, utf8_Pref_MIME_Name); + namecol = columnPosition(ctxt, utf8_Name); + mibenumcol = columnPosition(ctxt, utf8_MIBenum); + aliascol = columnPosition(ctxt, utf8_Aliases); + + if (!prefnamecol || !namecol || !mibenumcol || !aliascol) { + fprintf(stderr, "Key column(s) missing in %s\n", filename); + exit(1); + } + + xmlXPathRegisterVariable(ctxt, utf8_P, + xmlXPathNewFloat((double) prefnamecol)); + xmlXPathRegisterVariable(ctxt, utf8_N, + xmlXPathNewFloat((double) namecol)); + xmlXPathRegisterVariable(ctxt, utf8_M, + xmlXPathNewFloat((double) mibenumcol)); + xmlXPathRegisterVariable(ctxt, utf8_A, + xmlXPathNewFloat((double) aliascol)); + + /** + *** Process each row. + **/ + + for (n = 1; n < obj1->nodesetval->nodeNr; n++) { + xmlXPathSetContextNode(obj1->nodesetval->nodeTab[n], ctxt); + + /** + *** Get the MIBenum from current row. + */ + + obj2 = xmlXPathEval(utf8_getmibenum, ctxt); + + if (!obj2 || obj2->type != XPATH_NUMBER) { + fprintf(stderr, "get MIBenum failed at row %u\n", n); + exit(1); + } + + if (xmlXPathIsNaN(obj2->floatval) || + obj2->floatval < 1.0 || obj2->floatval > 65535.0 || + ((unsigned int) obj2->floatval) != obj2->floatval) { + fprintf(stderr, "invalid MIBenum at row %u\n", n); + xmlXPathFreeObject(obj2); + continue; + } + + mibenum = obj2->floatval; + xmlXPathFreeObject(obj2); + + /** + *** Search the associations for a corresponding CCSID. + **/ + + for (sp = chset_list; sp; sp = sp->c_next) + if (sp->c_mibenum == mibenum) + break; + + if (!sp) + continue; /* No CCSID for this MIBenum. */ + + /** + *** Process preferred MIME name. 
+ **/ + + obj2 = xmlXPathEval(utf8_getprefname, ctxt); + + if (!obj2 || obj2->type != XPATH_STRING) { + fprintf(stderr, + "get Preferred_MIME_Name failed at row %u\n", n); + exit(1); + } + + if (obj2->stringval && obj2->stringval[0]) + nfaenter(obj2->stringval, -1, sp); + + xmlXPathFreeObject(obj2); + + /** + *** Process name. + **/ + + obj2 = xmlXPathEval(utf8_getname, ctxt); + + if (!obj2 || obj2->type != XPATH_STRING) { + fprintf(stderr, "get name failed at row %u\n", n); + exit(1); + } + + if (obj2->stringval && obj2->stringval[0]) + nfaenter(obj2->stringval, -1, sp); + + xmlXPathFreeObject(obj2); + + /** + *** Process aliases. + **/ + + obj2 = xmlXPathEval(utf8_getaliases, ctxt); + + if (!obj2 || obj2->type != XPATH_NODESET) { + fprintf(stderr, "get aliases failed at row %u\n", n); + exit(1); + } + + if (obj2->nodesetval && obj2->nodesetval->nodeTab) + for (i = 0; i < obj2->nodesetval->nodeNr; i++) { + node = obj2->nodesetval->nodeTab[i]; + + if (node && node->content && node->content[0]) + nfaenter(node->content, -1, sp); + } + + xmlXPathFreeObject(obj2); + } + + xmlXPathFreeObject(obj1); + xmlXPathFreeContext(ctxt); + xmlFreeDoc(doc); +} + + +t_powerset * closureset(t_powerset * dst, t_powerset * src); + + +t_powerset * +closure(t_powerset * dst, t_state * src) + +{ + t_transition * t; + unsigned int oldcard; + + if (src->s_nfastates) { + /** + *** Is a DFA state: return closure of set of equivalent + *** NFA states. + **/ + + return closureset(dst, src->s_nfastates); + } + + /** + *** Compute closure of NFA state. 
+ **/ + + dst = set_include(dst, src); + + for (t = src->s_forward; t; t = t->t_forwnext) + if (t->t_token == EPSILON) { + oldcard = dst->p_card; + dst = set_include(dst, t->t_to); + + if (oldcard != dst->p_card) + dst = closure(dst, t->t_to); + } + + return dst; +} + + +t_powerset * +closureset(t_powerset * dst, t_powerset * src) + +{ + unsigned int i; + + for (i = 0; i < src->p_card; i++) + dst = closure(dst, (t_state *) src->p_set[i]); + + return dst; +} + + +t_state * +get_dfa_state(t_state * * stack, + t_powerset * nfastates, xmlHashTablePtr sethash) + +{ + t_state * s; + + if (s = hash_get(sethash, nfastates->p_set, + nfastates->p_card * sizeof nfastates->p_set[0])) { + /** + *** DFA state already present. + *** Release the NFA state set and return + *** the address of the old DFA state. + **/ + + free((char *) nfastates); + return s; + } + + /** + *** Build the new state. + **/ + + s = newstate(); + s->s_nfastates = nfastates; + s->s_next = dfa_states; + dfa_states = s; + s->s_stack = *stack; + *stack = s; + + /** + *** Enter it in hash. + **/ + + if (hash_add(sethash, nfastates->p_set, + nfastates->p_card * sizeof nfastates->p_set[0], s)) + chknull(NULL); /* Memory allocation error. */ + + return s; +} + + +int +transcmp(const void * p1, const void * p2) + +{ + t_transition * t1; + t_transition * t2; + + t1 = *(t_transition * *) p1; + t2 = *(t_transition * *) p2; + return ((int) t1->t_token) - ((int) t2->t_token); +} + + +void +builddfa(void) + +{ + t_powerset * transset; + t_powerset * stateset; + t_state * s; + t_state * s2; + unsigned int n; + unsigned int i; + unsigned int token; + t_transition * t; + t_state * stack; + xmlHashTablePtr sethash; + unsigned int nst; + + transset = set_include(NULL, NULL); + chknull(transset); + stateset = set_include(NULL, NULL); + chknull(stateset); + sethash = xmlHashCreate(1); + chknull(sethash); + dfa_states = (t_state *) NULL; + stack = (t_state *) NULL; + nst = 0; + + /** + *** Build the DFA initial state. 
+ **/ + + get_dfa_state(&stack, closure(NULL, initial_state), sethash); + + /** + *** Build the other DFA states by looking at each + *** possible transition from stacked DFA states. + **/ + + do { + if (!(++nst % 100)) + fprintf(stderr, "%u DFA states\n", nst); + + s = stack; + stack = s->s_stack; + s->s_stack = (t_state *) NULL; + + /** + *** Build a set of all non-epsilon transitions from this + *** state. + **/ + + transset->p_card = 0; + + for (n = 0; n < s->s_nfastates->p_card; n++) { + s2 = s->s_nfastates->p_set[n]; + + for (t = s2->s_forward; t; t = t->t_forwnext) + if (t->t_token != EPSILON) { + transset = set_include(transset, t); + chknull(transset); + } + } + + /** + *** Sort transitions by token. + **/ + + qsort(transset->p_set, transset->p_card, + sizeof transset->p_set[0], transcmp); + + /** + *** Process all transitions, grouping them by token. + **/ + + stateset->p_card = 0; + token = EPSILON; + + for (i = 0; i < transset->p_card; i++) { + t = transset->p_set[i]; + + if (token != t->t_token) { + if (stateset->p_card) { + /** + *** Get the equivalent DFA state + *** and create transition. + **/ + + newtransition(token, s, + get_dfa_state(&stack, + closureset(NULL, stateset), + sethash)); + stateset->p_card = 0; + } + + token = t->t_token; + } + + stateset = set_include(stateset, t->t_to); + } + + if (stateset->p_card) + newtransition(token, s, get_dfa_state(&stack, + closureset(NULL, stateset), sethash)); + } while (stack); + + free((char *) transset); + free((char *) stateset); + xmlHashFree(sethash, NULL); + + /** + *** Reverse the state list to get the initial state first, + *** check for ambiguous prefixes, determine final states, + *** destroy NFA state sets. 
+ **/ + + while (s = dfa_states) { + dfa_states = s->s_next; + s->s_next = stack; + stack = s; + stateset = s->s_nfastates; + s->s_nfastates = (t_powerset *) NULL; + + for (n = 0; n < stateset->p_card; n++) { + s2 = (t_state *) stateset->p_set[n]; + + if (s2->s_final) { + if (s->s_final && s->s_final != s2->s_final) + fprintf(stderr, + "Ambiguous name for CCSIDs %u/%u\n", + s->s_final->c_ccsid, + s2->s_final->c_ccsid); + + s->s_final = s2->s_final; + } + } + + free((char *) stateset); + } + + dfa_states = stack; +} + + +void +deletenfa(void) + +{ + t_transition * t; + t_state * s; + t_state * u; + t_state * stack; + + stack = initial_state; + stack->s_stack = (t_state *) NULL; + + while ((s = stack)) { + stack = s->s_stack; + + while ((t = s->s_forward)) { + u = t->t_to; + unlink_transition(t); + free((char *) t); + + if (!u->s_backward) { + u->s_stack = stack; + stack = u; + } + } + + free((char *) s); + } +} + + +t_stategroup * +newgroup(void) + +{ + t_stategroup * g; + + g = (t_stategroup *) malloc(sizeof *g); + chknull(g); + memset((char *) g, 0, sizeof *g); + g->g_id = groupid++; + return g; +} + + +void +optimizedfa(void) + +{ + unsigned int i; + xmlHashTablePtr h; + t_state * s1; + t_state * s2; + t_state * finstates; + t_state * * sp; + t_stategroup * g1; + t_stategroup * g2; + t_stategroup * ghead; + t_transition * t1; + t_transition * t2; + unsigned int done; + unsigned int startgroup; + unsigned int gtrans[1 << (8 * sizeof(unsigned char))]; + + /** + *** Reduce DFA state count. + **/ + + groupid = 0; + ghead = (t_stategroup *) NULL; + + /** + *** First split: non-final and each distinct final states. + **/ + + h = xmlHashCreate(4); + chknull(h); + + for (s1 = dfa_states; s1; s1 = s1->s_next) { + if (!(g1 = hash_get(h, &s1->s_final, sizeof s1->s_final))) { + g1 = newgroup(); + g1->g_next = ghead; + ghead = g1; + + if (hash_add(h, &s1->s_final, sizeof s1->s_final, g1)) + chknull(NULL); /* Memory allocation error. 
*/ + } + + s1->s_index = g1->g_id; + s1->s_stack = g1->g_member; + g1->g_member = s1; + } + + xmlHashFree(h, NULL); + + /** + *** Subsequent splits: states that have the same forward + *** transition tokens to states in the same group. + **/ + + do { + done = 1; + + for (g2 = ghead; g2; g2 = g2->g_next) { + s1 = g2->g_member; + + if (!s1->s_stack) + continue; + + h = xmlHashCreate(1); + chknull(h); + + /** + *** Build the group transition map. + **/ + + memset((char *) gtrans, ~0, sizeof gtrans); + + for (t1 = s1->s_forward; t1; t1 = t1->t_forwnext) + gtrans[t1->t_token] = t1->t_to->s_index; + + if (hash_add(h, gtrans, sizeof gtrans, g2)) + chknull(NULL); + + /** + *** Process other states in group. + **/ + + sp = &s1->s_stack; + s1 = *sp; + + do { + *sp = s1->s_stack; + + /** + *** Build the transition map. + **/ + + memset((char *) gtrans, ~0, sizeof gtrans); + + for (t1 = s1->s_forward; + t1; t1 = t1->t_forwnext) + gtrans[t1->t_token] = t1->t_to->s_index; + + g1 = hash_get(h, gtrans, sizeof gtrans); + + if (g1 == g2) { + *sp = s1; + sp = &s1->s_stack; + } + else { + if (!g1) { + g1 = newgroup(); + g1->g_next = ghead; + ghead = g1; + + if (hash_add(h, gtrans, + sizeof gtrans, g1)) + chknull(NULL); + } + + s1->s_index = g1->g_id; + s1->s_stack = g1->g_member; + g1->g_member = s1; + done = 0; + } + } while (s1 = *sp); + + xmlHashFree(h, NULL); + } + } while (!done); + + /** + *** Establish group leaders and remap transitions. + **/ + + startgroup = dfa_states->s_index; + + for (g1 = ghead; g1; g1 = g1->g_next) + for (s1 = g1->g_member->s_stack; s1; s1 = s1->s_stack) + for (t1 = s1->s_backward; t1; t1 = t2) { + t2 = t1->t_backnext; + link_transition(t1, NULL, g1->g_member); + } + + /** + *** Remove redundant states and transitions. 
+ **/ + + for (g1 = ghead; g1; g1 = g1->g_next) { + g1->g_member->s_next = (t_state *) NULL; + + while ((s1 = g1->g_member->s_stack)) { + g1->g_member->s_stack = s1->s_stack; + + for (t1 = s1->s_forward; t1; t1 = t2) { + t2 = t1->t_forwnext; + unlink_transition(t1); + free((char *) t1); + } + + free((char *) s1); + } + } + + /** + *** Remove group support and relink DFA states. + **/ + + dfa_states = (t_state *) NULL; + s2 = (t_state *) NULL; + finstates = (t_state *) NULL; + + while (g1 = ghead) { + ghead = g1->g_next; + s1 = g1->g_member; + + if (g1->g_id == startgroup) + dfa_states = s1; /* Keep start state first. */ + else if (s1->s_final) { /* Then final states. */ + s1->s_next = finstates; + finstates = s1; + } + else { /* Finish with non-final states. */ + s1->s_next = s2; + s2 = s1; + } + + free((char *) g1); + } + + for (dfa_states->s_next = finstates; finstates->s_next;) + finstates = finstates->s_next; + + finstates->s_next = s2; +} + + +const char * +inttype(unsigned long max) + +{ + int i; + + for (i = 0; max; i++) + max >>= 1; + + if (i > 8 * sizeof(unsigned int)) + return "unsigned long"; + + if (i > 8 * sizeof(unsigned short)) + return "unsigned int"; + + if (i > 8 * sizeof(unsigned char)) + return "unsigned short"; + + return "unsigned char"; +} + + +listids(FILE * fp) + +{ + unsigned int pos; + t_chset * cp; + t_symlist * lp; + char * srcp; + char * dstp; + size_t srcc; + size_t dstc; + char buf[80]; + + fprintf(fp, "/**\n*** CCSID For arg Recognized name.\n"); + pos = 0; + + for (cp = chset_list; cp; cp = cp->c_next) { + if (pos) { + fprintf(fp, "\n"); + pos = 0; + } + + if (!cp->c_names) + continue; + + pos = fprintf(fp, "*** %5u %c ", cp->c_ccsid, + iconv_open_error(cp->c_fromUTF8)? 
' ': 'X'); + + for (lp = cp->c_names; lp; lp = lp->l_next) { + srcp = (char *) lp->l_symbol; + srcc = strlen(srcp); + dstp = buf; + dstc = sizeof buf; + iconv(utf82job, &srcp, &srcc, &dstp, &dstc); + srcc = dstp - buf; + + if (pos + srcc > 79) { + fprintf(fp, "\n***%22c", ' '); + pos = 25; + } + + pos += fprintf(fp, " %.*s", srcc, buf); + } + } + + if (pos) + fprintf(fp, "\n"); + + fprintf(fp, "**/\n\n"); +} + + +void +generate(FILE * fp) + +{ + unsigned int nstates; + unsigned int ntrans; + unsigned int maxfinal; + t_state * s; + t_transition * t; + unsigned int i; + unsigned int pos; + char * ns; + + /** + *** Assign indexes to states and transitions. + **/ + + nstates = 0; + ntrans = 0; + maxfinal = 0; + + for (s = dfa_states; s; s = s->s_next) { + s->s_index = nstates++; + + if (s->s_final) + maxfinal = nstates; + + for (t = s->s_forward; t; t = t->t_forwnext) + t->t_index = ntrans++; + } + + fprintf(fp, + "/**\n*** %u states, %u finals, %u transitions.\n**/\n\n", + nstates, maxfinal, ntrans); + fprintf(stderr, "%u states, %u finals, %u transitions.\n", + nstates, maxfinal, ntrans); + + /** + *** Generate types. + **/ + + fprintf(fp, "typedef unsigned short t_ccsid;\n"); + fprintf(fp, "typedef %-23s t_staterange;\n", inttype(nstates)); + fprintf(fp, "typedef %-23s t_transrange;\n\n", inttype(ntrans)); + + /** + *** Generate first transition index for each state. + **/ + + fprintf(fp, "static t_transrange trans_array[] = {\n"); + pos = 0; + ntrans = 0; + + for (s = dfa_states; s; s = s->s_next) { + pos += fprintf(fp, " %u,", ntrans); + + if (pos > 72) { + fprintf(fp, "\n"); + pos = 0; + } + + for (t = s->s_forward; t; t = t->t_forwnext) + ntrans++; + } + + fprintf(fp, " %u\n};\n\n", ntrans); + + /** + *** Generate final state info. 
+ **/ + + fprintf(fp, "static t_ccsid final_array[] = {\n"); + pos = 0; + ns =""; + i = 0; + + for (s = dfa_states; s && i++ < maxfinal; s = s->s_next) { + pos += fprintf(fp, "%s", ns); + ns = ","; + + if (pos > 72) { + fprintf(fp, "\n"); + pos = 0; + } + + pos += fprintf(fp, " %u", + s->s_final? s->s_final->c_ccsid + 1: 0); + } + + fprintf(fp, "\n};\n\n"); + + /** + *** Generate goto table. + **/ + + fprintf(fp, "static t_staterange goto_array[] = {\n"); + pos = 0; + + for (s = dfa_states; s; s = s->s_next) + for (t = s->s_forward; t; t = t->t_forwnext) { + pos += fprintf(fp, " %u,", t->t_to->s_index); + + if (pos > 72) { + fprintf(fp, "\n"); + pos = 0; + } + } + + fprintf(fp, " %u\n};\n\n", nstates); + + /** + *** Generate transition label table. + **/ + + fprintf(fp, "static unsigned char label_array[] = {\n"); + pos = 0; + ns =""; + + for (s = dfa_states; s; s = s->s_next) + for (t = s->s_forward; t; t = t->t_forwnext) { + pos += fprintf(fp, "%s", ns); + ns = ","; + + if (pos > 72) { + fprintf(fp, "\n"); + pos = 0; + } + + pos += fprintf(fp, " 0x%02X", t->t_token); + } + + fprintf(fp, "\n};\n", nstates); +} + + +main(argc, argv) +int argc; +char * * argv; + +{ + FILE * fp; + t_chset * csp; + char symbuf[20]; + + chset_list = (t_chset *) NULL; + initial_state = newstate(); + job2utf8 = iconv_open_ccsid(C_UTF8_CCSID, C_SOURCE_CCSID, 0); + utf82job = iconv_open_ccsid(C_SOURCE_CCSID, C_UTF8_CCSID, 0); + + if (argc != 4) { + fprintf(stderr, "Usage: %s ", *argv); + fprintf(stderr, " \n"); + exit(1); + } + + /** + *** Read CCSID/MIBenum associations. Define special names. + **/ + + read_assocs(argv[1]); + + /** + *** Read character set names and establish the case-independent + *** name DFA in all possible CCSIDs. + **/ + + read_iana(argv[2]); + + /** + *** Build DFA from NFA. + **/ + + builddfa(); + + /** + *** Delete NFA. + **/ + + deletenfa(); + + /** + *** Minimize the DFA state count. + **/ + + optimizedfa(); + + /** + *** Generate the table. 
+ **/ + + fp = fopen(argv[3], "w+"); + + if (!fp) { + perror(argv[3]); + exit(1); + } + + fprintf(fp, "/**\n"); + fprintf(fp, "*** Character set names table.\n"); + fprintf(fp, "*** Generated by program BLDCSNDFA from"); + fprintf(fp, " IANA character set assignment file\n"); + fprintf(fp, "*** and CCSID/MIBenum equivalence file.\n"); + fprintf(fp, "*** *** Do not edit by hand ***\n"); + fprintf(fp, "**/\n\n"); + listids(fp); + generate(fp); + + if (ferror(fp)) { + perror(argv[3]); + fclose(fp); + exit(1); + } + + fclose(fp); + iconv_close(job2utf8); + iconv_close(utf82job); + exit(0); +} -- cgit v1.2.3