| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2000-2001, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: rptp2ucm.c |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2001feb16 |
| * created by: Markus W. Scherer |
| * |
| * This tool reads two CDRA conversion table files (RPMAP & TPMAP or RXMAP and TXMAP) and |
| * generates a canonicalized ICU .ucm file from them. |
| * If the RPMAP/RXMAP file does not contain a comment line with the substitution character, |
| * then this tool also attempts to read the header of the corresponding UPMAP/UXMAP file |
| * to extract subchar and subchar1. |
| * |
| * R*MAP: Unicode->codepage |
| * T*MAP: codepage->Unicode |
| * |
| * To compile, just call a C compiler/linker with this source file. |
| * On Windows: cl rptp2ucm.c |
| */ |
| |
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
| |
/*
 * One entry of the built-in substitution character table:
 * a converter name and the subchar/subchar1 values that getSubchar()
 * copies into the global variables of the same names.
 */
typedef struct UCMSubchar {
    const char *name;
    unsigned long subchar;
    unsigned long subchar1;
} UCMSubchar;

/* substitution characters for tables whose source files do not specify them */
static const UCMSubchar
knownSubchars[]={
    { "274_P100",  0x3f, 0 },
    { "850_P100",  0x7f, 0 },
    { "913_P100",  0x1a, 0 },
    { "1047_P100", 0x3f, 0 }
};
| |
/* Associates a CCSID with the <icu:state> lines that are written for it. */
typedef struct CCSIDStateTable {
    unsigned int ccsid;
    const char *table;
} CCSIDStateTable;

/* Year string that is appended to the name of each generated .ucm file. */
#define YEAR "2000"

/* State table text shared by several of the knownStateTables[] entries below. */
#define japanesePCDBCSStates "<icu:state> 0-ff:2, 81-9f:1, a0-fc:1\n"\
                             "<icu:state> 40-7e, 80-fc\n"\
                             "<icu:state>\n"

/*
 * State tables by CCSID, searched linearly by getStateTable().
 * Every table ends with a newline; writeUCM() adds one more newline
 * after the table to separate it from the CHARMAP section.
 */
static const CCSIDStateTable
knownStateTables[]={

    { 301,   "<icu:state> 0-ff:2, 81-9f:1, e0-fc:1\n"
             "<icu:state> 40-7e, 80-fc\n"
             "<icu:state>\n" },

    { 367,   "<icu:state> 0-7f\n" },

    { 927,   japanesePCDBCSStates },

    { 926,   japanesePCDBCSStates },

    { 928,   japanesePCDBCSStates },

    { 932,   "<icu:state> 0-7f,80,81-9f:1,a0-df,fd-ff, e0-fc:1\n"
             "<icu:state> 40-7e, 80-fc\n" },

    { 941,   japanesePCDBCSStates },

    { 942,   "<icu:state> 0-80, 81-9f:1, a0-df, e0-fc:1, fd-ff\n"
             "<icu:state> 40-7e, 80-fc\n" },

    { 943,   "<icu:state> 0-7f, 81-9f:1, a0-df, e0-fc:1\n"
             "<icu:state> 40-7e, 80-fc\n" },

    { 944,   "<icu:state> 0-80, 81-bf:1, c0-ff\n"
             "<icu:state> 40-7e, 80-fe\n" },

    /* the last line used to lack its trailing newline, unlike every other table */
    { 946,   "<icu:state> 0-80, 81-fb:1,fc:2,fd-ff\n"
             "<icu:state> 40-7e, 80-fe\n"
             "<icu:state> 80-fe.u,fc\n" },

    { 947,   "<icu:state> 0-7f, 80-fe:1\n"
             "<icu:state> 40-7e, 80-fe\n" },

    { 948,   "<icu:state> 0-80, 81-fb:1,fc:2,fd-fe\n"
             "<icu:state> 40-7e, 80-fe\n"
             "<icu:state> 80-fe.u,fc\n" },

    { 949,   "<icu:state> 0-84, 8f-fe:1\n"
             "<icu:state> 40-7e, 80-fe\n" },

    { 950,   "<icu:state> 0-7f, 81-fe:1\n"
             "<icu:state> 40-7e, 81-fe\n" },

    { 954,   "<icu:state> 0-8d, 8e:2, 8f:3, 90-9f, a1-fe:1\n"
             "<icu:state> a1-fe\n"
             "<icu:state> a1-e4\n"
             "<icu:state> a1-fe:1, a1:4\n"
             "<icu:state> a1-fe.u\n" },

    { 955,   "<icu:state> 0-20:2, 21-7e:1, 7f-ff:2\n"
             "<icu:state> 21-7e\n"
             "<icu:state>\n" },

    { 963,   "<icu:state> 0-20:2, 21-7e:1, 7f-ff:2\n"
             "<icu:state> 21-7e\n"
             "<icu:state>\n" },

    { 964,   "<icu:state> 0-8d, 8e:2, 90-9f, a1-fe:1, aa-c1:5, c3:5, fe:5\n"
             "<icu:state> a1-fe\n"
             "<icu:state> a1-b0:3, a1:4, a2:8, a3-ab:4, ac:7, ad:6, ae-b0:4\n"
             "<icu:state> a1-fe:1\n"
             "<icu:state> a1-fe:5\n"
             "<icu:state> a1-fe.u\n"
             "<icu:state> a1-a4:1, a5-fe:5\n"
             "<icu:state> a1-e2:1, e3-fe:5\n"
             "<icu:state> a1-f2:1, f3-fe:5\n" },

    { 970,   "<icu:state> 0-9f, a1-fe:1\n"
             "<icu:state> a1-fe\n" },

    { 1363,  "<icu:state> 0-7f, 81-fe:1\n"
             "<icu:state> 40-7e, 80-fe\n" },

    { 1350,  "<icu:state> 0-8d, 8e:2, 8f:3, 90-9f, a1-fe:1\n"
             "<icu:state> a1-fe\n"
             "<icu:state> a1-e4\n"
             "<icu:state> a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4\n"
             "<icu:state> a1-fe.u\n" },

    { 1351,  "<icu:state> 0-ff:2, 81-9f:1, e0-fc:1\n"
             "<icu:state> 40-7e, 80-fc\n"
             "<icu:state>\n" },

    { 1370,  "<icu:state> 0-80, 81-fe:1\n"
             "<icu:state> 40-7e, 81-fe\n" },

    { 1381,  "<icu:state> 0-84, 8c-fe:1\n"
             "<icu:state> a1-fe\n" },

    { 1383,  "<icu:state> 0-9f, a1-fe:1\n"
             "<icu:state> a1-fe\n" },

    { 1385,  "<icu:state> 0-ff:2,81-fe:1\n"
             "<icu:state> 40-7e, 80-fe\n"
             "<icu:state>\n" },

    { 1386,  "<icu:state> 0-7f, 81-fe:1\n"
             "<icu:state> 40-7e, 80-fe\n" },

    { 5039,  "<icu:state> 0-80, 81-9f:1, a0-df, e0-fc:1, fd-ff\n"
             "<icu:state> 40-7e, 80-fc\n" },

    { 5050,  "<icu:state> 0-8d, 8e:2, 8f:3, 90-9f, a1-fe:1\n"
             "<icu:state> a1-fe\n"
             "<icu:state> a1-e4\n"
             "<icu:state> a1-fe:1, a1:4, a3-af:4, b6:4, d6:4, da-db:4, ed-f2:4\n"
             "<icu:state> a1-fe.u\n" },

    { 5067,  "<icu:state> 0-ff:2, 21-7e:1\n"
             "<icu:state> 21-7e\n"
             "<icu:state>\n" },

    { 5478,  "<icu:state> 0-ff:2, 21-7e:1\n"
             "<icu:state> 21-7e\n"
             "<icu:state>\n" },

    { 21427, "<icu:state> 0-80:2, 81-fe:1, ff:2\n"
             "<icu:state> 40-7e, 80-fe\n"
             "<icu:state>\n" },

    { 25546, "<icu:state> 0-7f, e:1.s, f:0.s\n"
             "<icu:state> initial, 0-20:3, e:1.s, f:0.s, 21-7e:2, 7f-ff:3\n"
             "<icu:state> 0-20:1.i, 21-7e:1., 7f-ff:1.i\n"
             "<icu:state> 0-ff:1.i\n" },

    { 33722, "<icu:state> 0-8d, 8e:2, 8f:3, 90-9f, a1-fe:1\n"
             "<icu:state> a1-fe\n"
             "<icu:state> a1-e4\n"
             "<icu:state> a1-fe:1, a1:4, a3-af:4, b6:4, d6:4, da-db:4, ed-f2:4\n"
             "<icu:state> a1-fe.u\n" }
};
| |
/*
 * One mapping between a Unicode code point and a codepage byte sequence.
 *
 * u: bits 31..24 hold the fallback indicator
 *        0  roundtrip
 *        1  Unicode->codepage
 *        2  Unicode->codepage where the bytes equal subchar or subchar1
 *           (set by mergeMappings())
 *        3  codepage->Unicode
 *    bits 23..0 hold the Unicode code point
 *
 * b: the codepage bytes, with leading zeroes
 */
typedef struct Mapping {
    unsigned long u;
    unsigned long b;
} Mapping;

/* capacity of each of the two mapping arrays */
#define MAX_MAPPINGS_COUNT 200000

/* mappings parsed from the R*MAP file; mergeMappings() merges the T*MAP mappings into it */
static Mapping fromUMappings[MAX_MAPPINGS_COUNT];

/* mappings parsed from the T*MAP file */
static Mapping toUMappings[MAX_MAPPINGS_COUNT];

/* number of entries in use in fromUMappings[] and toUMappings[] */
static long fromUMappingsTop;
static long toUMappingsTop;

/* substitution characters of the current table, 0 if not known */
static unsigned long subchar;
static unsigned long subchar1;

/* CCSID of the current table, taken from the input filename */
static unsigned int ccsid;
| |
/* values for charsetFamily */
enum {
    ASCII=0,
    EBCDIC=1,
    UNKNOWN=2
};

/*
 * Properties of the current table; reset by init(),
 * computed by analyzeTable() and used by writeUCM().
 */
static char minCharLength;      /* shortest byte sequence length seen */
static char maxCharLength;      /* longest byte sequence length seen */
static char charsetFamily;      /* ASCII, EBCDIC or UNKNOWN */
static char usesPUA;            /* a mapping uses a private use code point */
static char variantLF;          /* EBCDIC table with U+000A mapped to 0x15 */
static char variantASCII;       /* some ASCII graphic character is not mapped to itself */
static char variantControls;    /* some C0 control or U+007F is not mapped to itself */
static char variantSUB;         /* IBM PC rotation of 0x1a/0x1c/0x7f */
static char is7Bit;             /* no byte value above 0x7f occurs */
| |
| static void |
| init() { |
| fromUMappingsTop=toUMappingsTop=0; |
| |
| subchar=subchar1=0; |
| ccsid=0; |
| |
| minCharLength=4; |
| maxCharLength=0; |
| charsetFamily=UNKNOWN; |
| usesPUA=0; |
| variantLF=0; |
| variantASCII=0; |
| variantControls=0; |
| variantSUB=0; |
| is7Bit=0; |
| } |
| |
| /* lexically compare Mappings for sorting */ |
| static int |
| compareMappings(const void *left, const void *right) { |
| const Mapping *l=(const Mapping *)left, *r=(const Mapping *)right; |
| long result; |
| |
| /* the code points use fewer than 32 bits, just cast them to signed values and subtract */ |
| result=(long)(l->u&0xffffff)-(long)(r->u&0xffffff); |
| if(result!=0) { |
| /* shift right 16 with sign-extend to take care of int possibly being 16 bits wide */ |
| return (int)(result>>16)|1; |
| } |
| |
| /* the b fields may use all 32 bits as unsigned long, so result=(long)(l->b-r->b) would not work (try l->b=0x80000000 and r->b=1) */ |
| if(l->b<r->b) { |
| return -1; |
| } else if(l->b>r->b) { |
| return 1; |
| } |
| |
| return (int)(l->u>>24)-(int)(r->u>>24); |
| } |
| |
/*
 * Return a pointer to the first character in s that is neither a space nor a tab.
 * This is the terminating NUL if s contains nothing else.
 */
static const char *
skipWhitespace(const char *s) {
    return s+strspn(s, " \t");
}
| |
| static long |
| parseMappings(FILE *f, Mapping *mappings) { |
| char line[200]; |
| Mapping *oldMappings; |
| char *s, *end; |
| long mappingsTop=0; |
| |
| oldMappings=mappings; |
| while(fgets(line, sizeof(line), f)!=NULL) { |
| s=(char *)skipWhitespace(line); |
| |
| /* skip empty lines */ |
| if(*s==0 || *s=='\n' || *s=='\r') { |
| continue; |
| } |
| |
| /* explicit end of table */ |
| if(memcmp(s, "END CHARMAP", 11)==0) { |
| break; |
| } |
| |
| /* comment lines, parse substitution characters, otherwise skip them */ |
| if(*s=='#' || *s=='*') { |
| /* get subchar1 */ |
| s=strstr(line, "for U+00xx"); |
| if(s!=NULL) { |
| s=strstr(line, "x'"); |
| if(s!=NULL) { |
| s+=2; |
| subchar1=strtoul(s, &end, 16); |
| if(end!=s+2 || *end!='\'') { |
| fprintf(stderr, "error parsing subchar1 from \"%s\"\n", line); |
| exit(2); |
| } |
| continue; |
| } else { |
| fprintf(stderr, "error finding subchar1 on \"%s\"\n", line); |
| exit(2); |
| } |
| } |
| |
| /* get subchar */ |
| s=strstr(line, "for U+xxxx"); |
| if(s!=NULL) { |
| s=strstr(line, "x'"); |
| if(s!=NULL) { |
| s+=2; |
| subchar=strtoul(s, &end, 16); |
| if(end<s+2 || *end!='\'') { |
| fprintf(stderr, "error parsing subchar from \"%s\"\n", line); |
| exit(2); |
| } |
| continue; |
| } else { |
| fprintf(stderr, "error finding subchar on \"%s\"\n", line); |
| exit(2); |
| } |
| } |
| |
| continue; |
| } |
| |
| mappings->b=strtoul(s, &end, 16); |
| if(s==end || (*end!=' ' && *end!='\t')) { |
| if((s+1)==end && *end=='-' && (mappings->b<=3)) { |
| /* this is a special EUC format where the code set number prepends the bytes */ |
| unsigned long prefix; |
| |
| switch(mappings->b) { |
| case 0: |
| prefix=0; |
| break; |
| case 1: |
| prefix=0; |
| break; |
| case 2: |
| prefix=0x8e; |
| break; |
| case 3: |
| prefix=0x8f; |
| break; |
| default: |
| /* never occurs because of above check */ |
| break; |
| } |
| |
| s+=2; |
| mappings->b=strtoul(s, &end, 16); |
| if(s==end || ((end-s)&1) || (*end!=' ' && *end!='\t')) { |
| fprintf(stderr, "error parsing EUC codepage bytes on \"%s\"\n", line); |
| exit(2); |
| } |
| mappings->b|=prefix<<(4*(end-s)); |
| } else { |
| fprintf(stderr, "error parsing codepage bytes on \"%s\"\n", line); |
| exit(2); |
| } |
| } |
| |
| s=(char *)skipWhitespace(end); |
| mappings->u=strtoul(s, &end, 16); |
| if(s==end || (*end!=' ' && *end!='\t' && *end!='\n' && *end!='\r' && *end!=0)) { |
| if(strncmp(s, "????", 4)==0 || strstr(s, "UNASSIGNED")!=NULL) { |
| /* this is a non-entry, do not add it to the mapping table */ |
| continue; |
| } |
| fprintf(stderr, "error parsing Unicode code point on \"%s\"\n", line); |
| exit(2); |
| } |
| |
| ++mappings; |
| if(++mappingsTop>=MAX_MAPPINGS_COUNT) { |
| fprintf(stderr, "error: too many mappings at \"%s\"\n", line); |
| exit(2); |
| } |
| } |
| |
| /* sort the mappings */ |
| qsort(oldMappings, mappingsTop, sizeof(Mapping), compareMappings); |
| |
| return mappingsTop; |
| } |
| |
| /* merge the mappings into fromUMappings and add fallback indicator values to Mapping.u bits 31..24 */ |
| static void |
| mergeMappings() { |
| long fromUIndex, toUIndex, newFromUMappingsTop=fromUMappingsTop; |
| int cmp; |
| |
| fromUIndex=toUIndex=0; |
| while(fromUIndex<fromUMappingsTop && toUIndex<toUMappingsTop) { |
| cmp=compareMappings(fromUMappings+fromUIndex, toUMappings+toUIndex); |
| if(cmp==0) { |
| /* equal: roundtrip, nothing to do */ |
| ++fromUIndex; |
| ++toUIndex; |
| } else if(cmp<0) { |
| /* |
| * the fromU mapping does not have a toU counterpart: |
| * fallback Unicode->codepage |
| */ |
| if(fromUMappings[fromUIndex].b!=subchar && fromUMappings[fromUIndex].b!=subchar1) { |
| fromUMappings[fromUIndex++].u|=0x1000000; |
| } else { |
| fromUMappings[fromUIndex++].u|=0x2000000; |
| } |
| } else { |
| /* |
| * the toU mapping does not have a fromU counterpart: |
| * (reverse) fallback codepage->Unicode, copy it to the fromU table |
| */ |
| fromUMappings[newFromUMappingsTop].u=toUMappings[toUIndex].u|=0x3000000; |
| fromUMappings[newFromUMappingsTop++].b=toUMappings[toUIndex++].b; |
| } |
| } |
| |
| /* either one or both tables are exhausted */ |
| while(fromUIndex<fromUMappingsTop) { |
| /* leftover fromU mappings are fallbacks */ |
| if(fromUMappings[fromUIndex].b!=subchar && fromUMappings[fromUIndex].b!=subchar1) { |
| fromUMappings[fromUIndex++].u|=0x1000000; |
| } else { |
| fromUMappings[fromUIndex++].u|=0x2000000; |
| } |
| } |
| |
| while(toUIndex<toUMappingsTop) { |
| /* leftover toU mappings are reverse fallbacks */ |
| fromUMappings[newFromUMappingsTop].u=toUMappings[toUIndex].u|=0x3000000; |
| fromUMappings[newFromUMappingsTop++].b=toUMappings[toUIndex++].b; |
| } |
| |
| fromUMappingsTop=newFromUMappingsTop; |
| |
| /* re-sort the mappings */ |
| qsort(fromUMappings, fromUMappingsTop, sizeof(Mapping), compareMappings); |
| } |
| |
| static void |
| analyzeTable() { |
| unsigned long u, b, f, minTwoByte=0xffff, maxTwoByte=0, oredBytes=0; |
| long i, countASCII=0; |
| char length; |
| |
| for(i=0; i<fromUMappingsTop; ++i) { |
| f=fromUMappings[i].u>>24; |
| u=fromUMappings[i].u&0xffffff; |
| b=fromUMappings[i].b; |
| |
| oredBytes|=b; |
| |
| /* character length? */ |
| if(b<=0xff) { |
| length=1; |
| } else if(b<=0xffff) { |
| length=2; |
| if(b<minTwoByte) { |
| minTwoByte=b; |
| } |
| if(b>maxTwoByte) { |
| maxTwoByte=b; |
| } |
| } else if(b<=0xffffff) { |
| length=3; |
| } else { |
| length=4; |
| } |
| if(length<minCharLength) { |
| minCharLength=length; |
| } |
| if(length>maxCharLength) { |
| maxCharLength=length; |
| } |
| |
| /* PUA used? */ |
| if((unsigned long)(u-0xe000)<0x1900 || (unsigned long)(u-0xf0000)<0x20000) { |
| usesPUA=1; |
| } |
| |
| /* only consider roundtrip mappings for the rest */ |
| if(f!=0) { |
| continue; |
| } |
| |
| /* ASCII or EBCDIC? */ |
| if(u==0x41) { |
| if(b==0x41) { |
| charsetFamily=ASCII; |
| } else if(b==0xc1) { |
| charsetFamily=EBCDIC; |
| } |
| } else if(u==0xa) { |
| if(b==0xa) { |
| charsetFamily=ASCII; |
| } else if(b==0x25) { |
| charsetFamily=EBCDIC; |
| variantLF=0; |
| } else if(b==0x15) { |
| charsetFamily=EBCDIC; |
| variantLF=1; |
| } |
| } |
| |
| /* US-ASCII? */ |
| if((unsigned long)(u-0x21)<94) { |
| if(u==b) { |
| ++countASCII; |
| } else { |
| variantASCII=1; |
| } |
| } else if(u<0x20 || u==0x7f) { |
| /* non-ISO C0 controls? */ |
| if(u!=b) { |
| /* IBM PC rotation of SUB and other controls: 0x1a->0x7f->0x1c->0x1a */ |
| if(u==0x1a && b==0x7f || u==0x1c && b==0x1a || u==0x7f && b==0x1c) { |
| charsetFamily=ASCII; |
| variantSUB=1; |
| } else { |
| variantControls=1; |
| } |
| } |
| } |
| } |
| |
| is7Bit= oredBytes<=0x7f; |
| |
| if(charsetFamily==UNKNOWN) { |
| if(minCharLength==2 && maxCharLength==2) { |
| /* guess the charset family for DBCS according to typical byte distributions */ |
| if( ((0x2020<=minTwoByte || minTwoByte<=0x217e) && maxTwoByte<=0x7e7e) || |
| ((0xa0a0<=minTwoByte || minTwoByte<=0xa1fe) && maxTwoByte<=0xfefe) || |
| ((0x8140<=minTwoByte || minTwoByte<=0x81fe) && maxTwoByte<=0xfefe) |
| ) { |
| charsetFamily=ASCII; |
| } else if((minTwoByte==0x4040 || (0x4141<=minTwoByte && minTwoByte<=0x41fe)) && maxTwoByte<=0xfefe) { |
| charsetFamily=EBCDIC; |
| } |
| } |
| if(charsetFamily==UNKNOWN) { |
| fprintf(stderr, "error: unable to determine the charset family\n"); |
| exit(3); |
| } |
| } |
| |
| /* reset variant indicators if they do not apply */ |
| if(charsetFamily!=ASCII || minCharLength!=1) { |
| variantASCII=variantSUB=variantControls=0; |
| } else if(countASCII!=94) { |
| /* if there are not 94 mappings for ASCII graphic characters, then set variantASCII */ |
| variantASCII=1; |
| } |
| |
| if(charsetFamily!=EBCDIC || minCharLength!=1) { |
| variantLF=0; |
| } |
| } |
| |
| static int |
| getSubchar(const char *name) { |
| int i; |
| |
| for(i=0; i<sizeof(knownSubchars)/sizeof(knownSubchars[0]); ++i) { |
| if(strcmp(name, knownSubchars[i].name)==0) { |
| subchar=knownSubchars[i].subchar; |
| subchar1=knownSubchars[i].subchar1; |
| return 1; |
| } |
| } |
| |
| return 0; |
| } |
| |
| static void |
| getSubcharFromUPMAP(FILE *f) { |
| char line[200]; |
| char *s, *end; |
| unsigned long *p; |
| unsigned long value, bytes; |
| |
| while(fgets(line, sizeof(line), f)!=NULL && memcmp(line, "CHARMAP", 7)!=0) { |
| s=(char *)skipWhitespace(line); |
| |
| /* skip empty lines */ |
| if(*s==0 || *s=='\n' || *s=='\r') { |
| continue; |
| } |
| |
| /* look for variations of subchar entries */ |
| if(memcmp(s, "<subchar>", 9)==0) { |
| s=(char *)skipWhitespace(s+9); |
| p=&subchar; |
| } else if(memcmp(s, "<subchar1>", 10)==0) { |
| s=(char *)skipWhitespace(s+10); |
| p=&subchar1; |
| } else if(memcmp(s, "#<subchar1>", 11)==0) { |
| s=(char *)skipWhitespace(s+11); |
| p=&subchar1; |
| } else { |
| continue; |
| } |
| |
| /* get the value and store it in *p */ |
| bytes=0; |
| while(s[0]=='\\' && s[1]=='x') { |
| value=strtoul(s+2, &end, 16); |
| s+=4; |
| if(end!=s) { |
| fprintf(stderr, "error parsing UPMAP subchar from \"%s\"\n", line); |
| exit(2); |
| } |
| bytes=(bytes<<8)|value; |
| } |
| *p=bytes; |
| } |
| } |
| |
| static const char * |
| getStateTable() { |
| int i; |
| |
| for(i=0; i<sizeof(knownStateTables)/sizeof(knownStateTables[0]); ++i) { |
| if(ccsid==knownStateTables[i].ccsid) { |
| return knownStateTables[i].table; |
| } |
| } |
| |
| return NULL; |
| } |
| |
/*
 * Format the codepage bytes b into s as a NUL-terminated sequence of
 * \xHH escapes with uppercase hex digits, most significant byte first.
 * One to four escapes are written, depending on the magnitude of b;
 * s must have room for the result.
 */
static void
writeBytes(char *s, unsigned long b) {
    int shift;

    /* bit position of the leading byte */
    if(b<=0xff) {
        shift=0;
    } else if(b<=0xffff) {
        shift=8;
    } else if(b<=0xffffff) {
        shift=16;
    } else {
        shift=24;
    }

    /* the leading byte is everything above the lower bytes and is not masked */
    s+=sprintf(s, "\\x%02lX", b>>shift);

    /* the remaining bytes */
    while(shift>0) {
        shift-=8;
        s+=sprintf(s, "\\x%02lX", (b>>shift)&0xff);
    }
}
| |
| static void |
| writeUCM(FILE *f, const char *ucmname, const char *rpname, const char *tpname) { |
| char buffer[100]; |
| const char *s; |
| long i; |
| |
| /* write the header */ |
| fprintf(f, |
| "# *******************************************************************************\n" |
| "# *\n" |
| "# * Copyright (C) 1995-2001, International Business Machines\n" |
| "# * Corporation and others. All Rights Reserved.\n" |
| "# *\n" |
| "# *******************************************************************************\n" |
| "#\n" |
| "# File created by rptp2ucm (compiled on %s)\n" |
| "# from source files %s and %s\n" |
| "#\n", __DATE__, rpname, tpname); |
| |
| /* ucmname does not have a path or .ucm */ |
| fprintf(f, "<code_set_name> \"%s\"\n", ucmname); |
| |
| fputs("<char_name_mask> \"AXXXX\"\n", f); |
| fprintf(f, "<mb_cur_max> %u\n", maxCharLength); |
| fprintf(f, "<mb_cur_min> %u\n", minCharLength); |
| |
| if(maxCharLength==1) { |
| fputs("<uconv_class> \"SBCS\"\n", f); |
| } else if(maxCharLength==2) { |
| if(minCharLength==1) { |
| if(charsetFamily==EBCDIC) { |
| fputs("<uconv_class> \"EBCDIC_STATEFUL\"\n", f); |
| } else { |
| fputs("<uconv_class> \"MBCS\"\n", f); |
| } |
| } else if(minCharLength==2) { |
| fputs("<uconv_class> \"DBCS\"\n", f); |
| } else { |
| fputs("<uconv_class> \"MBCS\"\n", f); |
| } |
| } else { |
| fputs("<uconv_class> \"MBCS\"\n", f); |
| } |
| |
| if(subchar!=0) { |
| writeBytes(buffer, subchar); |
| fprintf(f, "<subchar> %s\n", buffer); |
| } |
| |
| if(subchar1!=0) { |
| fprintf(f, "<subchar1> \\x%02X\n", subchar1); |
| } |
| |
| /* write charset family */ |
| if(charsetFamily==ASCII) { |
| fputs("<icu:charsetFamily> \"ASCII\"\n", f); |
| } else { |
| fputs("<icu:charsetFamily> \"EBCDIC\"\n", f); |
| } |
| |
| /* write alias describing the codepage */ |
| sprintf(buffer, "<icu:alias> \"ibm-%u", ccsid); |
| if(!usesPUA && !variantLF && !variantASCII && !variantControls && !variantSUB) { |
| strcat(buffer, "_STD\"\n\n"); |
| } else { |
| /* add variant indicators in alphabetic order */ |
| if(variantASCII) { |
| strcat(buffer, "_VASCII"); |
| } |
| if(variantControls) { |
| strcat(buffer, "_VGCTRL"); |
| } |
| if(variantLF) { |
| strcat(buffer, "_VLF"); |
| } |
| if(variantSUB) { |
| strcat(buffer, "_VSUB"); |
| } |
| if(usesPUA) { |
| strcat(buffer, "_VPUA"); |
| } |
| strcat(buffer, "\"\n\n"); |
| } |
| fputs(buffer, f); |
| |
| /* write the state table - <icu:state> */ |
| s=getStateTable(); |
| if(s!=NULL) { |
| fputs(s, f); |
| fputs("\n", f); |
| } else if(is7Bit) { |
| fputs("<icu:state> 0-7f\n\n", f); |
| } |
| |
| /* write the mappings */ |
| fputs("CHARMAP\n", f); |
| for(i=0; i<fromUMappingsTop; ++i) { |
| writeBytes(buffer, fromUMappings[i].b); |
| fprintf(f, "<U%04lX> %s |%lu\n", fromUMappings[i].u&0xffffff, buffer, fromUMappings[i].u>>24); |
| } |
| fputs("END CHARMAP\n", f); |
| } |
| |
| static void |
| processTable(const char *arg) { |
| char filename[1024], tpname[32]; |
| const char *basename, *s; |
| FILE *rpmap, *tpmap, *ucm; |
| unsigned long value, unicode; |
| int length; |
| |
| init(); |
| |
| /* separate path and basename */ |
| basename=strrchr(arg, '/'); |
| if(basename==NULL) { |
| basename=strrchr(arg, '\\'); |
| if(basename==NULL) { |
| basename=arg; |
| } else { |
| ++basename; |
| } |
| } else { |
| ++basename; |
| s=strrchr(arg, '\\'); |
| if(s!=NULL && ++s>basename) { |
| basename=s; |
| } |
| } |
| |
| /* is this a standard RPMAP filename? */ |
| value=strtoul(basename, (char **)&s, 16); |
| if( strlen(basename)!=17 || |
| (memcmp(basename+9, "RPMAP", 5)!=0 && memcmp(basename+9, "rpmap", 5)!=0 && |
| memcmp(basename+9, "RXMAP", 5)!=0 && memcmp(basename+9, "rxmap", 5)!=0) || |
| (s-basename)!=8 || |
| *s!='.' |
| ) { |
| fprintf(stderr, "error: \"%s\" is not a standard RPMAP filename\n", basename); |
| exit(1); |
| } |
| |
| /* is this really a Unicode conversion table? - get the CCSID */ |
| unicode=value&0xffff; |
| if(unicode==13488 || unicode==17584) { |
| ccsid=(unsigned int)(value>>16); |
| } else { |
| unicode=value>>16; |
| if(unicode==13488 || unicode==17584) { |
| ccsid=(unsigned int)(value&0xffff); |
| } else { |
| fprintf(stderr, "error: \"%s\" is not a Unicode conversion table\n", basename); |
| exit(1); |
| } |
| } |
| |
| /* try to open the RPMAP file */ |
| rpmap=fopen(arg, "r"); |
| if(rpmap==NULL) { |
| fprintf(stderr, "error: unable to open \"%s\"\n", arg); |
| exit(1); |
| } |
| |
| /* try to open the TPMAP file */ |
| strcpy(filename, arg); |
| length=strlen(filename); |
| |
| /* guess the TPMAP filename; note that above we have checked the format of the basename */ |
| /* replace the R in RPMAP by T, keep upper- or lowercase */ |
| if(filename[length-8]=='R') { |
| filename[length-8]='T'; |
| } else { |
| filename[length-8]='t'; |
| } |
| |
| /* reverse the CCSIDs */ |
| memcpy(filename+length-17, basename+4, 4); |
| memcpy(filename+length-13, basename, 4); |
| |
| /* first, keep the same suffix */ |
| tpmap=fopen(filename, "r"); |
| if(tpmap==NULL) { |
| /* next, try reducing the second to last digit by 1 */ |
| --filename[length-2]; |
| tpmap=fopen(filename, "r"); |
| if(tpmap==NULL) { |
| /* there is no TPMAP */ |
| fprintf(stderr, "error: unable to find the TPMAP file for \"%s\"\n", arg); |
| exit(1); |
| } |
| } |
| strcpy(tpname, filename+length-17); |
| |
| /* parse both files */ |
| fromUMappingsTop=parseMappings(rpmap, fromUMappings); |
| toUMappingsTop=parseMappings(tpmap, toUMappings); |
| fclose(tpmap); |
| fclose(rpmap); |
| |
| /* if there is no subchar, then try to get it from the corresponding UPMAP */ |
| if(subchar==0) { |
| FILE *f; |
| |
| /* restore the RPMAP filename and just replace the R by U */ |
| strcpy(filename+length-17, basename); |
| if(filename[length-8]=='R') { |
| filename[length-8]='U'; |
| } else { |
| filename[length-8]='u'; |
| } |
| |
| f=fopen(filename, "r"); |
| if(f==NULL) { |
| /* try reversing the CCSIDs */ |
| memcpy(filename+length-17, basename+4, 4); |
| memcpy(filename+length-13, basename, 4); |
| f=fopen(filename, "r"); |
| } |
| if(f!=NULL) { |
| getSubcharFromUPMAP(f); |
| fclose(f); |
| } |
| } |
| |
| /* generate the .ucm filename - necessary before getSubchar() */ |
| length=sprintf(filename, "ibm-%u_", ccsid); |
| |
| /* uppercase and append the suffix */ |
| filename[length++]=toupper(basename[10]); /* P or X */ |
| filename[length++]=toupper(basename[14]); /* last 3 suffix characters */ |
| filename[length++]=toupper(basename[15]); |
| filename[length++]=toupper(basename[16]); |
| filename[length++]='-'; |
| filename[length]=0; |
| /*concatenate year*/ |
| strcat(filename,YEAR); |
| /* find the subchar if still necessary - necessary before merging for correct |2 */ |
| if(subchar==0 && !getSubchar(filename+4)) { |
| fprintf(stderr, "warning: missing subchar in \"%s\" (CCSID=0x%04X)\n", filename, ccsid); |
| } |
| |
| /* merge the mappings */ |
| mergeMappings(); |
| |
| /* analyze the conversion table */ |
| analyzeTable(); |
| |
| /* open the .ucm file */ |
| strcat(filename, ".ucm"); |
| ucm=fopen(filename, "w"); |
| if(ucm==NULL) { |
| fprintf(stderr, "error: unable to open output file \"%s\"\n", filename); |
| exit(4); |
| } |
| |
| /* remove the .ucm from the filename for the following processing */ |
| filename[strlen(filename)-4]=0; |
| |
| /* write the .ucm file */ |
| writeUCM(ucm, filename, basename, tpname); |
| fclose(ucm); |
| } |
| |
/*
 * Program entry point: process each command-line argument as the filename
 * of an RPMAP/RXMAP table. Prints a usage message to stderr and exits
 * with code 1 if no filename is given; returns 0 otherwise.
 */
extern int
main(int argc, const char *argv[]) {
    int i;

    if(argc>=2) {
        for(i=1; i<argc; ++i) {
            processTable(argv[i]);
        }
        return 0;
    }

    fprintf(stderr,
        "usage: %s { rpmap/rxmap-filename }+\n",
        argv[0]);
    exit(1);
}