|  | /* | 
|  | ******************************************************************************* | 
|  | * | 
|  | *   Copyright (C) 2000, International Business Machines | 
|  | *   Corporation and others.  All Rights Reserved. | 
|  | * | 
|  | ******************************************************************************* | 
|  | *   file name:  canonucm.c | 
|  | *   encoding:   US-ASCII | 
|  | *   tab size:   8 (not used) | 
|  | *   indentation:4 | 
|  | * | 
|  | *   created on: 2000nov08 | 
|  | *   created by: Markus W. Scherer | 
|  | * | 
|  | *   This tool reads a .ucm file and canonicalizes it: In the CHARMAP section, | 
|  | *   - sort by Unicode code points | 
|  | *   - print all code points in uppercase hexadecimal | 
|  | *   - print all Unicode code points with 4, 5, or 6 digits as needed | 
|  | *   - remove the comments | 
|  | *   - remove unnecessary spaces | 
|  | * | 
|  | *   To compile, just call a C compiler/linker with this source file. | 
|  | *   On Windows: cl canonucm.c | 
|  | */ | 
|  |  | 
|  | #include <stdio.h> | 
|  | #include <stdlib.h> | 
|  | #include <string.h> | 
|  |  | 
/*
 * One mapping line from the CHARMAP section.
 * All three fields are filled in by the parser in main().
 */
typedef struct Mapping {
    unsigned long u;    /* Unicode code point */
    unsigned long b;    /* codepage bytes, packed into one integer with the first byte most significant */
    unsigned long f;    /* fallback indicator digit following the '|' */
} Mapping;

/* fixed-size storage for all parsed mappings; main() checks the capacity before each store */
static Mapping
mappings[200000];

/*
 * qsort() comparator: order Mappings by Unicode code point, then by codepage
 * bytes, then by fallback indicator.
 *
 * Returns a negative value, zero, or a positive value if *left sorts before,
 * equal to, or after *right.
 *
 * The fields are unsigned, so they are compared directly instead of being
 * subtracted: a wrapped unsigned difference converted to a signed type gets
 * the wrong sign as soon as two values differ by more than LONG_MAX (for
 * example 4-byte codepage values where long has 32 bits), and narrowing it to
 * int can lose the sign again.
 */
static int
compareMappings(const void *left, const void *right) {
    const Mapping *l=(const Mapping *)left, *r=(const Mapping *)right;

    if(l->u!=r->u) {
        return l->u<r->u ? -1 : 1;
    }
    if(l->b!=r->b) {
        return l->b<r->b ? -1 : 1;
    }
    if(l->f!=r->f) {
        return l->f<r->f ? -1 : 1;
    }
    return 0;
}
|  |  | 
|  | extern int | 
|  | main(int argc, const char *argv[]) { | 
|  | char line[200]; | 
|  | char *s, *end; | 
|  | unsigned long b, i, mappingsTop=0; | 
|  |  | 
|  | /* parse the input file from stdin */ | 
|  | /* read and copy header */ | 
|  | do { | 
|  | if(gets(line)==NULL) { | 
|  | fprintf(stderr, "error: no mapping section"); | 
|  | return 1; | 
|  | } | 
|  | puts(line); | 
|  | } while(0!=strcmp(line, "CHARMAP")); | 
|  |  | 
|  | /* copy empty and comment lines before the first mapping */ | 
|  | for(;;) { | 
|  | if(gets(line)==NULL) { | 
|  | fprintf(stderr, "error: no mappings"); | 
|  | return 1; | 
|  | } | 
|  | if(line[0]!=0 && line[0]!='#') { | 
|  | break; | 
|  | } | 
|  | puts(line); | 
|  | } | 
|  |  | 
|  | /* process the charmap section, start with the line read above */ | 
|  | for(;;) { | 
|  | /* ignore empty and comment lines */ | 
|  | if(line[0]!=0 && line[0]!='#') { | 
|  | if(0!=strcmp(line, "END CHARMAP")) { | 
|  | if(mappingsTop==sizeof(mappings)/sizeof(mappings[0])) { | 
|  | fprintf(stderr, "too many mappings\n"); | 
|  | return 1; | 
|  | } | 
|  | /* parse mapping */ | 
|  | if(line[0]!='<' || line[1]!='U') { | 
|  | fprintf(stderr, "parse error (does not start with \"<U\") in mapping line \"%s\"\n", line); | 
|  | return 1; | 
|  | } | 
|  | /* parse Unicode code point */ | 
|  | mappings[mappingsTop].u=strtoul(line+2, &end, 16); | 
|  | if(end==line+2 || mappings[mappingsTop].u>0x10ffff || *end!='>') { | 
|  | fprintf(stderr, "parse error (Unicode code point) in mapping line \"%s\"\n", line); | 
|  | return 1; | 
|  | } | 
|  | /* skip white space */ | 
|  | s=end+1; | 
|  | while(*s==' ' || *s=='\t') { | 
|  | ++s; | 
|  | } | 
|  | /* parse codepage bytes */ | 
|  | b=0; | 
|  | for(;;) { | 
|  | if(*s!='\\') { | 
|  | break; | 
|  | } | 
|  | if(s[1]!='x') { | 
|  | fprintf(stderr, "parse error (no 'x' in \"\\xXX\") in mapping line \"%s\"\n", line); | 
|  | return 1; | 
|  | } | 
|  | s+=2; | 
|  | b=(b<<8)|strtoul(s, &end, 16); | 
|  | if(end!=s+2) { | 
|  | fprintf(stderr, "parse error (codepage byte) in mapping line \"%s\"\n", line); | 
|  | return 1; | 
|  | } | 
|  | s+=2; | 
|  | } | 
|  | mappings[mappingsTop].b=b; | 
|  | /* skip everything until the fallback indicator */ | 
|  | while(*s!='|') { | 
|  | if(*s==0) { | 
|  | fprintf(stderr, "parse error (missing '|' fallback indicator) in mapping line \"%s\"\n", line); | 
|  | return 1; | 
|  | } | 
|  | ++s; | 
|  | } | 
|  | /* parse fallback indicator */ | 
|  | i=s[1]-'0'; | 
|  | if(i>3) { | 
|  | fprintf(stderr, "parse error (fallback indicator not 0..3) in mapping line \"%s\"\n", line); | 
|  | return 1; | 
|  | } | 
|  | mappings[mappingsTop++].f=i; | 
|  | } else { | 
|  | /* sort and write all mappings */ | 
|  | if(mappingsTop>0) { | 
|  | qsort(mappings, mappingsTop, sizeof(Mapping), compareMappings); | 
|  | for(i=0; i<mappingsTop; ++i) { | 
|  | b=mappings[i].b; | 
|  | if(b<=0xff) { | 
|  | printf("<U%04lX> \\x%02lX |%lu\n", mappings[i].u, b, mappings[i].f); | 
|  | } else if(b<=0xffff) { | 
|  | printf("<U%04lX> \\x%02lX\\x%02lX |%lu\n", mappings[i].u, b>>8, b&0xff, mappings[i].f); | 
|  | } else if(b<=0xffffff) { | 
|  | printf("<U%04lX> \\x%02lX\\x%02lX\\x%02lX |%lu\n", mappings[i].u, b>>16, (b>>8)&0xff, b&0xff, mappings[i].f); | 
|  | } else { | 
|  | printf("<U%04lX> \\x%02lX\\x%02lX\\x%02lX\\x%02lX |%lu\n", mappings[i].u, b>>24, (b>>16)&0xff, (b>>8)&0xff, b&0xff, mappings[i].f); | 
|  | } | 
|  | } | 
|  | } | 
|  | /* output "END CHARMAP" */ | 
|  | puts(line); | 
|  | return 0; | 
|  | } | 
|  | } | 
|  | /* read the next line */ | 
|  | if(gets(line)==NULL) { | 
|  | fprintf(stderr, "incomplete charmap section\n"); | 
|  | return 1; | 
|  | } | 
|  | } | 
|  | } |