/* trunk/unicode/c/genprops/misc/ucdmerge.c - external/github.com/unicode-org/icu - Git at Google */

 /*
 *******************************************************************************
 *
 *   Copyright (C) 2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  ucdmerge.c
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2003feb20
 *   created by: Markus W. Scherer
 *
 *   Simple tool for Unicode Character Database files with semicolon-delimited fields.
 *   Merges adjacent, identical per-code point data lines into one line with range syntax.
 *
 *   To compile, just call a C compiler/linker with this source file.
 *   On Windows: cl ucdmerge.c
 */

 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>

 /*
  * Advance past leading spaces and tabs.
  * Returns a pointer to the first character of s that is neither ' ' nor '\t'
  * (this may be the terminating NUL).
  */
 static const char *
 skipWhitespace(const char *s) {
     for(; *s==' ' || *s=='\t'; ++s) {
         /* nothing to do, the loop header moves the cursor */
     }
     return s;
 }

 /*
  * Locate the end of the data portion of a line.
  * The data ends where a '#' comment begins, not counting the spaces and tabs
  * directly in front of the '#'; without a comment it ends at the terminating NUL.
  * Returns the first character position after the end of the data.
  */
 static char *
 endOfData(const char *l) {
     char *hash=strchr(l, '#');

     if(hash==NULL) {
         /* no comment: the data runs to the end of the string */
         return strchr(l, 0);
     }

     /* back up over whitespace in front of the comment, but never before l */
     while(hash!=l && (hash[-1]==' ' || hash[-1]=='\t')) {
         --hash;
     }
     return hash;
 }

 /*
  * Compare the data portions of two semicolon-delimited lines.
  * The data portion starts after the first ';' and ends where endOfData()
  * reports (before a '#' comment and the whitespace in front of it, or at the
  * end of the string).
  * Returns nonzero if both data portions are byte-for-byte identical, else 0.
  * A line without any ';' has no data portion and never matches.
  */
 static int
 sameData(const char *l1, const char *l2) {
     const char *end1, *end2;
     size_t length1, length2;

     /* find the first semicolon in each line */
     l1=strchr(l1, ';');
     l2=strchr(l2, ';');
     if(l1==NULL || l2==NULL) {
         /* never do arithmetic on a NULL result from strchr() */
         return 0;
     }
     ++l1;
     ++l2;

     /* find the end of data: end of string or start of comment */
     end1=endOfData(l1);
     end2=endOfData(l2);

     /* compare the line data portions; the ends never precede the starts */
     length1=(size_t)(end1-l1);
     length2=(size_t)(end2-l2);
     return length1==length2 && 0==memcmp(l1, l2, length1);
 }

 /*
  * Filter stdin to stdout, merging runs of adjacent per-code point lines.
  *
  * A data line starts with a hexadecimal code point, optionally followed by
  * spaces/tabs, then a ';'. Consecutive data lines whose code points increase
  * by exactly 1 and whose data portions are equal according to sameData() are
  * written as one line "first..last;data". All other lines are copied as is.
  *
  * Comments in a merged range:
  * - last line has no '#': the first line's text from its ';' on is used
  * - only the last line has a '#': its comment is appended to the first line's text
  * - both have one: the last line's comment text is appended after ".."
  *
  * Returns 0 at end of input, or 1 if an input line does not fit the buffer.
  * argc and argv are not used.
  */
 extern int
 main(int argc, const char *argv[]) {
     static char line[2000], firstLine[2000], lastLine[2000];
     char *end;
     size_t length;
     long first, last, c;
     int finished;

     (void)argc;
     (void)argv;

     first=last=-1;
     finished=0;

     for(;;) {
         /* fgets() is bounded by the buffer size; gets() was not and is gone from C11 */
         if(fgets(line, (int)sizeof(line), stdin)!=NULL) {
             /* fgets() keeps the line terminator that gets() used to drop */
             length=strlen(line);
             if(length>0 && line[length-1]=='\n') {
                 line[length-1]=0;
             } else if(length==sizeof(line)-1) {
                 /* buffer filled without reaching a newline: refuse to split the line */
                 fprintf(stderr,
                         "ucdmerge: input line too long (limit is %lu characters)\n",
                         (unsigned long)(sizeof(line)-2));
                 return 1;
             }

             /* parse the initial code point, if any */
             c=strtol(line, &end, 16);
             if(end!=line && *skipWhitespace(end)==';') {
                 /* single code point followed by semicolon and data, keep c */
             } else {
                 c=-1;
             }
         } else {
             /* end of input (or read error): flush any pending range, then stop */
             line[0]=0;
             c=-1;
             finished=1;
         }

         /* sameData() is only reached when c==last+1, so line has a ';' then */
         if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
             /* output the current range */
             if(first==last) {
                 /* there was no range, just output the one line we found */
                 puts(firstLine);
             } else {
                 /* there was a real range, merge their lines */
                 end=strchr(lastLine, '#');
                 if(end==NULL) {
                     /* no comment in second line */
                     printf("%04lX..%04lX%s\n",
                             first, last,            /* code point range */
                             strchr(firstLine, ';'));/* first line starting from the first ; */
                 } else if(strchr(firstLine, '#')==NULL) {
                     /* no comment in first line */
                     printf("%04lX..%04lX%s%s\n",
                             first, last,            /* code point range */
                             strchr(firstLine, ';'), /* first line starting from the first ; */
                             end);                   /* comment from second line */
                 } else {
                     /* merge comments from both lines */
                     printf("%04lX..%04lX%s..%s\n",
                             first, last,            /* code point range */
                             strchr(firstLine, ';'), /* first line starting from the first ; */
                             skipWhitespace(end+1)); /* comment from second line, after # and spaces */
                 }
             }
             first=last=-1;
         }

         if(c<0) {
             if(finished) {
                 break;
             }

             /* no data on this line, output as is */
             puts(line);
         } else {
             /* data on this line, store for possible range compaction */
             if(last<0) {
                 /* set as the first line in a possible range */
                 first=last=c;
                 strcpy(firstLine, line);
                 lastLine[0]=0;
             } else /* must be c==(last+1) && sameData() because of previous conditions */ {
                 /* continue with the current range */
                 last=c;
                 strcpy(lastLine, line);
             }
         }
     }

     return 0;
 }
	/*
	*******************************************************************************
	*
	* Copyright (C) 2003, International Business Machines
	* Corporation and others. All Rights Reserved.
	*
	*******************************************************************************
	* file name: ucdmerge.c
	* encoding: US-ASCII
	* tab size: 8 (not used)
	* indentation:4
	*
	* created on: 2003feb20
	* created by: Markus W. Scherer
	*
	* Simple tool for Unicode Character Database files with semicolon-delimited fields.
	* Merges adjacent, identical per-code point data lines into one line with range syntax.
	*
	* To compile, just call a C compiler/linker with this source file.
	* On Windows: cl ucdmerge.c
	*/

	#include <stdio.h>
	#include <string.h>
	#include <stdlib.h>

	/*
	 * Advance past leading spaces and tabs.
	 * Returns a pointer to the first character of s that is neither ' ' nor '\t'
	 * (this may be the terminating NUL).
	 */
	static const char *
	skipWhitespace(const char *s) {
	    /* test the character s points at, not the pointer itself */
	    while(*s==' ' || *s=='\t') {
	        ++s;
	    }
	    return s;
	}

	/*
	 * Locate the end of the data portion of a line.
	 * The data ends where a '#' comment begins, not counting the spaces and tabs
	 * directly in front of the '#'; without a comment it ends at the terminating NUL.
	 * Returns the first character position after the end of the data.
	 */
	static char *
	endOfData(const char *l) {
	    char *end;
	    char c;

	    end=strchr(l, '#');
	    if(end!=NULL) {
	        /* ignore whitespace before the comment, but never move before l */
	        while(l!=end && ((c=*(end-1))==' ' || c=='\t')) {
	            --end;
	        }
	    } else {
	        /* no comment: the data runs to the end of the string */
	        end=strchr(l, 0);
	    }
	    return end;
	}

	/*
	 * Compare the data portions of two semicolon-delimited lines.
	 * The data portion starts after the first ';' and ends where endOfData()
	 * reports (before a '#' comment and the whitespace in front of it, or at the
	 * end of the string).
	 * Returns nonzero if both data portions are byte-for-byte identical, else 0.
	 * A line without any ';' has no data portion and never matches.
	 */
	static int
	sameData(const char *l1, const char *l2) {
	    const char *end1, *end2;
	    size_t length1, length2;

	    /* find the first semicolon in each line */
	    l1=strchr(l1, ';');
	    l2=strchr(l2, ';');
	    if(l1==NULL || l2==NULL) {
	        /* never do arithmetic on a NULL result from strchr() */
	        return 0;
	    }
	    ++l1;
	    ++l2;

	    /* find the end of data: end of string or start of comment */
	    end1=endOfData(l1);
	    end2=endOfData(l2);

	    /* compare the line data portions; the ends never precede the starts */
	    length1=(size_t)(end1-l1);
	    length2=(size_t)(end2-l2);
	    return length1==length2 && 0==memcmp(l1, l2, length1);
	}

	/*
	 * Filter stdin to stdout, merging runs of adjacent per-code point lines.
	 *
	 * A data line starts with a hexadecimal code point, optionally followed by
	 * spaces/tabs, then a ';'. Consecutive data lines whose code points increase
	 * by exactly 1 and whose data portions are equal according to sameData() are
	 * written as one line "first..last;data". All other lines are copied as is.
	 *
	 * Comments in a merged range:
	 * - last line has no '#': the first line's text from its ';' on is used
	 * - only the last line has a '#': its comment is appended to the first line's text
	 * - both have one: the last line's comment text is appended after ".."
	 *
	 * Returns 0 at end of input, or 1 if an input line does not fit the buffer.
	 * argc and argv are not used.
	 */
	extern int
	main(int argc, const char *argv[]) {
	    static char line[2000], firstLine[2000], lastLine[2000];
	    char *end;
	    size_t length;
	    long first, last, c;
	    int finished;

	    (void)argc;
	    (void)argv;

	    first=last=-1;
	    finished=0;

	    for(;;) {
	        /* fgets() is bounded by the buffer size; gets() was not and is gone from C11 */
	        if(fgets(line, (int)sizeof(line), stdin)!=NULL) {
	            /* fgets() keeps the line terminator that gets() used to drop */
	            length=strlen(line);
	            if(length>0 && line[length-1]=='\n') {
	                line[length-1]=0;
	            } else if(length==sizeof(line)-1) {
	                /* buffer filled without reaching a newline: refuse to split the line */
	                fprintf(stderr,
	                        "ucdmerge: input line too long (limit is %lu characters)\n",
	                        (unsigned long)(sizeof(line)-2));
	                return 1;
	            }

	            /* parse the initial code point, if any */
	            c=strtol(line, &end, 16);
	            if(end!=line && *skipWhitespace(end)==';') {
	                /* single code point followed by semicolon and data, keep c */
	            } else {
	                c=-1;
	            }
	        } else {
	            /* end of input (or read error): flush any pending range, then stop */
	            line[0]=0;
	            c=-1;
	            finished=1;
	        }

	        /* sameData() is only reached when c==last+1, so line has a ';' then */
	        if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
	            /* output the current range */
	            if(first==last) {
	                /* there was no range, just output the one line we found */
	                puts(firstLine);
	            } else {
	                /* there was a real range, merge their lines */
	                end=strchr(lastLine, '#');
	                if(end==NULL) {
	                    /* no comment in second line */
	                    printf("%04lX..%04lX%s\n",
	                            first, last,            /* code point range */
	                            strchr(firstLine, ';'));/* first line starting from the first ; */
	                } else if(strchr(firstLine, '#')==NULL) {
	                    /* no comment in first line */
	                    printf("%04lX..%04lX%s%s\n",
	                            first, last,            /* code point range */
	                            strchr(firstLine, ';'), /* first line starting from the first ; */
	                            end);                   /* comment from second line */
	                } else {
	                    /* merge comments from both lines */
	                    printf("%04lX..%04lX%s..%s\n",
	                            first, last,            /* code point range */
	                            strchr(firstLine, ';'), /* first line starting from the first ; */
	                            skipWhitespace(end+1)); /* comment from second line, after # and spaces */
	                }
	            }
	            first=last=-1;
	        }

	        if(c<0) {
	            if(finished) {
	                break;
	            }

	            /* no data on this line, output as is */
	            puts(line);
	        } else {
	            /* data on this line, store for possible range compaction */
	            if(last<0) {
	                /* set as the first line in a possible range */
	                first=last=c;
	                strcpy(firstLine, line);
	                lastLine[0]=0;
	            } else /* must be c==(last+1) && sameData() because of previous conditions */ {
	                /* continue with the current range */
	                last=c;
	                strcpy(lastLine, line);
	            }
	        }
	    }

	    return 0;
	}