| /* |
| ********************************************************************** |
| * Copyright (C) 2002, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * file name: utfperf.c |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2002apr17 |
| * created by: Markus W. Scherer |
| * |
| * Performance test program for Unicode converters |
| * (converters that support all Unicode code points). |
| * Takes a UTF-8 file as input. |
| */ |
| |
#include <stdio.h>
#include <string.h>

#include <fcntl.h> /* for _O_BINARY */
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#   include <io.h> /* for _setmode(); this header only exists on Windows */
#endif
| |
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) |
| # include <windows.h> |
| #else |
| # include <sys/time.h> |
/*
 * Non-Windows replacement for the timeGetTime() that the Windows build
 * gets from <windows.h>.
 *
 * Returns the current wall-clock time in milliseconds, derived from
 * gettimeofday(). The arithmetic is done in unsigned long so that a
 * narrow long wraps around instead of overflowing; callers only use the
 * difference between two readings.
 */
static unsigned long
timeGetTime(void) {
    struct timeval t;

    gettimeofday(&t, NULL);
    return (unsigned long)t.tv_sec*1000UL+(unsigned long)t.tv_usec/1000UL;
}
| #endif |
| |
| #include "unicode/utypes.h" |
| #include "unicode/ucnv.h" |
| #include "unicode/ustring.h" |
| |
| /* definitions and text buffers */ |
| |
/* capacities of input[]/output[] (in UChars) and of intermediate[] (in bytes) */
#define INPUT_CAPACITY (1024*1024)
#define INTERMEDIATE_CAPACITY 4096
#define INTERMEDIATE_SMALL_CAPACITY 20
#define OUTPUT_CAPACITY INPUT_CAPACITY

/* the loop count is scaled so that one measurement runs about this long (milliseconds) */
#define TARGET_MEASURE_TIME_MS 2000

/*
 * a as a percentage of b, rounded to the nearest integer (half rounds up).
 * Computes (200a+b)/(2b), which is 100a/b + 1/2 before truncation.
 * Works for integer and floating-point operands; b must be positive
 * and is evaluated twice.
 */
#define PERCENT(a, b) ((int)(((a)*200+(b))/(2*(b))))

/* number of elements of a true array (not of a pointer) */
#define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0]))
| |
| static UChar input[INPUT_CAPACITY], output[OUTPUT_CAPACITY]; |
| static char intermediate[INTERMEDIATE_CAPACITY]; |
| |
| static int32_t inputLength, encodedLength, outputLength, countInputCodePoints; |
| |
| static int32_t utf8Length=0; |
| static double utf8Time=0.; |
| |
| static const char *const |
| utfNames[]={ |
| "UTF-8", /* UTF-8 should always be first to serve as percentage reference */ |
| "SCSU", "BOCU-1" /*, "CESU-8" *//*, "UTF-16BE", "UTF-16LE"*//*, "GB18030"*/ |
| }; |
| |
| /* functions */ |
| |
| typedef void |
| RoundtripFn(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode); |
| |
| static void |
| roundtrip(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) { |
| const UChar *pIn, *pInLimit; |
| UChar *pOut, *pOutLimit; |
| char *pInter, *pInterLimit, *p; |
| UBool flush; |
| |
| ucnv_reset(cnv); |
| |
| pIn=input; |
| pInLimit=input+inputLength; |
| |
| pOut=output; |
| pOutLimit=output+OUTPUT_CAPACITY; |
| |
| pInterLimit=intermediate+intermediateCapacity; |
| |
| encodedLength=outputLength=0; |
| flush=FALSE; |
| |
| while(pIn<pInLimit || !flush) { |
| /* convert a block of [pIn..pInLimit[ to the encoding in intermediate[] */ |
| pInter=intermediate; |
| flush=(UBool)(pIn==pInLimit); |
| ucnv_fromUnicode(cnv, |
| &pInter, pInterLimit, |
| &pIn, pInLimit, |
| NULL, flush, |
| pErrorCode); |
| encodedLength+=(int32_t)(pInter-intermediate); |
| |
| if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { |
| /* in case flush was TRUE make sure that we convert once more to really flush */ |
| flush=FALSE; |
| *pErrorCode=U_ZERO_ERROR; |
| } else if(U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| |
| /* convert the block [intermediate..pInter[ back to UTF-16 */ |
| p=intermediate; |
| ucnv_toUnicode(cnv, |
| &pOut, pOutLimit, |
| &p, pInter, |
| NULL, flush, |
| pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| /* intermediate must have been consumed (p==pInter) because of the converter semantics */ |
| } |
| |
| outputLength=pOut-output; |
| if(inputLength!=outputLength) { |
| fprintf(stderr, "error: roundtrip failed, inputLength %d!=outputLength %d\n", inputLength, outputLength); |
| *pErrorCode=U_INTERNAL_PROGRAM_ERROR; |
| } |
| } |
| |
| static void |
| noop(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) { |
| /* do nothing */ |
| } |
| |
| static unsigned long |
| measureRoundtrips(RoundtripFn *fn, UConverter *cnv, const char *encName, int32_t intermediateCapacity, int32_t n) { |
| unsigned long _time; |
| UErrorCode errorCode; |
| |
| _time=timeGetTime(); |
| errorCode=U_ZERO_ERROR; |
| do { |
| fn(cnv, intermediateCapacity, &errorCode); |
| } while(U_SUCCESS(errorCode) && --n>0); |
| _time=timeGetTime()-_time; |
| |
| if(U_FAILURE(errorCode)) { |
| fprintf(stderr, "error in roundtrip conversion (%s): %s\n", encName, u_errorName(errorCode)); |
| return 0x7fffffff; |
| } |
| |
| if(0!=u_memcmp(input, output, inputLength)) { |
| fprintf(stderr, "error: roundtrip failed, input[]!=output[]\n"); |
| return 0x7fffffff; |
| } |
| |
| return _time; |
| } |
| |
| static void |
| perEncAndCapacity(UConverter *cnv, const char *encName, int32_t intermediateCapacity) { |
| double rtTime; |
| unsigned long _time; |
| int32_t n; |
| |
| /*printf("test performance for %s with intermediate capacity %d\n", encName, intermediateCapacity);*/ |
| |
| /* warm up caches and estimate loop time */ |
| n=10; |
| for(;;) { |
| _time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n); |
| if(_time<500 && _time<TARGET_MEASURE_TIME_MS/10) { |
| n*=10; |
| } else { |
| break; |
| } |
| } |
| |
| if(_time<TARGET_MEASURE_TIME_MS) { |
| n=(n*TARGET_MEASURE_TIME_MS)/_time+1; |
| } |
| |
| /* run actual measurement with a target test time of 10s */ |
| _time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n); |
| |
| /* subtract same number of loops over no-operation function */ |
| _time-=measureRoundtrips(noop, cnv, encName, intermediateCapacity, n); |
| |
| rtTime=((double)_time*1000.)/(double)n; |
| |
| /* report */ |
| printf("* performance report for %8s:\n", encName); |
| printf(" intermediate buffer capacity %8d B\n", intermediateCapacity); |
| if(intermediateCapacity==INTERMEDIATE_CAPACITY && utf8Length!=0) { |
| printf(" number of encoding bytes %8d B (%3d%% of UTF-8)\n", encodedLength, PERCENT(encodedLength, utf8Length)); |
| printf(" roundtrip conversion time %8g μs (%3d%% of UTF-8)\n", rtTime, PERCENT(rtTime, utf8Time)); |
| } else { |
| printf(" number of encoding bytes %8d B\n", encodedLength); |
| printf(" roundtrip conversion time %8g μs\n", rtTime); |
| } |
| printf(" average bytes/code point %8g B/cp\n", (double)encodedLength/countInputCodePoints); |
| puts(""); |
| |
| /* set UTF-8 values */ |
| if(intermediateCapacity==INTERMEDIATE_CAPACITY && 0==strcmp(encName, "UTF-8")) { |
| utf8Length=encodedLength; |
| utf8Time=rtTime; |
| } |
| } |
| |
| static void |
| perEnc(UConverter *cnv, const char *encName) { |
| /*printf("test performance for %s\n", encName);*/ |
| perEncAndCapacity(cnv, encName, INTERMEDIATE_CAPACITY); |
| perEncAndCapacity(cnv, encName, INTERMEDIATE_SMALL_CAPACITY); |
| } |
| |
| static void |
| testPerformance() { |
| UConverter *cnv; |
| UErrorCode errorCode; |
| int32_t i; |
| |
| printf("number of code points %8d cp\n", countInputCodePoints); |
| printf("platform endianness: %8s-endian\n", U_IS_BIG_ENDIAN ? "big" : "little"); |
| puts(""); |
| for(i=0; i<ARRAY_LENGTH(utfNames); ++i) { |
| errorCode=U_ZERO_ERROR; |
| cnv=ucnv_open(utfNames[i], &errorCode); |
| if(U_SUCCESS(errorCode)) { |
| perEnc(cnv, utfNames[i]); |
| ucnv_close(cnv); |
| } else { |
| fprintf(stderr, "error opening converter for \"%s\" - %s\n", utfNames[i], u_errorName(errorCode)); |
| } |
| } |
| } |
| |
| /* read a complete block from the input file */ |
| static int32_t |
| readBlock(FILE *in) { |
| int length, blockLength; |
| |
| blockLength=0; |
| while(blockLength<INTERMEDIATE_CAPACITY && !feof(in)) { |
| length=fread(intermediate, 1, INTERMEDIATE_CAPACITY-blockLength, in); |
| if(length<0 || ferror(in)) { |
| return -1; |
| } |
| blockLength+=length; |
| } |
| |
| return (int32_t)blockLength; |
| } |
| |
| static UBool |
| readInput(FILE *in, const char *encName) { |
| UConverter *cnv; |
| UChar *pOut, *pOutLimit; |
| const char *p, *limit; |
| int32_t length; |
| UErrorCode errorCode; |
| |
| pOut=input; |
| pOutLimit=input+INPUT_CAPACITY; |
| |
| errorCode=U_ZERO_ERROR; |
| |
| /* read the first block and open the converter */ |
| length=readBlock(in); |
| if(length<0) { |
| return FALSE; |
| } |
| |
| if(encName==NULL) { |
| int32_t signatureLength; |
| encName=ucnv_detectUnicodeSignature(intermediate, length, |
| &signatureLength, |
| &errorCode); |
| if(U_FAILURE(errorCode) || encName==NULL) { |
| /* default to UTF-8 */ |
| printf("no Unicode signature - using UTF-8\n"); |
| encName="UTF-8"; |
| errorCode=U_ZERO_ERROR; |
| } else { |
| printf("detected signature for %s (removing %d bytes)\n", encName, signatureLength); |
| /* remove signature byte sequence */ |
| memmove(intermediate, intermediate+signatureLength, length-=signatureLength); |
| } |
| } |
| |
| cnv=ucnv_open(encName, &errorCode); |
| if(U_FAILURE(errorCode)) { |
| fprintf(stderr, "error: unable to ucnv_open(\"%s\") - %s\n", encName, u_errorName(errorCode)); |
| return FALSE; |
| } |
| |
| while(length>0) { |
| /* convert the block */ |
| p=intermediate; |
| limit=p+length; |
| |
| ucnv_toUnicode(cnv, |
| &pOut, pOutLimit, |
| &p, limit, |
| NULL, FALSE, |
| &errorCode); |
| if(U_FAILURE(errorCode)) { |
| fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode)); |
| ucnv_close(cnv); |
| return FALSE; |
| } |
| |
| /* read the next block */ |
| length=readBlock(in); |
| if(length<0) { |
| ucnv_close(cnv); |
| return FALSE; |
| } |
| } |
| |
| /* flush the converter */ |
| ucnv_toUnicode(cnv, |
| &pOut, pOutLimit, |
| &p, p, |
| NULL, TRUE, |
| &errorCode); |
| ucnv_close(cnv); |
| |
| if(U_FAILURE(errorCode)) { |
| fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode)); |
| return FALSE; |
| } |
| |
| inputLength=(int32_t)(pOut-input); |
| countInputCodePoints=u_countChar32(input, inputLength); |
| if(inputLength<=0) { |
| fprintf(stderr, "warning: input is empty\n"); |
| return FALSE; |
| } |
| |
| return TRUE; |
| } |
| |
/*
 * Print the command-line syntax to stderr.
 * myName is the program name to show in the usage line.
 */
static void
showUsage(const char *myName) {
    fputs("Usage:\n", stderr);
    fprintf(stderr, "%s [-e encoding-name] filename | '-'\n", myName);
    fputs(" encoding-name must be the name of an encoding supported by ICU\n"
          " the filename of the input file with text to be used\n"
          " can be a dash (-) for standard input\n",
          stderr);
}
| |
| /* |
| * Read file using some encoding, convert to 1M UTF-16 input buffer. |
| * For each UTF to be tested: |
| * n times: |
| * convert from UTF-16 input buffer to UTF, 4kB buffer |
| * convert from 4kB buffer to 1M UTF-16 output buffer |
| * adjust n so that time elapsed is 10s (#define) |
| * ->divide 10s by time, increase n by that factor, run 2nd time |
| * n times: |
| * empty function |
| * subtract out loop/function overhead |
| * display #code points - #UTF bytes - time per roundtrip |
| * |
| * * do the same again with an intermediate buffer size of 20 instead of 4kB |
| * |
| * Test following UTFs: |
| * UTF-16BE, UTF-16LE, UTF-8, SCSU, BOCU-1, CESU-8 |
| * |
| * Command-line arguments: |
| * - encoding (default UTF-8, detect BOM) |
| * - filename (allow "-") |
| */ |
| extern int |
| main(int argc, const char *argv[]) { |
| FILE *in; |
| const char *myName, *encName, *filename, *basename; |
| |
| myName=argv[0]; |
| if(argc<2) { |
| showUsage(myName); |
| return 1; |
| } |
| |
| /* get encoding name argument */ |
| if(argv[1][0]=='-' && argv[1][1]=='e') { |
| encName=argv[1]+2; |
| --argc; |
| ++argv; |
| if(*encName==0) { |
| if(argc<2) { |
| showUsage(myName); |
| return 1; |
| } |
| encName=argv[1]; |
| --argc; |
| ++argv; |
| } |
| } else { |
| encName=NULL; |
| } |
| |
| /* get filename argument */ |
| if(argc<2) { |
| showUsage(myName); |
| return 1; |
| } |
| filename=argv[1]; |
| if(filename[0]=='-' && filename[1]==0) { |
| filename="(standard input)"; |
| in=stdin; |
| /* set stdin to binary mode */ |
| _setmode(_fileno(stdin), _O_BINARY); |
| } else { |
| in=fopen(filename, "rb"); |
| if(in==NULL) { |
| fprintf(stderr, "error opening \"%s\"\n", filename); |
| showUsage(myName); |
| return 2; |
| } |
| } |
| |
| /* read input */ |
| basename=strrchr(filename, U_FILE_SEP_CHAR); |
| if(basename!=NULL) { |
| ++basename; |
| } else { |
| basename=filename; |
| } |
| printf("# testing converter performance with file \"%s\"\n", basename); |
| if(!readInput(in, encName)) { |
| fprintf(stderr, "error reading \"%s\" (encoding %s)\n", filename, encName); |
| showUsage(myName); |
| return 2; |
| } |
| if(in!=stdin) { |
| fclose(in); |
| } |
| |
| /* test performance */ |
| testPerformance(); |
| return 0; |
| } |