| /* |
| ********************************************************************** |
| * Copyright (C) 1998-2000, International Business Machines Corporation |
| * and others. All Rights Reserved. |
| ********************************************************************** |
| * |
| */ |
| // XMLConverter.cpp |
| // To convert one encoded XML file to another |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <assert.h> |
| |
| /* Define _XPG4_2 for Solaris and friends. */ |
| #ifndef _XPG4_2 |
| #define _XPG4_2 |
| #endif |
| |
| /* Define __USE_XOPEN_EXTENDED for Linux and glibc. */ |
| #ifndef __USE_XOPEN_EXTENDED |
| #define __USE_XOPEN_EXTENDED |
| #endif |
| |
| #include <string.h> |
| #include <cstring.h> |
| |
| #ifdef _WIN32 |
| # include <windows.h> |
| #endif |
| |
| #include "unicode/utypes.h" |
| #include "unicode/ustring.h" |
| #include "unicode/ucnv.h" |
| #include "unicode/ucnv_err.h" |
| #include "unicode/uloc.h" |
| #include "unicode/uchar.h" |
| |
/* Sizes and limits used by the conversion routines in this file. */
#define RAWBUFSIZE     4096   /* bytes read from the input file per chunk        */
#define FIRSTLINEBUF   256    /* capacity of the XML declaration (first line)    */
#define ENCODINGCOUNT  5      /* value assigned to Encodings_Count in the enum   */
#define MAXFILENAMELEN 1024   /* not referenced by the code visible in this file */

/* One raw, uninterpreted byte of file data. */
typedef unsigned char BYTE;
| |
| |
| char firstLine[128]; |
| char encodingNameInFile[256]; |
| UBool verbose = FALSE; |
| |
| extern void convertFile(char*, char*, char*, UConverter*); |
| extern void usage(); |
| extern void printChars(unsigned char*, int); |
| extern int getInputEncodingType(const BYTE* rawBuffer, |
| unsigned long byteCount); |
| extern long convertFirstLine(FILE* inF, |
| char* inEncName, |
| FILE* outF, |
| char* outEncName, |
| char* ptrBuf, |
| unsigned long toRead, |
| UChar* uBuf); |
| extern void catString(char* thisString, UBool quote); |
| extern int32_t XMLUConvert( UConverter* inConverter, |
| UConverter* outConverter, |
| const char* inBuffer, |
| int32_t* inBufSize, |
| char* outBuffer, |
| int32_t outBufCapacity, |
| UBool flush, |
| UErrorCode* err); |
| extern void XMLU_fromCodepageToCodepage( UConverter* outConverter, |
| UConverter* inConverter, |
| char** target, |
| const char* targetLimit, |
| const char** source, |
| const char* sourceLimit, |
| int32_t* offsets, |
| UBool flush, |
| UErrorCode* err); |
| |
| static const BYTE gEBCDICPre[] = { 0x4C, 0x6F, 0xA7, 0x94 }; |
| #if 0 |
| //not supported encodings |
| static const BYTE gUCS4BPre[] = { 0x00, 0x00, 0x00, 0x3C }; |
| static const BYTE gUCS4LPre[] = { 0x3C, 0x00, 0x00, 0x00 }; |
| #endif |
| static const BYTE gUTF16BPre[] = { 0x00, 0x3C, 0x00, 0x3F }; |
| static const BYTE gUTF16LPre[] = { 0x3C, 0x00, 0x3F, 0x00 }; |
| static const char gXMLDecl_ASCII[]= { 0x3C, 0x3F, 0x78, 0x6D, 0x6C }; |
| |
| enum Encodings |
| { |
| EBCDIC = 0, |
| UCS_4B = 1, |
| UCS_4L = 2, |
| US_ASCII = 3, |
| UTF_8 = 4, |
| UTF_16B = 5, |
| UTF_16L = 6, |
| |
| Encodings_Count = ENCODINGCOUNT, |
| Encodings_Min = EBCDIC, |
| Encodings_Max = UTF_16L, |
| |
| OtherEncoding = 999 |
| }; |
| |
| |
// Prints the command-line help text to stdout.
//
// exeName is the program name substituted into the synopsis, the legend and
// the closing example.
//
void usage(char * exeName)
{
    // Synopsis and the legend entry for the executable itself.
    fprintf(stdout,
            "\n USAGE: \n \t%s [-h] [-v] -e trgEncName inputFile outputFile \n\n"
            " %s = Exe name \n ",
            exeName, exeName);

    // Description of the switches and positional arguments. None of this
    // text contains a conversion specification, so it is written verbatim.
    fputs("-h \t= to get help (this information!) \n "
          "-v \t= set verbose on; \n \t\t to get more information about the conversion process \n "
          "-e \t= This is a mandatory option and follows with the targetEncName"
          " \t\t E.g., output encoding can be like : \n \t\t ascii, utf8, utf-16be, utf-16le, ebcdic-cp-us \n"
          "trgEncName \t= The output encoding type needed. \n \t\t It always should follow the -e switch\n"
          "inputFile \t= The input XML file name \n"
          "outputFile \t= The output XML file name \n"
          " \n For example: \n ",
          stdout);

    // Worked example.
    fprintf(stdout, " \t %s -e utf8 pr-utf-16.xml pr-utf-8.xml \n\n\n ", exeName);
}
| |
| |
| |
| int main(int argc, char** argv) |
| { |
| UErrorCode err = U_ZERO_ERROR; |
| char* inFileName; |
| char* outFileName; |
| char * encName = NULL; |
| |
| UConverter* conv = NULL; |
| |
| for (int i=0; i< argc; i++) |
| { |
| if (!strcmp( argv[i], "-h") || (argc < 5) ) |
| { |
| usage(argv[0]); |
| exit(1); |
| } |
| if (!strcmp( argv[i], "-v")) |
| verbose = TRUE; |
| if (!strcmp( argv[i], "-e")) |
| { |
| if ( argc == i+4) |
| { |
| encName = new char[strlen(argv[i+1]) +1]; |
| strcpy(encName, argv[i+1]); |
| inFileName = new char[strlen(argv[i+2]) +1]; |
| strcpy(inFileName, argv[i+2]); |
| outFileName = new char[strlen(argv[i+3]) +1]; |
| strcpy(outFileName, argv[i+3]); |
| break; |
| } |
| else |
| { |
| usage(argv[0]); |
| exit(1); |
| } |
| } |
| } |
| |
| conv = ucnv_open(encName, &err); |
| if (U_FAILURE(err)) |
| { |
| if (verbose) |
| { |
| fprintf(stderr, "Could not create converter to: %s\n", encName); |
| #if defined(_DEBUG) && defined(XP_CPLUSPLUS) |
| fprintf (stderr,"FAILURE! (%s) (%d)\n", u_errorName(err), err); |
| #endif |
| } |
| ucnv_close(conv); |
| exit(1); |
| } |
| |
| fprintf(stdout, "Converting %s to %s...\n", inFileName, outFileName); |
| convertFile(encName, inFileName, outFileName, conv); |
| fprintf(stdout, "Finished transcoding file: %s\n", inFileName); |
| |
| ucnv_close(conv); |
| if (encName) |
| delete encName; |
| return 0; |
| } |
| |
| void convertFile(char* encName, char* iFN, char* oFN, UConverter* outConvrtr) |
| { |
| //Read the input file |
| // |
| FILE* inFile = fopen( iFN, "rb"); |
| if (inFile == NULL) { |
| if (verbose) |
| fprintf(stderr, "Could not open input file - %s for reading \n", iFN); |
| exit(1); |
| } |
| |
| FILE* outFile = fopen(oFN, "wb"); |
| if (outFile == NULL) |
| { |
| if (verbose) |
| fprintf(stderr, "Could not open output file - %s for writing \n", oFN); |
| fclose(inFile); |
| return; |
| } |
| |
| char rawBuf[RAWBUFSIZE]; |
| char* pRawBuf = NULL; |
| unsigned long bytesRead = 0; |
| UErrorCode err = U_ZERO_ERROR; |
| |
| //get the file size |
| // |
| unsigned int curPos = ftell(inFile); |
| |
| if(verbose) |
| fprintf(stderr, "curPos = %d\n", curPos); |
| |
| if (curPos == 0xFFFFFFFF) |
| { |
| fprintf(stderr, "fileSize - Could not save current pos \n"); |
| exit(1); |
| } |
| |
| // Seek to the end and save that value for return |
| // |
| if ( fseek(inFile, 0 , SEEK_END) ) |
| { |
| fprintf(stderr, "fileSize - Could not seek to end \n"); |
| exit(1); |
| } |
| |
| const unsigned int endPos = ftell(inFile); |
| if (endPos == 0xFFFFFFFF) |
| { |
| fprintf(stderr, "fileSize - Could not get the end pos \n"); |
| exit(1); |
| } |
| |
| // And put the pointer back |
| // |
| if (fseek(inFile, curPos, SEEK_SET)) |
| { |
| fprintf(stderr, "fileSize - Could not seek back to original pos \n"); |
| exit(1); |
| } |
| |
| if (curPos >= endPos) |
| { |
| fprintf(stderr,"Reached end of input file while reading \n"); |
| exit(1); |
| } |
| |
| unsigned int bytesLeft = endPos - curPos; |
| if (verbose) |
| fprintf(stdout,"Input file size is %d \n", bytesLeft); |
| |
| unsigned int toRead = (RAWBUFSIZE > bytesLeft) ? bytesLeft : RAWBUFSIZE; |
| |
| //Read the infile |
| // |
| bytesRead = fread( (void*)rawBuf, 1, toRead, inFile); |
| if (ferror(inFile)) |
| { |
| fprintf(stderr," couldnot read file for input encoding \n"); |
| exit(1); |
| } |
| |
| if (bytesRead == 0) |
| { |
| fprintf(stderr," couldnot fill raw buffer \n"); |
| exit(1); |
| } |
| pRawBuf = rawBuf; |
| |
| // get the input encoding type |
| int inputEnc = getInputEncodingType((const BYTE*)rawBuf, bytesRead); |
| if (inputEnc == OtherEncoding) |
| { |
| fprintf(stderr, " Unknown encoded input file. \n Only input encodings supported in the first line are \n"); |
| fprintf(stderr, " ascii, ebcdic-cp-us, utf8, utf-16be, utf-16le \n"); |
| exit(1); |
| } |
| |
| //transcoding the first line from inEncodName to ascii and then replacing |
| //the encoding=inEncodingName to encoding=outEncodingName |
| // |
| |
| UChar ucBuf[RAWBUFSIZE]; |
| char * inEncodName; |
| char* tmpPtr = (char*) rawBuf; |
| |
| //get the input encoding name |
| // |
| switch (inputEnc) |
| { |
| case 0 : |
| inEncodName = new char[strlen("ebcdic-cp-us") +1]; |
| strcpy(inEncodName, "ebcdic-cp-us"); |
| break; |
| case 3 : |
| inEncodName = new char[strlen("ascii") +1]; |
| strcpy(inEncodName, "ascii"); |
| break; |
| case 4 : |
| inEncodName = new char[strlen("utf8") +1]; |
| strcpy(inEncodName, "utf8"); |
| break; |
| case 5 : |
| inEncodName = new char[strlen("utf-16be") +1]; |
| strcpy(inEncodName, "utf-16be"); |
| break; |
| case 6 : |
| inEncodName = new char[strlen("utf-16le") +1]; |
| strcpy(inEncodName, "utf-16le"); |
| break; |
| default : |
| break; |
| }; |
| |
| if(verbose) |
| { |
| fprintf(stderr, "inConverter = %s\n", inEncodName); |
| } |
| |
| UConverter* inConvrtr = ucnv_open(inEncodName, &err); |
| //now read and transcode the input to output file |
| //Process the firstline separately |
| // |
| long afterFirstLine = convertFirstLine(inFile, inEncodName, outFile, encName, |
| pRawBuf, toRead, (UChar*)ucBuf); |
| |
| //move the pointer after the first line |
| // |
| if (fseek(inFile, (unsigned long) afterFirstLine, SEEK_SET)) |
| { |
| fprintf(stderr, "fileSize - Could not set the cursor to %d after the first line \n", afterFirstLine); |
| exit(1); |
| } |
| else |
| if(verbose) |
| fprintf(stderr,"Seeked to %d OK \n", afterFirstLine); |
| bytesLeft = endPos - afterFirstLine; |
| toRead = (RAWBUFSIZE > bytesLeft) ? bytesLeft : RAWBUFSIZE; |
| |
| // read the rest of the input file |
| // |
| if (verbose) |
| fprintf(stdout,"The first line consists of %d bytes \n", afterFirstLine); |
| if (encodingNameInFile !=NULL) |
| { |
| if (inEncodName) |
| delete inEncodName; |
| inEncodName = new char[strlen(encodingNameInFile)+1]; |
| strcpy(inEncodName, encodingNameInFile); |
| ucnv_close(inConvrtr); |
| inConvrtr = ucnv_open(inEncodName, &err); |
| } |
| if (verbose) |
| fprintf(stdout, "Input Encoding type = %s, Output Encoding type = %s \n", inEncodName, encName); |
| |
| char *outBuf = new char[RAWBUFSIZE]; |
| int outBufSize = RAWBUFSIZE; |
| UBool tFlush = FALSE; |
| err = U_ZERO_ERROR; |
| |
| if (verbose) |
| fprintf(stdout, "processing the rest of the file \n"); |
| while( (bytesRead = fread((void *) rawBuf, 1, toRead, inFile)) > 0 || !tFlush) |
| { |
| int32_t bytesNeeded = XMLUConvert( inConvrtr, |
| outConvrtr, |
| pRawBuf, |
| (int32_t*)&bytesRead, |
| outBuf, |
| outBufSize, |
| tFlush, |
| &err); |
| if (bytesNeeded > 0) |
| { |
| long bout = |
| fwrite((void *) outBuf, 1, bytesNeeded, outFile); |
| if (bout != bytesNeeded) |
| { |
| fprintf(stderr, "Wrote only %d bytes.\n", bout); |
| fclose(inFile); |
| fclose(outFile); |
| } |
| } |
| |
| if ((err != U_BUFFER_OVERFLOW_ERROR) && U_FAILURE(err) ) |
| { |
| #if defined(_DEBUG) |
| fprintf (stderr, "Error transcoding rest of the file: (%s) %d\n", u_errorName(err), err); |
| #endif |
| fclose(inFile); |
| fclose(outFile); |
| exit(1); |
| } |
| if ((bytesRead > 0) && (err !=U_ZERO_ERROR)) |
| { |
| if(verbose) |
| fprintf(stderr, "err=%d * read %d bytes\n", err,bytesRead); |
| |
| if (fseek(inFile, (curPos+bytesRead), SEEK_SET)) |
| { |
| fprintf(stderr, "fileSize - Could not set the input cursor to %d (curpos=%d, bytesRead=%d)\n", curPos+bytesRead,curPos,bytesRead); |
| exit(1); |
| } |
| curPos = ftell(inFile); |
| bytesLeft = endPos - curPos; |
| } |
| else |
| { |
| curPos = ftell(inFile); |
| bytesLeft = endPos - curPos; |
| } |
| toRead = (RAWBUFSIZE > bytesLeft) ? bytesLeft : RAWBUFSIZE; |
| if (toRead < RAWBUFSIZE) tFlush = TRUE; |
| if (err == U_BUFFER_OVERFLOW_ERROR) |
| err = U_ZERO_ERROR; |
| } |
| ucnv_close(inConvrtr); |
| delete inEncodName; |
| fclose(inFile); |
| fclose(outFile); |
| }; |
| |
| |
| |
| int getInputEncodingType(const BYTE* rawBuffer, unsigned long byteCount) |
| { |
| //match the first four bytes of the input buffer with the encoding types available |
| //checking for ASCII |
| // |
| if (byteCount > 5) |
| { |
| if (!memcmp(rawBuffer, gXMLDecl_ASCII, 5)) |
| return US_ASCII; |
| } |
| |
| // If the count of raw bytes is less than 2, it cannot be anything |
| // we understand, so return UTF-8 as a fallback. |
| // |
| if (byteCount < 2) |
| return UTF_8; |
| |
| // We know its at least two bytes, so lets check for a UTF-16 BOM. |
| // |
| if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF)) |
| return UTF_16B; |
| else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) |
| return UTF_16L; |
| |
| // Oh well, not one of those. So now lets see if we have at least 4 |
| // bytes. If not, then we are out of ideas and can return UTF-8 as the |
| // fallback. |
| // |
| if (byteCount < 4) |
| return OtherEncoding; |
| |
| // We have at least 4 bytes. So lets check the 4 byte sequences that |
| // indicate other UTF-16 encodings. |
| // |
| if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C)) |
| { |
| #if 0 |
| //not supported encodings |
| if (!memcmp(rawBuffer, gUCS4BPre, 4)) |
| return UCS_4B; |
| else if (!memcmp(rawBuffer, gUCS4LPre, 4)) |
| return UCS_4L; |
| else |
| #endif |
| if (!memcmp(rawBuffer, gUTF16BPre, 4)) |
| return UTF_16B; |
| else if (!memcmp(rawBuffer, gUTF16LPre, 4)) |
| return UTF_16L; |
| } |
| |
| // See if we have enough bytes to possibly match the EBCDIC prefix. |
| // If so, try it. |
| // |
| if (!memcmp(rawBuffer, gEBCDICPre, 4)) |
| return EBCDIC; |
| |
| // Does not seem to be anything we know, so go with UTF-8 to get at |
| // least through the first line and see what it really is. |
| // |
| return OtherEncoding; |
| } |
| |
| |
| long convertFirstLine( FILE* inF, char* inEncName, |
| FILE* outF, char* outEncName, |
| char* ptrBuf, unsigned long toRead, |
| UChar* uBuf) |
| { |
| //Here we read the inputFile with the specified buffer size. |
| //Then convert this to ascii. then read the first line and convert to |
| //output and input encoding types and return for rest of the conversion |
| // |
| |
| if (fseek(inF, 0, SEEK_SET)) |
| { |
| fprintf(stderr, "file - Could not seek the begin pos \n"); |
| exit(1); |
| } |
| |
| unsigned long bytesRead = fread( (void*)ptrBuf, 1, toRead, inF); |
| |
| char tempBuf[RAWBUFSIZE]; |
| int bufLength = 0; |
| long bytesNeeded = 0; |
| UErrorCode err = U_ZERO_ERROR; |
| |
| bytesNeeded = ucnv_convert("ascii", |
| inEncName, |
| (char*) tempBuf, |
| 0, |
| (const char*) ptrBuf, |
| bytesRead, |
| &err); |
| |
| if (err == U_BUFFER_OVERFLOW_ERROR) |
| { |
| err = U_ZERO_ERROR; |
| } |
| else if (U_FAILURE(err)) |
| { |
| #if defined(_DEBUG) |
| printf ("Error transcoding first line of input file: (%s) %d\n", u_errorName(err), err); |
| #endif |
| fclose(inF); |
| fclose(outF); |
| exit(1); |
| } |
| |
| ucnv_convert("ascii", |
| inEncName, |
| (char*) tempBuf, |
| bytesNeeded, |
| (const char*) ptrBuf, |
| bytesRead, |
| &err); |
| |
| if (U_FAILURE(err)) |
| { |
| #if defined(_DEBUG) |
| printf ("Error transcoding2 first line of input file: (%s) %d\n", u_errorName(err), err); |
| #endif |
| fclose(inF); |
| fclose(outF); |
| exit(1); |
| } |
| else |
| { |
| //read the tempBuf to get the first line |
| // |
| char firstLineBuf[FIRSTLINEBUF]; |
| int tempBufLength = 0; |
| |
| for( bufLength = 0, tempBufLength=0; bufLength < FIRSTLINEBUF; bufLength++, tempBufLength++) |
| { |
| if ((tempBufLength == 0) && ((inEncName == "utf-16be") || (inEncName == "utf-16le") || (inEncName == "utf16")) ) |
| tempBufLength++; |
| firstLineBuf[bufLength] = (char)tempBuf[tempBufLength]; |
| if (tempBuf[tempBufLength] == 0x3E) { |
| firstLineBuf[bufLength+1] = '\0'; |
| break; |
| } |
| |
| } |
| char* pFLB = new char[sizeof(firstLineBuf) +1]; |
| strcpy(pFLB, firstLineBuf); |
| |
| //if the file doesnot contain the version string line then its and illegal file |
| // |
| if (firstLineBuf[0] != 0x3C ) |
| { |
| fprintf(stderr,"Illegal xml file: It doesnot contain the xml declaration statement on the first line \n"); |
| fclose(inF); |
| fclose(outF); |
| exit(1); |
| } |
| |
| UBool encString = TRUE; |
| UBool stdString = TRUE; |
| UBool encInsertMid = FALSE; |
| UBool encInsertLast = FALSE; |
| UBool dQuote = TRUE; |
| char* doubleQuote = "\""; |
| char* singleQuote = "\'"; |
| |
| if (!strstr( (const char*)pFLB, doubleQuote)) |
| { |
| if (!strstr( (const char*)pFLB, singleQuote)) |
| { |
| fprintf(stderr,"Illegal xml file: It doesnot contain the approprite xml declaration \n"); |
| fclose(inF); |
| fclose(outF); |
| exit(1); |
| } |
| dQuote = FALSE; |
| } |
| |
| char* newString = strstr( (const char*) pFLB, "encoding"); |
| char* stringWithEnc = 0; |
| |
| if (!newString) |
| encString = FALSE; |
| else |
| { |
| stringWithEnc = new char[strlen(newString)+1]; |
| strcpy(stringWithEnc, newString); |
| } |
| |
| newString = strstr( (const char*) pFLB, "standalone"); |
| char* stringWithStd = 0; |
| if (!newString) |
| stdString = FALSE; |
| else |
| { |
| stringWithStd = new char[strlen(newString)+1]; |
| strcpy(stringWithStd, newString); |
| } |
| |
| if (!encString && !stdString) |
| encInsertLast = TRUE; |
| if (!encString && stdString) |
| encInsertMid = TRUE; |
| |
| //Encodingname for the rest of the input file could be different. |
| //If its not specified in the first line then assume it to be UTF8 |
| if (encInsertLast || encInsertMid) |
| { |
| //if the encoding type was found utf16 family or ebcdic and |
| // the encoding string is not present in the file then its an error |
| if (!strcmp(inEncName, "utf-16be") |
| || !strcmp(inEncName, "utf-16le") |
| || !strcmp(inEncName, "ebcdic-cp-us")) |
| { |
| fprintf(stderr, "Illegal xml file: it doesnot contain the encoding string in the first line of the input file\n"); |
| fclose(inF); |
| fclose(outF); |
| exit(1); |
| } |
| strcpy(encodingNameInFile, inEncName); |
| } |
| |
| char* tempString = " encoding="; |
| char* dupFLB = uprv_strdup(pFLB); |
| int stringTwoLength = 0; |
| |
| /* build up the length */ |
| stringTwoLength = bufLength; |
| |
| if(tempString) |
| stringTwoLength += strlen(tempString); |
| |
| if(outEncName) |
| stringTwoLength += strlen(outEncName); |
| |
| if(stringWithStd) |
| stringTwoLength += strlen(stringWithStd); |
| |
| stringTwoLength += 5; |
| |
| char* stringTwo = new char[stringTwoLength]; |
| |
| if (encInsertLast) { |
| char* stringOne = new char[bufLength]; |
| strncpy(stringOne, pFLB, bufLength-1); |
| strcpy(stringOne+bufLength-1, ""); |
| stringTwo = strcpy(stringTwo, stringOne); |
| strcat(stringTwo, tempString); |
| catString(stringTwo, dQuote); |
| strcat(stringTwo, outEncName); |
| catString(stringTwo, dQuote); |
| strcat(stringTwo , " ?>"); |
| delete stringOne; |
| } |
| //insert the string before 'standalone' statement |
| else if (encInsertMid) { |
| char* stringThree = new char[bufLength + strlen(tempString) + strlen(outEncName) + 5]; |
| if (dQuote) |
| stringThree = strtok(dupFLB, doubleQuote); |
| else |
| stringThree = strtok(dupFLB, singleQuote); |
| |
| strcpy(stringTwo, stringThree); |
| catString(stringTwo, dQuote); |
| |
| char* tmpString; |
| if (dQuote) |
| tmpString = strtok(0, doubleQuote); |
| else |
| tmpString = strtok(0, singleQuote); |
| if (tmpString != NULL) |
| strcat(stringTwo, tmpString); |
| |
| catString(stringTwo, dQuote); |
| strcat(stringTwo, tempString); |
| catString(stringTwo, dQuote); |
| |
| strcat(stringTwo, outEncName); |
| if (dQuote) |
| strcat(stringTwo, "\" "); |
| else |
| strcat(stringTwo, "\' "); |
| strcat(stringTwo, stringWithStd); |
| delete stringThree; |
| } |
| //if the encoding string is there then modify the output encoding name in it. |
| else if (encString) |
| { |
| char* stringFive = new char[strlen(dupFLB)+1]; |
| |
| if (dQuote) |
| stringFive = strtok (dupFLB, doubleQuote); |
| else |
| stringFive = strtok (dupFLB, singleQuote); |
| |
| strcpy(stringTwo, stringFive); |
| catString(stringTwo, dQuote); |
| while (stringFive != NULL) |
| { |
| if (dQuote) |
| stringFive = strtok(0,doubleQuote); |
| else |
| stringFive = strtok(0,singleQuote); |
| |
| if (stringFive == NULL) |
| break; |
| strcat(stringTwo, stringFive); |
| |
| char* n1String = strstr(stringFive, ">"); |
| if (!n1String) |
| catString(stringTwo, dQuote); |
| |
| char* nString = strstr(stringFive, "encoding"); |
| if (nString) |
| { |
| strcat(stringTwo, outEncName); |
| if (dQuote) |
| stringFive = strtok(0, doubleQuote); |
| else |
| stringFive = strtok(0, singleQuote); |
| strcpy(encodingNameInFile, stringFive); //this is the encoded string name |
| catString(stringTwo, dQuote); |
| } |
| } |
| if (stringFive != NULL) |
| { |
| delete stringFive; |
| stringFive = 0; |
| } |
| } |
| |
| // introduce the first order bytes for utf16 be and le files |
| // |
| if (!strcmp(outEncName, "utf-16be") || !strcmp(outEncName, "utf16")) |
| { |
| uBuf[0] = 0xFE; |
| fwrite( (void*) uBuf, 1, 1, outF); |
| uBuf[0] = 0xFF; |
| fwrite( (void*) uBuf, 1, 1, outF); |
| } else if (!strcmp(outEncName , "utf-16le")) |
| { |
| uBuf[0] = 0xFF; |
| fwrite( (void*) uBuf, 1, 1, outF); |
| uBuf[0] = 0xFE; |
| fwrite( (void*) uBuf, 1, 1, outF); |
| } |
| |
| err = U_ZERO_ERROR; |
| long oneChar = 0; |
| while ( *stringTwo != '\0' ) |
| { |
| //transcode character-by-character |
| oneChar = ucnv_convert(outEncName, |
| "ascii", |
| (char*) uBuf, |
| 0, |
| (const char*) stringTwo, |
| 1, |
| &err); |
| if (err == U_BUFFER_OVERFLOW_ERROR) |
| { |
| err = U_ZERO_ERROR; |
| } |
| else if (U_FAILURE(err)) |
| { |
| #if defined(_DEBUG) |
| fprintf (stderr, "Error transcoding char-by-char: (%s) %d\n", u_errorName(err), err); |
| #endif |
| fclose(inF); |
| fclose(outF); |
| exit(1); |
| } |
| |
| ucnv_convert(outEncName, |
| "ascii", |
| (char*) uBuf, |
| oneChar, |
| (const char*) stringTwo, |
| 1, |
| &err); |
| if (U_FAILURE(err)) |
| { |
| #if defined(_DEBUG) |
| fprintf (stderr, "Error transcoding2 char-by-char: (%s) %d\n", u_errorName(err), err); |
| #endif |
| fclose(inF); |
| fclose(outF); |
| exit(1); |
| } |
| fwrite( (void*) uBuf, 1, oneChar, outF); |
| stringTwo++; |
| } |
| } |
| |
| |
| //Now get the pointer offset after the first line in the input file |
| //and return this position |
| // |
| char* newInEncName = new char[strlen(inEncName) +1]; |
| strcpy(newInEncName, inEncName); |
| if (encodingNameInFile !=NULL) |
| { |
| if (inEncName) |
| delete newInEncName; |
| newInEncName = new char[strlen(encodingNameInFile)+1]; |
| strcpy(newInEncName, encodingNameInFile); |
| } |
| |
| char oldBuf[RAWBUFSIZE]; |
| int bufHere = bufLength +1; |
| if (!strcmp(newInEncName, "utf-16be") || !strcmp(newInEncName, "utf16") || !strcmp(newInEncName, "utf-16le")) |
| { |
| bufHere +=1; |
| memcpy((void*)oldBuf, (void*) tempBuf, bufHere); |
| } |
| else |
| memcpy((void*)oldBuf, (void*) tempBuf, bufHere); |
| |
| char newBuf[RAWBUFSIZE]; |
| long endBytes = 0; |
| //transcode this ascii type to the input encoding type |
| //and get the pointer to the end of first line in the input buffer |
| // |
| err = U_ZERO_ERROR; |
| endBytes = ucnv_convert(newInEncName, |
| "ascii", |
| (char*) newBuf, |
| 0, |
| (const char*) oldBuf, |
| bufHere, |
| &err); |
| |
| if (err == U_BUFFER_OVERFLOW_ERROR) |
| { |
| err = U_ZERO_ERROR; |
| } |
| else if (U_FAILURE(err)) |
| { |
| #if defined(_DEBUG) |
| fprintf (stderr, "Error transcoding from ascii to input encoding: (%s) %d\n", u_errorName(err), err); |
| #endif |
| fclose(inF); |
| fclose(outF); |
| exit(1); |
| } |
| ucnv_convert(newInEncName, |
| "ascii", |
| (char*) newBuf, |
| endBytes, |
| (const char*) oldBuf, |
| bufHere, |
| &err); |
| if (U_FAILURE(err)) |
| { |
| #if defined(_DEBUG) |
| fprintf (stderr, "Error transcoding2 from ascii to input encoding: (%s) %d\n", u_errorName(err), err); |
| #endif |
| delete newInEncName; |
| fclose(inF); |
| fclose(outF); |
| exit(1); |
| } |
| |
| return endBytes; |
| } |
| |
| |
| int32_t XMLUConvert( UConverter* inConverter, |
| UConverter* outConverter, |
| const char* inBuffer, |
| int32_t* inBufSize, |
| char* outBuffer, |
| int32_t outBufCapacity, |
| UBool flush, |
| UErrorCode* err) |
| { |
| const char* inBufferAlias = inBuffer; |
| char* outBufferAlias = outBuffer; |
| const char* inBufferEnd = inBuffer + *inBufSize; |
| const char* outBufferEnd = outBuffer + outBufCapacity; |
| //const char* consumed; |
| |
| if (U_FAILURE(*err)) return 0; |
| |
| XMLU_fromCodepageToCodepage(outConverter, |
| inConverter, |
| &outBufferAlias, |
| outBufferEnd, |
| &inBufferAlias, |
| inBufferEnd, |
| NULL, |
| flush, |
| err); |
| |
| // *inBufSize = inBufferAlias; |
| return outBufferAlias - outBuffer; |
| } |
| |
| void XMLU_fromCodepageToCodepage( UConverter* outConverter, |
| UConverter* inConverter, |
| char** target, |
| const char* targetLimit, |
| const char** source, |
| const char* sourceLimit, |
| int32_t* offsets, |
| UBool flush, |
| UErrorCode* err) |
| { |
| |
| #if 0 |
| UChar out_chunk[RAWBUFSIZE]; |
| const UChar* out_chunk_limit = out_chunk + RAWBUFSIZE; |
| UChar* out_chunk_alias; |
| UChar const* out_chunk_alias2; |
| UChar const* consumed_UChars; |
| |
| |
| if (U_FAILURE(*err)) return; |
| |
| *consumed = *source; |
| /*loops until the input buffer is completely consumed |
| *or if an error has be encountered |
| *first we convert from inConverter codepage to Unicode |
| *then from Unicode to outConverter codepage |
| */ |
| |
| while ((sourceLimit != *source) && U_SUCCESS(*err)) |
| { |
| out_chunk_alias = out_chunk; |
| *source = *consumed; |
| ucnv_reset(inConverter); |
| ucnv_toUnicode(inConverter, |
| &out_chunk_alias, |
| out_chunk_limit, |
| source, |
| sourceLimit, |
| consumed, |
| flush, |
| err); |
| |
| /*U_BUFFER_OVERFLOW_ERROR means that the output "CHUNK" is full |
| *we will require at least another loop (it's a recoverable error) |
| */ |
| |
| if (U_SUCCESS(*err) || (*err == U_BUFFER_OVERFLOW_ERROR)) |
| { |
| *err = U_ZERO_ERROR; |
| out_chunk_alias2 = out_chunk; |
| |
| while ((out_chunk_alias2 != out_chunk_alias) && U_SUCCESS(*err)) |
| { |
| ucnv_fromUnicode(outConverter, |
| target, |
| targetLimit, |
| &out_chunk_alias2, |
| out_chunk_alias, |
| &consumed_UChars, |
| FALSE, |
| err); |
| |
| } |
| } |
| else break; |
| } |
| return; |
| |
| #endif |
| |
| |
| UChar out_chunk[RAWBUFSIZE]; |
| const UChar *out_chunk_limit = out_chunk + RAWBUFSIZE; |
| UChar *out_chunk_alias; |
| UChar const *out_chunk_alias2; |
| |
| |
| if (U_FAILURE (*err)) return; |
| |
| |
| /*loops until the input buffer is completely consumed |
| *or if an error has be encountered |
| *first we convert from inConverter codepage to Unicode |
| *then from Unicode to outConverter codepage |
| */ |
| while ((*source != sourceLimit) && U_SUCCESS (*err)) |
| { |
| out_chunk_alias = out_chunk; |
| ucnv_toUnicode (inConverter, |
| &out_chunk_alias, |
| out_chunk_limit, |
| source, |
| sourceLimit, |
| NULL, |
| flush, |
| err); |
| |
| /*U_BUFFER_OVERFLOW_ERROR means that the output "CHUNK" is full |
| *we will require at least another loop (it's a recoverable error) |
| */ |
| |
| if (U_SUCCESS (*err) || (*err == U_BUFFER_OVERFLOW_ERROR)) |
| { |
| *err = U_ZERO_ERROR; |
| out_chunk_alias2 = out_chunk; |
| |
| while ((out_chunk_alias2 != out_chunk_alias) && U_SUCCESS (*err)) |
| { |
| ucnv_fromUnicode (outConverter, |
| target, |
| targetLimit, |
| &out_chunk_alias2, |
| out_chunk_alias, |
| NULL, |
| TRUE, |
| err); |
| |
| } |
| } |
| else |
| break; |
| } |
| |
| return; |
| } |
| |
| void catString(char* thisString, UBool quote) |
| { |
| if (quote) |
| strcat(thisString, "\""); |
| else |
| strcat(thisString, "\'"); |
| } |