// source/extra/uconv/uconv.cpp - external/github.com/unicode-org/icu - Git at Google

 /******************************************************************************
 *
 *   Copyright (C) 1999-2000, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************/
 //
 // uconv demonstration example of ICU and codepage conversion
 // Purpose is to be a similar tool as the UNIX iconv program.
 // Shows the usage of the ICU classes: UnicodeConverter, UnicodeString
 //
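 // Usage: uconv [flag] [file]
 // -f, --from-code [codeset]  Convert file from this codeset
 // -t, --to-code [codeset]    Convert file to this code set
 // -l, --list                 Display all available converters
 // -h, --help                 Display the usage and help messages
 // -x [transliterator]  Run everything through a transliterator
 //                      (not handled by main() in this file; a transliterator
 //                      is instead taken from the TRANSLIT environment
 //                      variable when built with USE_TRANSLIT)
 // -L            Display all available transliterators
 //               (not handled by main() in this file)
 // If no file is given, or the file is "-", uconv reads from stdin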
 //
 // To compile: c++ -o uconv -I${ICUHOME}/include -Wall -g uconv.cpp -L${ICUHOME}/lib -licu-uc -licu-i18n
 //
 // Original contributor was Jonas Utterström <jonas.utterstrom@vittran.norrnod.se> in 1999
 // Permission is granted to use, copy, modify, and distribute this software
 //

 #include <stdio.h>
 #include <errno.h>
 #include <string.h>
 #include <stdlib.h>

 // This is the UnicodeConverter headerfile
 #include "unicode/convert.h"

 // This is the UnicodeString headerfile
 #include "unicode/unistr.h"

 // Our message printer..
 #include "unicode/uwmsg.h"

 #ifdef WIN32
 #include <string.h>
 #include <io.h>
 #include <fcntl.h>
 #endif

 #ifdef USE_TRANSLIT
 # include "unicode/translit.h"
 #endif

 static const size_t buffsize = 4096;

 // Print all available codepage converters to stdout.
 //
 // Names are written in left-justified 20-column fields; a new output line is
 // started once more than `maxline` characters have been written on the
 // current one. The last name is written with puts(), so a non-empty listing
 // always ends with a newline. If the name table cannot be fetched, the
 // "cantGetNames" message is reported and nothing is listed.
 static void printAllConverters()
 {
     UErrorCode err = U_ZERO_ERROR;
     int32_t num = 0;
     size_t numprint = 0;
     static const size_t maxline = 70;

     // getAvailableNames returns a string-table with all available codepages
     const char* const* convtable = UnicodeConverter::getAvailableNames(num, err);
     if (U_FAILURE(err))
     {
         u_wmsg("cantGetNames", u_wmsg_errorName(err));
         return;
     }

     // An empty (or missing) table has no "last" entry; stop here rather
     // than reading convtable[-1] below.
     if (convtable == 0 || num <= 0)
     {
         return;
     }

     for (int32_t i = 0; i < num-1; i++)
     {
         // printf returns a negative value on output failure; only
         // successful counts are added to the unsigned column tracker.
         const int written = printf("%-20s", convtable[i]);
         if (written > 0)
         {
             numprint += (size_t)written;
         }
         if (numprint > maxline)
         {
             putchar('\n');
             numprint = 0;
         }
     }
     puts(convtable[num-1]);
 }

 // Convert a file from one encoding to another.
 //
 // Parameters:
 //   fromcpage - name of the codepage the input is encoded in
 //   tocpage   - name of the codepage to produce
 //   infile    - stream the source bytes are read from
 //   outfile   - stream the converted bytes are written to
 //
 // Returns TRUE when the whole input was converted and written, FALSE when a
 // converter could not be opened or a read, conversion or write step failed
 // (a message is reported through u_wmsg in each of those cases).
 //
 // When built with USE_TRANSLIT and the TRANSLIT environment variable is set
 // to a non-empty value, every chunk is run through that transliterator
 // between the two conversions.
 static UBool convertFile(const char* fromcpage,
                  const char* tocpage,
                  FILE* infile,
                  FILE* outfile)
 {
     UBool ret = TRUE;
     UnicodeConverter* convfrom = 0;
     UnicodeConverter* convto = 0;
     UErrorCode err = U_ZERO_ERROR;
     UBool  flush;
     const char* cbuffiter;
     char* buffiter;
     const size_t readsize = buffsize-1;
     char* buff = 0;

     const UChar* cuniiter;
     UChar* uniiter;
     UChar* unibuff = 0;

     // unicap is the capacity of unibuff in UChars, maxbytes the value of
     // getMaxBytesPerChar() for the target converter, and totbuffsize the
     // capacity of buff in bytes.
     size_t rd, wr, totbuffsize;
     size_t unicap = buffsize;
     size_t maxbytes;

     // Use #ifdef here, like every other USE_TRANSLIT section of this file,
     // so that 't' is declared exactly when the code below refers to it.
 #ifdef USE_TRANSLIT
     const char *translit;

     Transliterator *t = NULL;

     translit = getenv("TRANSLIT");
     if(translit != NULL && *translit)
     {
         t = Transliterator::createInstance(UnicodeString(translit, ""));
         fprintf(stderr, "Opening transliterator: %s\n", translit);
     }
 #endif

     // Create the source codepage converter. A failure to open it is
     // reported through 'err'.
     convfrom = new UnicodeConverter(fromcpage, err);
     if (U_FAILURE(err))
     {
         UnicodeString str(fromcpage,"");
         u_wmsg("cantOpenFromCodeset",str.getBuffer(),
                u_wmsg_errorName(err));
         goto error_exit;
     }

     // Create the target codepage converter.
     convto = new UnicodeConverter(tocpage, err);
     if (U_FAILURE(err))
     {
         UnicodeString str(tocpage,"");
         u_wmsg("cantOpenToCodeset",str.getBuffer(),
                u_wmsg_errorName(err));
         goto error_exit;
     }

     // To ensure that the byte buffer always is of enough size, we
     // must take the worst case scenario, that is the character in the codepage
     // that uses the most bytes and multiply it against the UChar capacity
     maxbytes = (size_t)convto->getMaxBytesPerChar();
     totbuffsize = unicap*maxbytes;
     buff = new char[totbuffsize];
     unibuff = new UChar[unicap];

     do
     {
         rd = fread(buff, 1, readsize, infile);
         if (ferror(infile) != 0)
         {
             UnicodeString str(strerror(errno), "");
             u_wmsg("cantRead",str.getBuffer());
             goto error_exit;
         }

         // Convert the read buffer to Unicode.
         // After the call 'uniiter' is positioned just past the last UChar
         // written to 'unibuff', and 'cbuffiter' just past the last input
         // byte that was consumed.
         //
         // A short read marks the last chunk; the converters must then be
         // flushed so that characters on hold also get written.
         uniiter = unibuff;
         cbuffiter = buff;
         flush = rd!=readsize;
         convfrom->toUnicode(uniiter, uniiter+unicap, cbuffiter, cbuffiter+rd,
                             NULL, flush, err);

         if (U_FAILURE(err))
         {
             u_wmsg("problemCvtToU", u_wmsg_errorName(err));
             goto error_exit;
         }

         // At the last conversion every byte that was read must have been
         // consumed.
         if (flush && cbuffiter!=(buff+rd))
         {
             u_wmsg("premEndInput");
             goto error_exit;
         }

 #ifdef USE_TRANSLIT
         if(t)
         {
             // The UnicodeString holds its own copy of the chunk, so the
             // buffers may be replaced below while 'u' is still alive.
             UnicodeString u(unibuff, (int32_t)(uniiter-unibuff));
             t->transliterate(u);

             // Transliteration may lengthen the text. Grow both buffers
             // before copying it back so that neither the extract() below
             // nor the following fromUnicode() can run past its buffer.
             // The input bytes in 'buff' have already been consumed.
             const size_t ulen = (size_t)u.length();
             if (ulen > unicap)
             {
                 delete [] unibuff;
                 unibuff = 0;
                 delete [] buff;
                 buff = 0;

                 unicap = ulen;
                 totbuffsize = unicap*maxbytes;
                 unibuff = new UChar[unicap];
                 buff = new char[totbuffsize];
             }

             u.extract(0, u.length(), unibuff, 0);
             uniiter = unibuff + u.length();
         }
 #endif

         // Convert the Unicode buffer into the destination codepage.
         // 'buffiter' ends up just past the last byte written to 'buff' and
         // 'cuniiter' just past the last UChar that was consumed.
         buffiter = buff;
         cuniiter = unibuff;
         convto->fromUnicode(buffiter, buffiter+totbuffsize,
                            cuniiter, cuniiter+(size_t)(uniiter-unibuff),
                            NULL, flush, err);

         if (U_FAILURE(err))
         {
             u_wmsg("problemCvtFromU", u_wmsg_errorName(err));
             goto error_exit;
         }

         // At the last conversion every UChar must have been consumed.
         if (flush && cuniiter!=uniiter)
         {
             u_wmsg("premEnd");
             goto error_exit;
         }

         // Finally, write the converted buffer to the output file
         wr = (size_t)(buffiter-buff);
         if (fwrite(buff, 1, wr, outfile) != wr)
         {
             UnicodeString str(strerror(errno),"");
             u_wmsg("cantWrite", str.getBuffer());
             goto error_exit;
         }

     } while (!flush); // Stop when we have flushed the converters (this means that it's the end of output)

     goto normal_exit;
   error_exit:
     // Report the failure to the caller, which tests for a FALSE result.
     ret = FALSE;
   normal_exit:
     // Close the created converters (delete of a null pointer is a no-op)
     delete convfrom;
     delete convto;

 #ifdef USE_TRANSLIT
     delete t;
 #endif

     // Release the conversion buffers
     delete [] buff;
     delete [] unibuff;
     return ret;
 }

 // Resource bundle holding uconv's messages; assigned by initMsg().
 static UResourceBundle *gBundle = 0;

 // Open the "uconvmsg" message bundle located under the ICU data directory.
 //
 // Only the first call does any work; later calls return immediately.
 // 'pname' is the program name used to prefix the warning written to stderr
 // when the bundle cannot be opened. In that case the function still returns
 // normally.
 static void initMsg(const char *pname) {
     static int ps = 0;
     static const char bundleName[] = "uconvmsg";

     if (!ps) {
         char dataPath[500];
         UErrorCode err = U_ZERO_ERROR;
         const char *dataDir = u_getDataDirectory();

         ps = 1;

         if (dataDir == 0) {
             dataDir = "";
         }

         /* Refuse a data directory that would not fit in dataPath together
            with the bundle name and its terminating NUL (sizeof(bundleName)
            already counts the NUL). */
         if (strlen(dataDir) + sizeof(bundleName) > sizeof(dataPath)) {
             fprintf(stderr, "%s: warning: couldn't open resource bundle %s%s: path too long\n",
                     pname,
                     dataDir,
                     bundleName);
             return;
         }

         /* Get messages. */

         strcpy(dataPath, dataDir);
         strcat(dataPath, bundleName);

         gBundle = u_wmsg_setPath(dataPath, &err);
         if(U_FAILURE(err))
         {
             fprintf(stderr, "%s: warning: couldn't open resource bundle %s: %s\n",
                     pname,
                     dataPath,
                     u_errorName(err));
         }
     }
 }

 static void usage(const char *pname, int ecode)
 {
   const UChar *msg;
   int32_t      msgLen;
   UErrorCode  err = U_ZERO_ERROR;

   initMsg(pname);
   msg = ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord", &msgLen, &err);
   UnicodeString upname(pname);
   UnicodeString mname(msg, msgLen);

   u_wmsg("usage", mname.getBuffer(), upname.getBuffer());
   if (!ecode) {
     putchar('\n');
     u_wmsg("help");
   }

   exit(ecode);
 }

 // Program entry point.
 //
 // Parses the command line (-f/--from-code, -t/--to-code, -l/--list,
 // -h/--help and at most one input file name), opens the input (the named
 // file, or stdin when no file or "-" is given) and converts it to stdout.
 //
 // Returns 0 on success and 1 when an argument is missing, the input file
 // cannot be opened, or convertFile() reports failure. Invalid usage exits
 // through usage().
 int main(int argc, char** argv)
 {
     FILE* file = 0;
     FILE* infile;
     int   ret = 0;
     const char* fromcpage = 0;
     const char* tocpage = 0;
     const char* infilestr = 0;

     char** iter = argv+1;
     char** end = argv+argc;

     const char *pname = *argv;

     // First, get the arguments from command-line
     // to know the codepages to convert between
     for (; iter!=end; iter++)
     {
         // Check for from charset
         if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter))
         {
             // Only step onto the option's value when there is one; moving
             // 'iter' to 'end' here would let the loop increment run past
             // the end of argv.
             if (iter+1 != end)
                 fromcpage = *++iter;
             else
                 usage(pname, 1);
         }
         else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter))
         {
             if (iter+1 != end)
                 tocpage = *++iter;
             else
                 usage(pname, 1);
         }
         else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter))
         {
             printAllConverters();
             goto normal_exit;
         }
         else if (strcmp("-h", *iter) == 0 || !strcmp("--help", *iter))
         {
             usage(pname, 0);
         }
         else if (**iter == '-' && (*iter)[1]) {
             // Any other option is an error; a lone "-" falls through and
             // is treated as the input file name.
             usage(pname, 1);
         } else if (!infilestr) {
             infilestr = *iter;
         } else {
             // More than one input file was given
             usage(pname, 1);
         }
     }

     if (fromcpage==0 && tocpage==0)
     {
         usage(pname, 1);
     }

     if (fromcpage==0)
     {
         initMsg(pname);
         u_wmsg("noFromCodeset");
         //"No conversion from codeset given (use -f)\n");
         goto error_exit;
     }
     if (tocpage==0)
     {
         initMsg(pname);
         u_wmsg("noToCodeset");
         // "No conversion to codeset given (use -t)\n");
         goto error_exit;
     }

     // Open the correct input file or connect to stdin for reading input
     if (infilestr!=0 && strcmp(infilestr, "-"))
     {
         file = fopen(infilestr, "rb");
         if (file==0)
         {
             UnicodeString str1(infilestr,"");
             UnicodeString str2(strerror(errno),"");
             initMsg(pname);
             u_wmsg("cantOpenInputF",
                    str1.getBuffer(),
                    str2.getBuffer());
             return 1;
         }
         infile = file;
     }
     else {
         infile = stdin;
 #ifdef WIN32
         if( setmode( fileno ( stdin ), O_BINARY ) == -1 ) {
             perror ( "Cannot set stdin to binary mode" );
             exit(-1);
         }
 #endif
     }
 #ifdef WIN32
     if( setmode( fileno ( stdout ), O_BINARY ) == -1 ) {
         perror ( "Cannot set stdout to binary mode" );
         exit(-1);
     }
 #endif
     if (!convertFile(fromcpage, tocpage, infile, stdout))
         goto error_exit;

     goto normal_exit;
   error_exit:
     ret = 1;
   normal_exit:

     // Only a file opened here is closed; stdin is left alone
     if (file!=0)
         fclose(file);
     return ret;
 }
	/******************************************************************************
	*
	* Copyright (C) 1999-2000, International Business Machines
	* Corporation and others. All Rights Reserved.
	*
	*******************************************************************************/
	//
	// uconv demonstration example of ICU and codepage conversion
	// Purpose is to be a similar tool as the UNIX iconv program.
	// Shows the usage of the ICU classes: UnicodeConverter, UnicodeString
	//
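	// Usage: uconv [flag] [file]
	// -f, --from-code [codeset]  Convert file from this codeset
	// -t, --to-code [codeset]    Convert file to this code set
	// -l, --list                 Display all available converters
	// -h, --help                 Display the usage and help messages
	// -x [transliterator]  Run everything through a transliterator
	//                      (not handled by main() in this file; a transliterator
	//                      is instead taken from the TRANSLIT environment
	//                      variable when built with USE_TRANSLIT)
	// -L            Display all available transliterators
	//               (not handled by main() in this file)
	// If no file is given, or the file is "-", uconv reads from stdin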
	//
	// To compile: c++ -o uconv -I${ICUHOME}/include -Wall -g uconv.cpp -L${ICUHOME}/lib -licu-uc -licu-i18n
	//
	// Original contributor was Jonas Utterström <jonas.utterstrom@vittran.norrnod.se> in 1999
	// Permission is granted to use, copy, modify, and distribute this software
	//

	#include <stdio.h>
	#include <errno.h>
	#include <string.h>
	#include <stdlib.h>

	// This is the UnicodeConverter headerfile
	#include "unicode/convert.h"

	// This is the UnicodeString headerfile
	#include "unicode/unistr.h"

	// Our message printer..
	#include "unicode/uwmsg.h"

	#ifdef WIN32
	#include <string.h>
	#include <io.h>
	#include <fcntl.h>
	#endif

	#ifdef USE_TRANSLIT
	# include "unicode/translit.h"
	#endif

	static const size_t buffsize = 4096;

	// Print all available codepage converters to stdout.
	//
	// Names are written in left-justified 20-column fields; a new output line is
	// started once more than `maxline` characters have been written on the
	// current one. The last name is written with puts(), so a non-empty listing
	// always ends with a newline. If the name table cannot be fetched, the
	// "cantGetNames" message is reported and nothing is listed.
	static void printAllConverters()
	{
	    UErrorCode err = U_ZERO_ERROR;
	    int32_t num = 0;
	    size_t numprint = 0;
	    static const size_t maxline = 70;

	    // getAvailableNames returns a string-table with all available codepages
	    const char* const* convtable = UnicodeConverter::getAvailableNames(num, err);
	    if (U_FAILURE(err))
	    {
	        u_wmsg("cantGetNames", u_wmsg_errorName(err));
	        return;
	    }

	    // An empty (or missing) table has no "last" entry; stop here rather
	    // than reading convtable[-1] below.
	    if (convtable == 0 || num <= 0)
	    {
	        return;
	    }

	    for (int32_t i = 0; i < num-1; i++)
	    {
	        // printf returns a negative value on output failure; only
	        // successful counts are added to the unsigned column tracker.
	        const int written = printf("%-20s", convtable[i]);
	        if (written > 0)
	        {
	            numprint += (size_t)written;
	        }
	        if (numprint > maxline)
	        {
	            putchar('\n');
	            numprint = 0;
	        }
	    }
	    puts(convtable[num-1]);
	}

	// Convert a file from one encoding to another.
	//
	// Parameters:
	//   fromcpage - name of the codepage the input is encoded in
	//   tocpage   - name of the codepage to produce
	//   infile    - stream the source bytes are read from
	//   outfile   - stream the converted bytes are written to
	//
	// Returns TRUE when the whole input was converted and written, FALSE when a
	// converter could not be opened or a read, conversion or write step failed
	// (a message is reported through u_wmsg in each of those cases).
	//
	// When built with USE_TRANSLIT and the TRANSLIT environment variable is set
	// to a non-empty value, every chunk is run through that transliterator
	// between the two conversions.
	static UBool convertFile(const char* fromcpage,
	                         const char* tocpage,
	                         FILE* infile,
	                         FILE* outfile)
	{
	    UBool ret = TRUE;
	    UnicodeConverter* convfrom = 0;
	    UnicodeConverter* convto = 0;
	    UErrorCode err = U_ZERO_ERROR;
	    UBool flush;
	    const char* cbuffiter;
	    char* buffiter;
	    const size_t readsize = buffsize-1;
	    char* buff = 0;

	    const UChar* cuniiter;
	    UChar* uniiter;
	    UChar* unibuff = 0;

	    // unicap is the capacity of unibuff in UChars, maxbytes the value of
	    // getMaxBytesPerChar() for the target converter, and totbuffsize the
	    // capacity of buff in bytes.
	    size_t rd, wr, totbuffsize;
	    size_t unicap = buffsize;
	    size_t maxbytes;

	    // Use #ifdef here, like every other USE_TRANSLIT section of this file,
	    // so that 't' is declared exactly when the code below refers to it.
	#ifdef USE_TRANSLIT
	    const char *translit;

	    Transliterator *t = NULL;

	    translit = getenv("TRANSLIT");
	    if(translit != NULL && *translit)
	    {
	        t = Transliterator::createInstance(UnicodeString(translit, ""));
	        fprintf(stderr, "Opening transliterator: %s\n", translit);
	    }
	#endif

	    // Create the source codepage converter. A failure to open it is
	    // reported through 'err'.
	    convfrom = new UnicodeConverter(fromcpage, err);
	    if (U_FAILURE(err))
	    {
	        UnicodeString str(fromcpage,"");
	        u_wmsg("cantOpenFromCodeset",str.getBuffer(),
	               u_wmsg_errorName(err));
	        goto error_exit;
	    }

	    // Create the target codepage converter.
	    convto = new UnicodeConverter(tocpage, err);
	    if (U_FAILURE(err))
	    {
	        UnicodeString str(tocpage,"");
	        u_wmsg("cantOpenToCodeset",str.getBuffer(),
	               u_wmsg_errorName(err));
	        goto error_exit;
	    }

	    // To ensure that the byte buffer always is of enough size, we
	    // must take the worst case scenario, that is the character in the codepage
	    // that uses the most bytes and multiply it against the UChar capacity
	    maxbytes = (size_t)convto->getMaxBytesPerChar();
	    totbuffsize = unicap*maxbytes;
	    buff = new char[totbuffsize];
	    unibuff = new UChar[unicap];

	    do
	    {
	        rd = fread(buff, 1, readsize, infile);
	        if (ferror(infile) != 0)
	        {
	            UnicodeString str(strerror(errno), "");
	            u_wmsg("cantRead",str.getBuffer());
	            goto error_exit;
	        }

	        // Convert the read buffer to Unicode.
	        // After the call 'uniiter' is positioned just past the last UChar
	        // written to 'unibuff', and 'cbuffiter' just past the last input
	        // byte that was consumed.
	        //
	        // A short read marks the last chunk; the converters must then be
	        // flushed so that characters on hold also get written.
	        uniiter = unibuff;
	        cbuffiter = buff;
	        flush = rd!=readsize;
	        convfrom->toUnicode(uniiter, uniiter+unicap, cbuffiter, cbuffiter+rd,
	                            NULL, flush, err);

	        if (U_FAILURE(err))
	        {
	            u_wmsg("problemCvtToU", u_wmsg_errorName(err));
	            goto error_exit;
	        }

	        // At the last conversion every byte that was read must have been
	        // consumed.
	        if (flush && cbuffiter!=(buff+rd))
	        {
	            u_wmsg("premEndInput");
	            goto error_exit;
	        }

	#ifdef USE_TRANSLIT
	        if(t)
	        {
	            // The UnicodeString holds its own copy of the chunk, so the
	            // buffers may be replaced below while 'u' is still alive.
	            UnicodeString u(unibuff, (int32_t)(uniiter-unibuff));
	            t->transliterate(u);

	            // Transliteration may lengthen the text. Grow both buffers
	            // before copying it back so that neither the extract() below
	            // nor the following fromUnicode() can run past its buffer.
	            // The input bytes in 'buff' have already been consumed.
	            const size_t ulen = (size_t)u.length();
	            if (ulen > unicap)
	            {
	                delete [] unibuff;
	                unibuff = 0;
	                delete [] buff;
	                buff = 0;

	                unicap = ulen;
	                totbuffsize = unicap*maxbytes;
	                unibuff = new UChar[unicap];
	                buff = new char[totbuffsize];
	            }

	            u.extract(0, u.length(), unibuff, 0);
	            uniiter = unibuff + u.length();
	        }
	#endif

	        // Convert the Unicode buffer into the destination codepage.
	        // 'buffiter' ends up just past the last byte written to 'buff' and
	        // 'cuniiter' just past the last UChar that was consumed.
	        buffiter = buff;
	        cuniiter = unibuff;
	        convto->fromUnicode(buffiter, buffiter+totbuffsize,
	                            cuniiter, cuniiter+(size_t)(uniiter-unibuff),
	                            NULL, flush, err);

	        if (U_FAILURE(err))
	        {
	            u_wmsg("problemCvtFromU", u_wmsg_errorName(err));
	            goto error_exit;
	        }

	        // At the last conversion every UChar must have been consumed.
	        if (flush && cuniiter!=uniiter)
	        {
	            u_wmsg("premEnd");
	            goto error_exit;
	        }

	        // Finally, write the converted buffer to the output file
	        wr = (size_t)(buffiter-buff);
	        if (fwrite(buff, 1, wr, outfile) != wr)
	        {
	            UnicodeString str(strerror(errno),"");
	            u_wmsg("cantWrite", str.getBuffer());
	            goto error_exit;
	        }

	    } while (!flush); // Stop when we have flushed the converters (this means that it's the end of output)

	    goto normal_exit;
	  error_exit:
	    // Report the failure to the caller, which tests for a FALSE result.
	    ret = FALSE;
	  normal_exit:
	    // Close the created converters (delete of a null pointer is a no-op)
	    delete convfrom;
	    delete convto;

	#ifdef USE_TRANSLIT
	    delete t;
	#endif

	    // Release the conversion buffers
	    delete [] buff;
	    delete [] unibuff;
	    return ret;
	}

	// Resource bundle holding uconv's messages; assigned by initMsg().
	static UResourceBundle *gBundle = 0;

	// Open the "uconvmsg" message bundle located under the ICU data directory.
	//
	// Only the first call does any work; later calls return immediately.
	// 'pname' is the program name used to prefix the warning written to stderr
	// when the bundle cannot be opened. In that case the function still returns
	// normally.
	static void initMsg(const char *pname) {
	    static int ps = 0;
	    static const char bundleName[] = "uconvmsg";

	    if (!ps) {
	        char dataPath[500];
	        UErrorCode err = U_ZERO_ERROR;
	        const char *dataDir = u_getDataDirectory();

	        ps = 1;

	        if (dataDir == 0) {
	            dataDir = "";
	        }

	        /* Refuse a data directory that would not fit in dataPath together
	           with the bundle name and its terminating NUL (sizeof(bundleName)
	           already counts the NUL). */
	        if (strlen(dataDir) + sizeof(bundleName) > sizeof(dataPath)) {
	            fprintf(stderr, "%s: warning: couldn't open resource bundle %s%s: path too long\n",
	                    pname,
	                    dataDir,
	                    bundleName);
	            return;
	        }

	        /* Get messages. */

	        strcpy(dataPath, dataDir);
	        strcat(dataPath, bundleName);

	        gBundle = u_wmsg_setPath(dataPath, &err);
	        if(U_FAILURE(err))
	        {
	            fprintf(stderr, "%s: warning: couldn't open resource bundle %s: %s\n",
	                    pname,
	                    dataPath,
	                    u_errorName(err));
	        }
	    }
	}

	static void usage(const char *pname, int ecode)
	{
	const UChar *msg;
	int32_t msgLen;
	UErrorCode err = U_ZERO_ERROR;

	initMsg(pname);
	msg = ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord", &msgLen, &err);
	UnicodeString upname(pname);
	UnicodeString mname(msg, msgLen);

	u_wmsg("usage", mname.getBuffer(), upname.getBuffer());
	if (!ecode) {
	putchar('\n');
	u_wmsg("help");
	}

	exit(ecode);
	}

	int main(int argc, char** argv)
	{
	FILE* file = 0;
	FILE* infile;
	int ret = 0;
	const char* fromcpage = 0;
	const char* tocpage = 0;
	const char* infilestr = 0;

	char** iter = argv+1;
	char** end = argv+argc;

	const char pname = argv;

	// First, get the arguments from command-line
	// to know the codepages to convert between
	for (; iter!=end; iter++)
	{
	// Check for from charset
	if (strcmp("-f", iter) == 0 \|\| !strcmp("--from-code", iter))
	{
	iter++;
	if (iter!=end)
	fromcpage = *iter;
	}
	else if (strcmp("-t", iter) == 0 \|\| !strcmp("--to-code", iter))
	{
	iter++;
	if (iter!=end)
	tocpage = *iter;
	}
	else if (strcmp("-l", iter) == 0 \|\| !strcmp("--list", iter))
	{
	printAllConverters();
	goto normal_exit;
	}
	else if (strcmp("-h", iter) == 0 \|\| !strcmp("--help", iter))
	{
	usage(pname, 0);
	}
	else if (*iter == '-' && (iter)[1]) {
	usage(pname, 1);
	} else if (!infilestr) {
	infilestr = *iter;
	} else {
	usage(pname, 1);
	}
	}

	if (fromcpage==0 && tocpage==0)
	{
	usage(pname, 1);
	}

	if (fromcpage==0)
	{
	initMsg(pname);
	u_wmsg("noFromCodeset");
	//"No conversion from codeset given (use -f)\n");
	goto error_exit;
	}
	if (tocpage==0)
	{
	initMsg(pname);
	u_wmsg("noToCodeset");
	// "No conversion to codeset given (use -t)\n");
	goto error_exit;
	}

	// Open the correct input file or connect to stdin for reading input
	if (infilestr!=0 && strcmp(infilestr, "-"))
	{
	file = fopen(infilestr, "rb");
	if (file==0)
	{
	UnicodeString str1(infilestr,"");
	UnicodeString str2(strerror(errno),"");
	initMsg(pname);
	u_wmsg("cantOpenInputF",
	str1.getBuffer(),
	str2.getBuffer());
	return 1;
	}
	infile = file;
	}
	else {
	infile = stdin;
	#ifdef WIN32
	if( setmode( fileno ( stdin ), O_BINARY ) == -1 ) {
	perror ( "Cannot set stdin to binary mode" );
	exit(-1);
	}
	#endif
	}
	#ifdef WIN32
	if( setmode( fileno ( stdout ), O_BINARY ) == -1 ) {
	perror ( "Cannot set stdout to binary mode" );
	exit(-1);
	}
	#endif
	if (!convertFile(fromcpage, tocpage, infile, stdout))
	goto error_exit;

	goto normal_exit;
	error_exit:
	ret = 1;
	normal_exit:

	if (file!=0)
	fclose(file);
	return ret;
	}