blob: 7199c80731970e558453d4025de5b87cecb4637c [file] [log] [blame]
/**************************************************************************
*
* Copyright (C) 2000, International Business Machines
* Corporation and others. All Rights Reserved.
*
***************************************************************************
* file name: convsamp.c
* encoding: ASCII (7-bit)
*
* created on: 2000may30
* created by: Steven R. Loomis
*
* Sample code for the ICU conversion routines.
*
* Note: Nothing special is needed to build this sample. Link with
* the icu UC and icu I18N libraries.
*
* I use 'assert' for error checking, you probably will want
* something more flexible. '***BEGIN SAMPLE***' and
* '***END SAMPLE***' mark pieces suitable for stand alone
* code snippets.
*
*/
#include <assert.h>
#include <ctype.h> /* for isspace, etc. */
#include <stdio.h>
#include <stdlib.h> /* for malloc, free */
#include <string.h> /* for strlen */
#include "unicode/utypes.h" /* Basic ICU data types */
#include "unicode/ucnv.h" /* C Converter API */
#include "unicode/convert.h" /* C++ Converter API */
#include "unicode/ustring.h" /* some more string fcns*/
/* Some utility functions */
/* A UChar string holding only the terminating 0x0000; used as the default
 * 'uch' argument of printUChars(). */
static const UChar kNone[] = { 0x0000 };
/*
 * Print a single UChar to stdout as one cell of a table row.
 *
 *  - Code units above 0x007F: the character name is looked up with
 *    u_charName() and cut off after its first six characters; if the
 *    lookup fails, a row of question marks is printed instead.
 *  - Graphic ASCII characters: printed quoted, e.g. 'c'.
 *  - Remaining ASCII (space, tab, newline, other controls): a fixed
 *    placeholder is printed.
 */
void prettyPrintUChar(UChar c)
{
    /* Non-ASCII code unit: show the beginning of its Unicode name. */
    if (c > 0x007F) {
        char nameBuf[100];
        UErrorCode nameStatus = U_ZERO_ERROR;

        /* The returned length is not needed; only the status matters. */
        (void)u_charName(c, U_UNICODE_CHAR_NAME, nameBuf, 100, &nameStatus);
        if (!U_SUCCESS(nameStatus)) {
            printf("??????? ");
            return;
        }
        nameBuf[6] = 0; /* keep only the first six characters of the name */
        printf("%- 7s", nameBuf);
        return;
    }

    /* From here on c is in the ASCII range 0x0000..0x007F. */
    if (isgraph(c)) {
        printf(" '%c' ", (char)(0x00FF&c));
        return;
    }

    /* Non-graphic ASCII: whitespace gets its own rendering. */
    const char ascii = (char)(c & 0x007F);
    if (ascii == ' ') {
        printf(" ' ' ");
    } else if (ascii == '\t') {
        printf(" \t ");
    } else if (ascii == '\n') {
        printf(" \n ");
    } else {
        printf(" ");
    }
}
/*
 * Dump a UChar string to stdout as three rows:
 *   1. a header row labelled with 'name' holding the index of each unit,
 *   2. a "uni" row with each code unit as a \uXXXX escape,
 *   3. a "ch" row with each unit rendered by prettyPrintUChar().
 *
 * name - label for the index row.
 * uch  - the UChars to print.
 * len  - number of UChars; when -1 and 'uch' is non-null the length is
 *        taken from u_strlen(uch).
 */
void printUChars(const char *name = "?",
                 const UChar *uch = kNone,
                 int32_t len = -1 )
{
    int32_t pos;

    /* Resolve the "measure it yourself" length request. */
    if( uch && (len == -1) ) {
        len = u_strlen(uch);
    }

    /* Row 1: indices. */
    printf("% 5s:", name);
    pos = 0;
    while( pos < len ) {
        printf("%- 6d ", pos);
        ++pos;
    }
    printf("\n");

    /* Row 2: code unit values. */
    printf("% 5s: ", "uni");
    pos = 0;
    while( pos < len ) {
        printf("\\u%04X ", (int)uch[pos]);
        ++pos;
    }
    printf("\n");

    /* Row 3: readable rendering of each code unit. */
    printf("% 5s: ", "ch");
    pos = 0;
    while( pos < len ) {
        prettyPrintUChar(uch[pos]);
        ++pos;
    }
    printf("\n");
}
/*
 * Dump a byte string to stdout as three rows:
 *   1. a header row labelled with 'name' holding the index of each byte,
 *   2. a "uni" row with each byte as a \xXX escape,
 *   3. a "ch" row showing graphic characters quoted and everything else
 *      as a blank placeholder.
 *
 * name - label for the index row.
 * uch  - the bytes to print.
 * len  - number of bytes; when -1 and 'uch' is non-null the length is
 *        taken from strlen(uch).
 */
void printBytes(const char *name = "?",
                const char *uch = "",
                int32_t len = -1 )
{
    int32_t i;

    if( (len == -1) && (uch) ) {
        len = (int32_t)strlen(uch);
    }

    printf("% 5s:", name);
    for( i = 0; i <len; i++) {
        printf(" %- 4d", i);
    }
    printf("\n");

    printf("% 5s: ", "uni");
    for( i = 0; i <len; i++) {
        printf("\\x%02X ", 0x00FF & (int)uch[i]);
    }
    printf("\n");

    printf("% 5s: ", "ch");
    for( i = 0; i <len; i++) {
        /* The <ctype.h> classifiers are only defined for EOF and values
         * representable as unsigned char.  Converted codepage output
         * routinely contains bytes >= 0x80, which are negative where plain
         * char is signed, so convert before classifying. */
        if(isgraph((unsigned char)uch[i])) {
            printf(" '%c' ", (char)uch[i]);
        } else {
            printf(" ");
        }
    }
    printf("\n");
}
/*******************************************************************
Very simple C++ sample to convert the word 'Moscow' in Russian, followed
by an exclamation mark (!) into the KOI8-R Russian code page.
This example first creates a UnicodeString out of the Unicode chars.
targetSize must be set to the amount of space available in the target
buffer. After UnicodeConverterCPP::fromUnicodeString() is called,
targetSize will contain the number of bytes in target[] which were
used in the resulting codepage. In this case, there is a 1:1 mapping
between the input and output characters. The exclamation mark has the
same value in both KOI8-R and Unicode.
src: 0 1 2 3 4 5 6
uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
targ: 0 1 2 3 4 5 6
uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
ch: '!'
*/
/*
 * Sample 01 (C++ API): convert a short Cyrillic string to KOI8-R using
 * UnicodeConverterCPP::fromUnicodeString(), then print the source UChars
 * and the resulting bytes.  Failures abort via assert(); the function
 * itself always returns U_ZERO_ERROR.
 */
UErrorCode convsample_01()
{
    printf("\n\n==============================================\n"
           "Sample 01: C++: simple Unicode -> koi8-r conversion\n");

    // **************************** START SAMPLE *******************
    // "Moskva!" in cyrillic letters, to be converted to the KOI8-R
    // Russian code page.
    UChar moscow[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
                       0x0430, 0x0021, 0x0000 };
    UnicodeString moscowString(moscow);

    // Output buffer; koi8Length starts out as the capacity and is handed
    // to fromUnicodeString(), after which it is used as the byte count.
    char koi8Bytes[100];
    int32_t koi8Length = sizeof(koi8Bytes);

    UErrorCode errorCode = U_ZERO_ERROR;

    // set up the converter
    UnicodeConverterCPP koi8Converter("koi8-r", errorCode);
    assert(U_SUCCESS(errorCode));

    // convert to KOI8-R
    koi8Converter.fromUnicodeString(koi8Bytes, koi8Length, moscowString, errorCode);
    assert(U_SUCCESS(errorCode));
    // ***************************** END SAMPLE ********************

    // Print it out
    printUChars("src", moscow);
    printf("\n");
    printBytes("targ", koi8Bytes, koi8Length);

    return U_ZERO_ERROR;
}
/******************************************************
Similar sample to the preceding one.
You must call ucnv_close to clean up the memory used by the
converter.
'len' returns the number of OUTPUT bytes resulting from the
conversion. In this case, it will be 9 even though there are
only 6 unicode characters going in. This is because the
letters 'cat' each only take up one byte, but the remaining
three UChars each take up 2 bytes in the output codepage.
src: 0 1 2 3 4 5
uni: \u0063 \u0061 \u0074 \u732B \uFF2F \uFF2B
ch: 'c' 'a' 't' CJK UN
targ: 0 1 2 3 4 5 6 7 8
uni: \x63 \x61 \x74 \x94 \x4C \x82 \x6E \x82 \x6A
ch: 'c' 'a' 't' [not readable here........]
*/
/*
 * Sample 02 (C API): convert a mixed ASCII/CJK/fullwidth string to
 * Shift-JIS with ucnv_fromUChars(), close the converter, then print the
 * source UChars and the resulting bytes.  Failures abort via assert();
 * the function itself always returns U_ZERO_ERROR.
 */
UErrorCode convsample_02()
{
    printf("\n\n==============================================\n"
           "Sample 02: C: simple Unicode -> Shift_Jis conversion\n");

    // **************************** START SAMPLE *******************
    // "cat<cat>OK"
    UChar catText[] = { 0x0063, 0x0061, 0x0074, 0x732B, 0xFF2F, 0xFF2B,
                        0x0000 };
    char sjisBytes[100];
    int32_t sjisLength;
    UErrorCode errorCode = U_ZERO_ERROR;

    // set up the converter
    UConverter *sjisConverter = ucnv_open("shift_jis", &errorCode);
    assert(U_SUCCESS(errorCode));

    // convert to shift-jis; -1 lets the converter measure the
    // NUL-terminated source itself
    sjisLength = ucnv_fromUChars(sjisConverter, sjisBytes, 100,
                                 catText, -1, &errorCode);
    assert(U_SUCCESS(errorCode));

    // close the converter
    ucnv_close(sjisConverter);
    // ***************************** END SAMPLE ********************

    // Print it out
    printUChars("src", catText);
    printf("\n");
    printBytes("targ", sjisBytes, sjisLength);

    return U_ZERO_ERROR;
}
/*
 * Sample 03 (C API): print the number of available converters followed by
 * the name of each one, separated by spaces.  Always returns U_ZERO_ERROR.
 */
UErrorCode convsample_03()
{
    printf("\n\n==============================================\n"
           "Sample 03: C: print out all converters\n");

    int32_t available;
    int32_t index = 0;

    // **************************** START SAMPLE *******************
    available = ucnv_countAvailable();
    printf("Available converters: %d\n", available);

    while(index < available)
    {
        printf("%s ", ucnv_getAvailableName(index));
        ++index;
    }
    // ***************************** END SAMPLE ********************

    printf("\n");
    return U_ZERO_ERROR;
}
#define BUFFERSIZE 17 /* make it interesting :) */
/*
Converting from a codepage to Unicode in bulk..
What is the best way to determine the buffer size?
The 'buffersize' is in bytes of input.
For a given converter, dividing this by the minimum char size
gives you the maximum number of Unicode characters that could be
expected for a given number of input bytes.
see: ucnv_getMinCharSize()
For example, a single byte codepage like 'Latin-3' has a
minimum char size of 1. (It takes at least 1 byte to represent
each Unicode char.) So the unicode buffer has the same number of
UChars as the input buffer has bytes.
In a strictly double byte codepage such as cp1362 (Windows
Korean), the minimum char size is 2. So, only half as many Unicode
chars as bytes are needed.
This work to calculate the buffer size is an optimization. Any
size of input and output buffer can be used, as long as the
program handles the following cases: If the input buffer is empty,
the source pointer will be equal to sourceLimit. If the output
buffer is full, U_INDEX_OUTOFBOUNDS_ERROR will be returned.
*/
/*
 * Sample 05 (C API): count the letters in the UTF-8 file "data01.ut8".
 *
 * The file is read BUFFERSIZE bytes at a time.  Each chunk is converted to
 * UChars with ucnv_toUnicode(); because the UChar buffer may be too small
 * to hold a whole chunk, the conversion is repeated until the converter
 * has consumed every input byte.  Each produced UChar is tested with
 * u_isalpha().
 *
 * Returns U_FILE_ACCESS_ERROR if the data file cannot be opened, otherwise
 * U_ZERO_ERROR.  Converter and allocation failures abort via assert().
 */
UErrorCode convsample_05()
{
    printf("\n\n==============================================\n"
           "Sample 05: C: count the number of letters in a UTF-8 document\n");

    FILE *f;
    int32_t count;
    char inBuf[BUFFERSIZE];
    const char *source;
    const char *sourceLimit;
    UChar *uBuf;
    UChar *target;
    UChar *targetLimit;
    UChar *p;
    int32_t uBufSize = 0;
    UConverter *conv;
    UErrorCode status = U_ZERO_ERROR;
    uint32_t letters=0, total=0;

    f = fopen("data01.ut8", "r");
    if(!f)
    {
        fprintf(stderr, "Couldn't open file 'data01.ut8' (UTF-8 data file).\n");
        return U_FILE_ACCESS_ERROR;
    }

    // **************************** START SAMPLE *******************
    conv = ucnv_open("utf-8", &status);
    assert(U_SUCCESS(status));

    uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    printf("input bytes %d / min chars %d = %d UChars\n",
           BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    assert(uBuf!=NULL);

    // grab another buffer's worth
    while((!feof(f)) &&
          ((count=(int32_t)fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    {
        // Convert bytes to unicode
        source = inBuf;
        sourceLimit = inBuf + count;

        do
        {
            // Start each pass with the whole UChar buffer available.
            target = uBuf;
            targetLimit = uBuf + uBufSize;

            ucnv_toUnicode(conv, &target, targetLimit,
                           &source, sourceLimit, NULL,
                           feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
                           /* is true (when no more data will come) */
                           &status);

            if(status == U_INDEX_OUTOFBOUNDS_ERROR)
            {
                // simply ran out of space - we'll reset the target ptr the next
                // time through the loop.
                status = U_ZERO_ERROR;
            }
            else
            {
                // Any other failure is a real error: stop here rather
                // than silently clearing it.
                assert(U_SUCCESS(status));
            }

            // Process the Unicode
            // Todo: handle UTF-16/surrogates
            for(p = uBuf; p<target; p++)
            {
                if(u_isalpha(*p))
                    letters++;
                total++;
            }
        } while (source < sourceLimit); // while simply out of space
    }

    // The counters are unsigned, so print them with %u.
    printf("%u letters out of %u total UChars.\n",
           (unsigned)letters, (unsigned)total);
    // ***************************** END SAMPLE ********************

    // Release everything acquired above.
    ucnv_close(conv);
    free(uBuf);
    fclose(f);

    printf("\n");
    return U_ZERO_ERROR;
}
#undef BUFFERSIZE
/*
 * Program entry point: run every sample in order.  The UErrorCode each
 * sample returns is not inspected.
 */
int main()
{
    convsample_01();    /* C++: Unicode -> KOI8-R */
    convsample_02();    /* C: Unicode -> Shift-JIS */
    convsample_03();    /* C: list the available converters */
    //convsample_04(); /* not written yet */
    convsample_05();    /* C: count letters in a UTF-8 file */

    return 0;
}