| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2000, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: ushape.c |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2000jun29 |
| * created by: Markus W. Scherer |
| */ |
| |
| #include "unicode/utypes.h" |
| #include "unicode/uchar.h" |
| #include "unicode/ustring.h" |
| #include "cmemory.h" |
| #include "unicode/ushape.h" |
| |
| #if UTF_SIZE<16 |
/*
 * This implementation assumes that the internal encoding is UTF-16
 * or UTF-32, not UTF-8.
 * The main assumption is that the Arabic characters and their
 * presentation forms each fit into a single UChar.
 * With UTF-8, they occupy 2 or 3 bytes each, which is more than the
 * single byte of an ASCII character.
 */
| # error This implementation assumes UTF-16 or UTF-32 (check UTF_SIZE) |
| #endif |
| |
| /* |
| * This function shapes European digits to Arabic-Indic digits |
| * in-place, writing over the input characters. |
| * Since we know that we are only looking for BMP code points, |
| * we can safely just work with code units (again, at least UTF-16). |
| */ |
| static void |
| _shapeToArabicDigitsWithContext(UChar *s, int32_t length, |
| UChar digitBase, |
| UBool isLogical, UBool lastStrongWasAL) { |
| int32_t i; |
| UChar c; |
| |
| digitBase-=0x30; |
| |
| /* the iteration direction depends on the type of input */ |
| if(isLogical) { |
| for(i=0; i<length; ++i) { |
| c=s[i]; |
| switch(u_charDirection(c)) { |
| case U_LEFT_TO_RIGHT: /* L */ |
| case U_RIGHT_TO_LEFT: /* R */ |
| lastStrongWasAL=FALSE; |
| break; |
| case U_RIGHT_TO_LEFT_ARABIC: /* AL */ |
| lastStrongWasAL=TRUE; |
| break; |
| case U_EUROPEAN_NUMBER: /* EN */ |
| if(lastStrongWasAL && (uint32_t)(c-0x30)<10) { |
| s[i]=(UChar)(digitBase+c); /* digitBase+(c-0x30) - digitBase was modified above */ |
| } |
| break; |
| default : |
| break; |
| } |
| } |
| } else { |
| for(i=length; i>0; /* pre-decrement in the body */) { |
| c=s[--i]; |
| switch(u_charDirection(c)) { |
| case U_LEFT_TO_RIGHT: /* L */ |
| case U_RIGHT_TO_LEFT: /* R */ |
| lastStrongWasAL=FALSE; |
| break; |
| case U_RIGHT_TO_LEFT_ARABIC: /* AL */ |
| lastStrongWasAL=TRUE; |
| break; |
| case U_EUROPEAN_NUMBER: /* EN */ |
| if(lastStrongWasAL && (uint32_t)(c-0x30)<10) { |
| s[i]=(UChar)(digitBase+c); /* digitBase+(c-0x30) - digitBase was modified above */ |
| } |
| break; |
| default : |
| break; |
| } |
| } |
| } |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| u_shapeArabic(const UChar *source, int32_t sourceLength, |
| UChar *dest, int32_t destSize, |
| uint32_t options, |
| UErrorCode *pErrorCode) { |
| /* usual error checking */ |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return 0; |
| } |
| |
| /* make sure that no reserved options values are used; allow dest==NULL only for preflighting */ |
| if( source==NULL || sourceLength<-1 || |
| (dest==NULL && destSize!=0) || destSize<0 || |
| options>=U_SHAPE_DIGIT_TYPE_RESERVED || |
| (options&U_SHAPE_LENGTH_MASK)==U_SHAPE_LENGTH_RESERVED || |
| (options&U_SHAPE_LETTERS_MASK)==U_SHAPE_LETTERS_RESERVED || |
| (options&U_SHAPE_DIGITS_MASK)>=U_SHAPE_DIGITS_RESERVED |
| ) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| /* determine the source length */ |
| if(sourceLength==-1) { |
| sourceLength=u_strlen(source); |
| } |
| if(sourceLength==0) { |
| return 0; |
| } |
| |
| /* check that source and destination do not overlap */ |
| if( dest!=NULL && |
| ((source<=dest && dest<source+sourceLength) || |
| (dest<=source && source<dest+destSize)) |
| ) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| if((options&U_SHAPE_LETTERS_MASK)!=U_SHAPE_LETTERS_NOOP) { |
| /* currently, only number shaping is supported */ |
| *pErrorCode=U_UNSUPPORTED_ERROR; |
| return 0; |
| } else { |
| /* |
| * No letter shaping: |
| * just make sure the destination is large enough and copy the string. |
| */ |
| if(destSize<sourceLength) { |
| /* this catches preflighting, too */ |
| *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| return sourceLength; |
| } |
| uprv_memcpy(dest, source, sourceLength*U_SIZEOF_UCHAR); |
| destSize=sourceLength; |
| } |
| |
| /* |
| * Perform number shaping. |
| * With UTF-16 or UTF-32, the length of the string is constant. |
| * The easiest way to do this is to operate on the destination and |
| * "shape" the digits in-place. |
| */ |
| if((options&U_SHAPE_DIGITS_MASK)!=U_SHAPE_DIGITS_NOOP) { |
| UChar digitBase; |
| int32_t i; |
| |
| /* select the requested digit group */ |
| switch(options&U_SHAPE_DIGIT_TYPE_MASK) { |
| case U_SHAPE_DIGIT_TYPE_AN: |
| digitBase=0x660; /* Unicode: "Arabic-Indic digits" */ |
| break; |
| case U_SHAPE_DIGIT_TYPE_AN_EXTENDED: |
| digitBase=0x6f0; /* Unicode: "Eastern Arabic-Indic digits (Persian and Urdu)" */ |
| break; |
| default: |
| /* will never occur because of validity checks above */ |
| digitBase=0; |
| break; |
| } |
| |
| /* perform the requested operation */ |
| switch(options&U_SHAPE_DIGITS_MASK) { |
| case U_SHAPE_DIGITS_EN2AN: |
| /* add (digitBase-'0') to each European (ASCII) digit code point */ |
| digitBase-=0x30; |
| for(i=0; i<destSize; ++i) { |
| if(((uint32_t)dest[i]-0x30)<10) { |
| dest[i]+=digitBase; |
| } |
| } |
| break; |
| case U_SHAPE_DIGITS_AN2EN: |
| /* subtract (digitBase-'0') from each Arabic digit code point */ |
| for(i=0; i<destSize; ++i) { |
| if(((uint32_t)dest[i]-(uint32_t)digitBase)<10) { |
| dest[i]-=digitBase-0x30; |
| } |
| } |
| break; |
| case U_SHAPE_DIGITS_ALEN2AN_INIT_LR: |
| _shapeToArabicDigitsWithContext(dest, destSize, |
| digitBase, |
| (UBool)((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTION_LOGICAL), |
| FALSE); |
| break; |
| case U_SHAPE_DIGITS_ALEN2AN_INIT_AL: |
| _shapeToArabicDigitsWithContext(dest, destSize, |
| digitBase, |
| (UBool)((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTION_LOGICAL), |
| TRUE); |
| break; |
| default: |
| /* will never occur because of validity checks above */ |
| break; |
| } |
| } |
| |
| return destSize; |
| } |