| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ****************************************************************************** |
| * |
| * Copyright (C) 2001-2016, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ****************************************************************************** |
| * |
| * File ustrtrns.cpp |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * 9/10/2001 Ram Creation. |
| ****************************************************************************** |
| */ |
| |
| /******************************************************************************* |
| * |
| * u_strTo* and u_strFrom* APIs |
| * WCS functions moved to ustr_wcs.c for better modularization |
| * |
| ******************************************************************************* |
| */ |
| |
| |
| #include "unicode/putil.h" |
| #include "unicode/ustring.h" |
| #include "unicode/utf.h" |
| #include "unicode/utf8.h" |
| #include "unicode/utf16.h" |
| #include "cstring.h" |
| #include "cmemory.h" |
| #include "ustr_imp.h" |
| #include "uassert.h" |
| |
| U_CAPI UChar* U_EXPORT2 |
| u_strFromUTF32WithSub(UChar *dest, |
| int32_t destCapacity, |
| int32_t *pDestLength, |
| const UChar32 *src, |
| int32_t srcLength, |
| UChar32 subchar, int32_t *pNumSubstitutions, |
| UErrorCode *pErrorCode) { |
| const UChar32 *srcLimit; |
| UChar32 ch; |
| UChar *destLimit; |
| UChar *pDest; |
| int32_t reqLength; |
| int32_t numSubstitutions; |
| |
| /* args check */ |
| if(U_FAILURE(*pErrorCode)){ |
| return NULL; |
| } |
| if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
| subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
| ) { |
| *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| |
| if(pNumSubstitutions != NULL) { |
| *pNumSubstitutions = 0; |
| } |
| |
| pDest = dest; |
| destLimit = (dest!=NULL)?(dest + destCapacity):NULL; |
| reqLength = 0; |
| numSubstitutions = 0; |
| |
| if(srcLength < 0) { |
| /* simple loop for conversion of a NUL-terminated BMP string */ |
| while((ch=*src) != 0 && |
| ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { |
| ++src; |
| if(pDest < destLimit) { |
| *pDest++ = (UChar)ch; |
| } else { |
| ++reqLength; |
| } |
| } |
| srcLimit = src; |
| if(ch != 0) { |
| /* "complicated" case, find the end of the remaining string */ |
| while(*++srcLimit != 0) {} |
| } |
| } else { |
| srcLimit = (src!=NULL)?(src + srcLength):NULL; |
| } |
| |
| /* convert with length */ |
| while(src < srcLimit) { |
| ch = *src++; |
| do { |
| /* usually "loops" once; twice only for writing subchar */ |
| if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { |
| if(pDest < destLimit) { |
| *pDest++ = (UChar)ch; |
| } else { |
| ++reqLength; |
| } |
| break; |
| } else if(0x10000 <= ch && ch <= 0x10ffff) { |
| if(pDest!=NULL && ((pDest + 2) <= destLimit)) { |
| *pDest++ = U16_LEAD(ch); |
| *pDest++ = U16_TRAIL(ch); |
| } else { |
| reqLength += 2; |
| } |
| break; |
| } else if((ch = subchar) < 0) { |
| /* surrogate code point, or not a Unicode code point at all */ |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } else { |
| ++numSubstitutions; |
| } |
| } while(TRUE); |
| } |
| |
| reqLength += (int32_t)(pDest - dest); |
| if(pDestLength) { |
| *pDestLength = reqLength; |
| } |
| if(pNumSubstitutions != NULL) { |
| *pNumSubstitutions = numSubstitutions; |
| } |
| |
| /* Terminate the buffer */ |
| u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
| |
| return dest; |
| } |
| |
| U_CAPI UChar* U_EXPORT2 |
| u_strFromUTF32(UChar *dest, |
| int32_t destCapacity, |
| int32_t *pDestLength, |
| const UChar32 *src, |
| int32_t srcLength, |
| UErrorCode *pErrorCode) { |
| return u_strFromUTF32WithSub( |
| dest, destCapacity, pDestLength, |
| src, srcLength, |
| U_SENTINEL, NULL, |
| pErrorCode); |
| } |
| |
| U_CAPI UChar32* U_EXPORT2 |
| u_strToUTF32WithSub(UChar32 *dest, |
| int32_t destCapacity, |
| int32_t *pDestLength, |
| const UChar *src, |
| int32_t srcLength, |
| UChar32 subchar, int32_t *pNumSubstitutions, |
| UErrorCode *pErrorCode) { |
| const UChar *srcLimit; |
| UChar32 ch; |
| UChar ch2; |
| UChar32 *destLimit; |
| UChar32 *pDest; |
| int32_t reqLength; |
| int32_t numSubstitutions; |
| |
| /* args check */ |
| if(U_FAILURE(*pErrorCode)){ |
| return NULL; |
| } |
| if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
| subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
| ) { |
| *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| |
| if(pNumSubstitutions != NULL) { |
| *pNumSubstitutions = 0; |
| } |
| |
| pDest = dest; |
| destLimit = (dest!=NULL)?(dest + destCapacity):NULL; |
| reqLength = 0; |
| numSubstitutions = 0; |
| |
| if(srcLength < 0) { |
| /* simple loop for conversion of a NUL-terminated BMP string */ |
| while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { |
| ++src; |
| if(pDest < destLimit) { |
| *pDest++ = ch; |
| } else { |
| ++reqLength; |
| } |
| } |
| srcLimit = src; |
| if(ch != 0) { |
| /* "complicated" case, find the end of the remaining string */ |
| while(*++srcLimit != 0) {} |
| } |
| } else { |
| srcLimit = (src!=NULL)?(src + srcLength):NULL; |
| } |
| |
| /* convert with length */ |
| while(src < srcLimit) { |
| ch = *src++; |
| if(!U16_IS_SURROGATE(ch)) { |
| /* write or count ch below */ |
| } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { |
| ++src; |
| ch = U16_GET_SUPPLEMENTARY(ch, ch2); |
| } else if((ch = subchar) < 0) { |
| /* unpaired surrogate */ |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } else { |
| ++numSubstitutions; |
| } |
| if(pDest < destLimit) { |
| *pDest++ = ch; |
| } else { |
| ++reqLength; |
| } |
| } |
| |
| reqLength += (int32_t)(pDest - dest); |
| if(pDestLength) { |
| *pDestLength = reqLength; |
| } |
| if(pNumSubstitutions != NULL) { |
| *pNumSubstitutions = numSubstitutions; |
| } |
| |
| /* Terminate the buffer */ |
| u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); |
| |
| return dest; |
| } |
| |
| U_CAPI UChar32* U_EXPORT2 |
| u_strToUTF32(UChar32 *dest, |
| int32_t destCapacity, |
| int32_t *pDestLength, |
| const UChar *src, |
| int32_t srcLength, |
| UErrorCode *pErrorCode) { |
| return u_strToUTF32WithSub( |
| dest, destCapacity, pDestLength, |
| src, srcLength, |
| U_SENTINEL, NULL, |
| pErrorCode); |
| } |
| |
| U_CAPI UChar* U_EXPORT2 |
| u_strFromUTF8WithSub(UChar *dest, |
| int32_t destCapacity, |
| int32_t *pDestLength, |
| const char* src, |
| int32_t srcLength, |
| UChar32 subchar, int32_t *pNumSubstitutions, |
| UErrorCode *pErrorCode){ |
| /* args check */ |
| if(U_FAILURE(*pErrorCode)) { |
| return NULL; |
| } |
| if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
| subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
| ) { |
| *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| |
| if(pNumSubstitutions!=NULL) { |
| *pNumSubstitutions=0; |
| } |
| UChar *pDest = dest; |
| UChar *pDestLimit = dest+destCapacity; |
| int32_t reqLength = 0; |
| int32_t numSubstitutions=0; |
| |
| /* |
| * Inline processing of UTF-8 byte sequences: |
| * |
| * Byte sequences for the most common characters are handled inline in |
| * the conversion loops. In order to reduce the path lengths for those |
| * characters, the tests are arranged in a kind of binary search. |
| * ASCII (<=0x7f) is checked first, followed by the dividing point |
| * between 2- and 3-byte sequences (0xe0). |
| * The 3-byte branch is tested first to speed up CJK text. |
| * The compiler should combine the subtractions for the two tests for 0xe0. |
| * Each branch then tests for the other end of its range. |
| */ |
| |
| if(srcLength < 0){ |
| /* |
| * Transform a NUL-terminated string. |
| * The code explicitly checks for NULs only in the lead byte position. |
| * A NUL byte in the trail byte position fails the trail byte range check anyway. |
| */ |
| int32_t i; |
| UChar32 c; |
| for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) { |
| // modified copy of U8_NEXT() |
| ++i; |
| if(U8_IS_SINGLE(c)) { |
| *pDest++=(UChar)c; |
| } else { |
| uint8_t __t1, __t2; |
| if( /* handle U+0800..U+FFFF inline */ |
| (0xe0<=(c) && (c)<0xf0) && |
| U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && |
| (__t2=src[(i)+1]-0x80)<=0x3f) { |
| *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; |
| i+=2; |
| } else if( /* handle U+0080..U+07FF inline */ |
| ((c)<0xe0 && (c)>=0xc2) && |
| (__t1=src[i]-0x80)<=0x3f) { |
| *pDest++ = (((c)&0x1f)<<6)|__t1; |
| ++(i); |
| } else { |
| /* function call for "complicated" and error cases */ |
| (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1); |
| if(c<0 && (++numSubstitutions, c = subchar) < 0) { |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } else if(c<=0xFFFF) { |
| *(pDest++)=(UChar)c; |
| } else { |
| *(pDest++)=U16_LEAD(c); |
| if(pDest<pDestLimit) { |
| *(pDest++)=U16_TRAIL(c); |
| } else { |
| reqLength++; |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| /* Pre-flight the rest of the string. */ |
| while((c = (uint8_t)src[i]) != 0) { |
| // modified copy of U8_NEXT() |
| ++i; |
| if(U8_IS_SINGLE(c)) { |
| ++reqLength; |
| } else { |
| uint8_t __t1, __t2; |
| if( /* handle U+0800..U+FFFF inline */ |
| (0xe0<=(c) && (c)<0xf0) && |
| U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && |
| (__t2=src[(i)+1]-0x80)<=0x3f) { |
| ++reqLength; |
| i+=2; |
| } else if( /* handle U+0080..U+07FF inline */ |
| ((c)<0xe0 && (c)>=0xc2) && |
| (__t1=src[i]-0x80)<=0x3f) { |
| ++reqLength; |
| ++(i); |
| } else { |
| /* function call for "complicated" and error cases */ |
| (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1); |
| if(c<0 && (++numSubstitutions, c = subchar) < 0) { |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } |
| reqLength += U16_LENGTH(c); |
| } |
| } |
| } |
| } else /* srcLength >= 0 */ { |
| /* Faster loop without ongoing checking for srcLength and pDestLimit. */ |
| int32_t i = 0; |
| UChar32 c; |
| for(;;) { |
| /* |
| * Each iteration of the inner loop progresses by at most 3 UTF-8 |
| * bytes and one UChar, for most characters. |
| * For supplementary code points (4 & 2), which are rare, |
| * there is an additional adjustment. |
| */ |
| int32_t count = (int32_t)(pDestLimit - pDest); |
| int32_t count2 = (srcLength - i) / 3; |
| if(count > count2) { |
| count = count2; /* min(remaining dest, remaining src/3) */ |
| } |
| if(count < 3) { |
| /* |
| * Too much overhead if we get near the end of the string, |
| * continue with the next loop. |
| */ |
| break; |
| } |
| |
| do { |
| // modified copy of U8_NEXT() |
| c = (uint8_t)src[i++]; |
| if(U8_IS_SINGLE(c)) { |
| *pDest++=(UChar)c; |
| } else { |
| uint8_t __t1, __t2; |
| if( /* handle U+0800..U+FFFF inline */ |
| (0xe0<=(c) && (c)<0xf0) && |
| ((i)+1)<srcLength && |
| U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && |
| (__t2=src[(i)+1]-0x80)<=0x3f) { |
| *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; |
| i+=2; |
| } else if( /* handle U+0080..U+07FF inline */ |
| ((c)<0xe0 && (c)>=0xc2) && |
| ((i)!=srcLength) && |
| (__t1=src[i]-0x80)<=0x3f) { |
| *pDest++ = (((c)&0x1f)<<6)|__t1; |
| ++(i); |
| } else { |
| if(c >= 0xf0 || subchar > 0xffff) { |
| // We may read up to four bytes and write up to two UChars, |
| // which we didn't account for with computing count, |
| // so we adjust it here. |
| if(--count == 0) { |
| --i; // back out byte c |
| break; |
| } |
| } |
| |
| /* function call for "complicated" and error cases */ |
| (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); |
| if(c<0 && (++numSubstitutions, c = subchar) < 0) { |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } else if(c<=0xFFFF) { |
| *(pDest++)=(UChar)c; |
| } else { |
| *(pDest++)=U16_LEAD(c); |
| *(pDest++)=U16_TRAIL(c); |
| } |
| } |
| } |
| } while(--count > 0); |
| } |
| |
| while(i < srcLength && (pDest < pDestLimit)) { |
| // modified copy of U8_NEXT() |
| c = (uint8_t)src[i++]; |
| if(U8_IS_SINGLE(c)) { |
| *pDest++=(UChar)c; |
| } else { |
| uint8_t __t1, __t2; |
| if( /* handle U+0800..U+FFFF inline */ |
| (0xe0<=(c) && (c)<0xf0) && |
| ((i)+1)<srcLength && |
| U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && |
| (__t2=src[(i)+1]-0x80)<=0x3f) { |
| *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; |
| i+=2; |
| } else if( /* handle U+0080..U+07FF inline */ |
| ((c)<0xe0 && (c)>=0xc2) && |
| ((i)!=srcLength) && |
| (__t1=src[i]-0x80)<=0x3f) { |
| *pDest++ = (((c)&0x1f)<<6)|__t1; |
| ++(i); |
| } else { |
| /* function call for "complicated" and error cases */ |
| (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); |
| if(c<0 && (++numSubstitutions, c = subchar) < 0) { |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } else if(c<=0xFFFF) { |
| *(pDest++)=(UChar)c; |
| } else { |
| *(pDest++)=U16_LEAD(c); |
| if(pDest<pDestLimit) { |
| *(pDest++)=U16_TRAIL(c); |
| } else { |
| reqLength++; |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| /* Pre-flight the rest of the string. */ |
| while(i < srcLength) { |
| // modified copy of U8_NEXT() |
| c = (uint8_t)src[i++]; |
| if(U8_IS_SINGLE(c)) { |
| ++reqLength; |
| } else { |
| uint8_t __t1, __t2; |
| if( /* handle U+0800..U+FFFF inline */ |
| (0xe0<=(c) && (c)<0xf0) && |
| ((i)+1)<srcLength && |
| U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && |
| (__t2=src[(i)+1]-0x80)<=0x3f) { |
| ++reqLength; |
| i+=2; |
| } else if( /* handle U+0080..U+07FF inline */ |
| ((c)<0xe0 && (c)>=0xc2) && |
| ((i)!=srcLength) && |
| (__t1=src[i]-0x80)<=0x3f) { |
| ++reqLength; |
| ++(i); |
| } else { |
| /* function call for "complicated" and error cases */ |
| (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); |
| if(c<0 && (++numSubstitutions, c = subchar) < 0) { |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } |
| reqLength += U16_LENGTH(c); |
| } |
| } |
| } |
| } |
| |
| reqLength+=(int32_t)(pDest - dest); |
| |
| if(pNumSubstitutions!=NULL) { |
| *pNumSubstitutions=numSubstitutions; |
| } |
| |
| if(pDestLength){ |
| *pDestLength = reqLength; |
| } |
| |
| /* Terminate the buffer */ |
| u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); |
| |
| return dest; |
| } |
| |
| U_CAPI UChar* U_EXPORT2 |
| u_strFromUTF8(UChar *dest, |
| int32_t destCapacity, |
| int32_t *pDestLength, |
| const char* src, |
| int32_t srcLength, |
| UErrorCode *pErrorCode){ |
| return u_strFromUTF8WithSub( |
| dest, destCapacity, pDestLength, |
| src, srcLength, |
| U_SENTINEL, NULL, |
| pErrorCode); |
| } |
| |
| U_CAPI UChar * U_EXPORT2 |
| u_strFromUTF8Lenient(UChar *dest, |
| int32_t destCapacity, |
| int32_t *pDestLength, |
| const char *src, |
| int32_t srcLength, |
| UErrorCode *pErrorCode) { |
| UChar *pDest = dest; |
| UChar32 ch; |
| int32_t reqLength = 0; |
| uint8_t* pSrc = (uint8_t*) src; |
| |
| /* args check */ |
| if(U_FAILURE(*pErrorCode)){ |
| return NULL; |
| } |
| |
| if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| (destCapacity<0) || (dest == NULL && destCapacity > 0) |
| ) { |
| *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| |
| if(srcLength < 0) { |
| /* Transform a NUL-terminated string. */ |
| UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; |
| uint8_t t1, t2, t3; /* trail bytes */ |
| |
| while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { |
| if(ch < 0xc0) { |
| /* |
| * ASCII, or a trail byte in lead position which is treated like |
| * a single-byte sequence for better character boundary |
| * resynchronization after illegal sequences. |
| */ |
| *pDest++=(UChar)ch; |
| ++pSrc; |
| continue; |
| } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
| if((t1 = pSrc[1]) != 0) { |
| /* 0x3080 = (0xc0 << 6) + 0x80 */ |
| *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); |
| pSrc += 2; |
| continue; |
| } |
| } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
| if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { |
| /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
| /* 0x2080 = (0x80 << 6) + 0x80 */ |
| *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); |
| pSrc += 3; |
| continue; |
| } |
| } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
| if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { |
| pSrc += 4; |
| /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ |
| ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; |
| *(pDest++) = U16_LEAD(ch); |
| if(pDest < pDestLimit) { |
| *(pDest++) = U16_TRAIL(ch); |
| } else { |
| reqLength = 1; |
| break; |
| } |
| continue; |
| } |
| } |
| |
| /* truncated character at the end */ |
| *pDest++ = 0xfffd; |
| while(*++pSrc != 0) {} |
| break; |
| } |
| |
| /* Pre-flight the rest of the string. */ |
| while((ch = *pSrc) != 0) { |
| if(ch < 0xc0) { |
| /* |
| * ASCII, or a trail byte in lead position which is treated like |
| * a single-byte sequence for better character boundary |
| * resynchronization after illegal sequences. |
| */ |
| ++reqLength; |
| ++pSrc; |
| continue; |
| } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
| if(pSrc[1] != 0) { |
| ++reqLength; |
| pSrc += 2; |
| continue; |
| } |
| } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
| if(pSrc[1] != 0 && pSrc[2] != 0) { |
| ++reqLength; |
| pSrc += 3; |
| continue; |
| } |
| } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
| if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { |
| reqLength += 2; |
| pSrc += 4; |
| continue; |
| } |
| } |
| |
| /* truncated character at the end */ |
| ++reqLength; |
| break; |
| } |
| } else /* srcLength >= 0 */ { |
| const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL; |
| |
| /* |
| * This function requires that if srcLength is given, then it must be |
| * destCapatity >= srcLength so that we need not check for |
| * destination buffer overflow in the loop. |
| */ |
| if(destCapacity < srcLength) { |
| if(pDestLength != NULL) { |
| *pDestLength = srcLength; /* this likely overestimates the true destLength! */ |
| } |
| *pErrorCode = U_BUFFER_OVERFLOW_ERROR; |
| return NULL; |
| } |
| |
| if((pSrcLimit - pSrc) >= 4) { |
| pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ |
| |
| /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ |
| do { |
| ch = *pSrc++; |
| if(ch < 0xc0) { |
| /* |
| * ASCII, or a trail byte in lead position which is treated like |
| * a single-byte sequence for better character boundary |
| * resynchronization after illegal sequences. |
| */ |
| *pDest++=(UChar)ch; |
| } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
| /* 0x3080 = (0xc0 << 6) + 0x80 */ |
| *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); |
| } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
| /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
| /* 0x2080 = (0x80 << 6) + 0x80 */ |
| ch = (ch << 12) + (*pSrc++ << 6); |
| *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); |
| } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
| /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ |
| ch = (ch << 18) + (*pSrc++ << 12); |
| ch += *pSrc++ << 6; |
| ch += *pSrc++ - 0x3c82080; |
| *(pDest++) = U16_LEAD(ch); |
| *(pDest++) = U16_TRAIL(ch); |
| } |
| } while(pSrc < pSrcLimit); |
| |
| pSrcLimit += 3; /* restore original pSrcLimit */ |
| } |
| |
| while(pSrc < pSrcLimit) { |
| ch = *pSrc++; |
| if(ch < 0xc0) { |
| /* |
| * ASCII, or a trail byte in lead position which is treated like |
| * a single-byte sequence for better character boundary |
| * resynchronization after illegal sequences. |
| */ |
| *pDest++=(UChar)ch; |
| continue; |
| } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
| if(pSrc < pSrcLimit) { |
| /* 0x3080 = (0xc0 << 6) + 0x80 */ |
| *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); |
| continue; |
| } |
| } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
| if((pSrcLimit - pSrc) >= 2) { |
| /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
| /* 0x2080 = (0x80 << 6) + 0x80 */ |
| ch = (ch << 12) + (*pSrc++ << 6); |
| *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); |
| pSrc += 3; |
| continue; |
| } |
| } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
| if((pSrcLimit - pSrc) >= 3) { |
| /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ |
| ch = (ch << 18) + (*pSrc++ << 12); |
| ch += *pSrc++ << 6; |
| ch += *pSrc++ - 0x3c82080; |
| *(pDest++) = U16_LEAD(ch); |
| *(pDest++) = U16_TRAIL(ch); |
| pSrc += 4; |
| continue; |
| } |
| } |
| |
| /* truncated character at the end */ |
| *pDest++ = 0xfffd; |
| break; |
| } |
| } |
| |
| reqLength+=(int32_t)(pDest - dest); |
| |
| if(pDestLength){ |
| *pDestLength = reqLength; |
| } |
| |
| /* Terminate the buffer */ |
| u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); |
| |
| return dest; |
| } |
| |
| static inline uint8_t * |
| _appendUTF8(uint8_t *pDest, UChar32 c) { |
| /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ |
| if((c)<=0x7f) { |
| *pDest++=(uint8_t)c; |
| } else if(c<=0x7ff) { |
| *pDest++=(uint8_t)((c>>6)|0xc0); |
| *pDest++=(uint8_t)((c&0x3f)|0x80); |
| } else if(c<=0xffff) { |
| *pDest++=(uint8_t)((c>>12)|0xe0); |
| *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); |
| *pDest++=(uint8_t)(((c)&0x3f)|0x80); |
| } else /* if((uint32_t)(c)<=0x10ffff) */ { |
| *pDest++=(uint8_t)(((c)>>18)|0xf0); |
| *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); |
| *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); |
| *pDest++=(uint8_t)(((c)&0x3f)|0x80); |
| } |
| return pDest; |
| } |
| |
| |
| U_CAPI char* U_EXPORT2 |
| u_strToUTF8WithSub(char *dest, |
| int32_t destCapacity, |
| int32_t *pDestLength, |
| const UChar *pSrc, |
| int32_t srcLength, |
| UChar32 subchar, int32_t *pNumSubstitutions, |
| UErrorCode *pErrorCode){ |
| int32_t reqLength=0; |
| uint32_t ch=0,ch2=0; |
| uint8_t *pDest = (uint8_t *)dest; |
| uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL; |
| int32_t numSubstitutions; |
| |
| /* args check */ |
| if(U_FAILURE(*pErrorCode)){ |
| return NULL; |
| } |
| |
| if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || |
| (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
| subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
| ) { |
| *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| |
| if(pNumSubstitutions!=NULL) { |
| *pNumSubstitutions=0; |
| } |
| numSubstitutions=0; |
| |
| if(srcLength==-1) { |
| while((ch=*pSrc)!=0) { |
| ++pSrc; |
| if(ch <= 0x7f) { |
| if(pDest<pDestLimit) { |
| *pDest++ = (uint8_t)ch; |
| } else { |
| reqLength = 1; |
| break; |
| } |
| } else if(ch <= 0x7ff) { |
| if((pDestLimit - pDest) >= 2) { |
| *pDest++=(uint8_t)((ch>>6)|0xc0); |
| *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| } else { |
| reqLength = 2; |
| break; |
| } |
| } else if(ch <= 0xd7ff || ch >= 0xe000) { |
| if((pDestLimit - pDest) >= 3) { |
| *pDest++=(uint8_t)((ch>>12)|0xe0); |
| *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
| *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| } else { |
| reqLength = 3; |
| break; |
| } |
| } else /* ch is a surrogate */ { |
| int32_t length; |
| |
| /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/ |
| if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { |
| ++pSrc; |
| ch=U16_GET_SUPPLEMENTARY(ch, ch2); |
| } else if(subchar>=0) { |
| ch=subchar; |
| ++numSubstitutions; |
| } else { |
| /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } |
| |
| length = U8_LENGTH(ch); |
| if((pDestLimit - pDest) >= length) { |
| /* convert and append*/ |
| pDest=_appendUTF8(pDest, ch); |
| } else { |
| reqLength = length; |
| break; |
| } |
| } |
| } |
| while((ch=*pSrc++)!=0) { |
| if(ch<=0x7f) { |
| ++reqLength; |
| } else if(ch<=0x7ff) { |
| reqLength+=2; |
| } else if(!U16_IS_SURROGATE(ch)) { |
| reqLength+=3; |
| } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { |
| ++pSrc; |
| reqLength+=4; |
| } else if(subchar>=0) { |
| reqLength+=U8_LENGTH(subchar); |
| ++numSubstitutions; |
| } else { |
| /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } |
| } |
| } else { |
| const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL; |
| int32_t count; |
| |
| /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
| for(;;) { |
| /* |
| * Each iteration of the inner loop progresses by at most 3 UTF-8 |
| * bytes and one UChar, for most characters. |
| * For supplementary code points (4 & 2), which are rare, |
| * there is an additional adjustment. |
| */ |
| count = (int32_t)((pDestLimit - pDest) / 3); |
| srcLength = (int32_t)(pSrcLimit - pSrc); |
| if(count > srcLength) { |
| count = srcLength; /* min(remaining dest/3, remaining src) */ |
| } |
| if(count < 3) { |
| /* |
| * Too much overhead if we get near the end of the string, |
| * continue with the next loop. |
| */ |
| break; |
| } |
| do { |
| ch=*pSrc++; |
| if(ch <= 0x7f) { |
| *pDest++ = (uint8_t)ch; |
| } else if(ch <= 0x7ff) { |
| *pDest++=(uint8_t)((ch>>6)|0xc0); |
| *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| } else if(ch <= 0xd7ff || ch >= 0xe000) { |
| *pDest++=(uint8_t)((ch>>12)|0xe0); |
| *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
| *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| } else /* ch is a surrogate */ { |
| /* |
| * We will read two UChars and probably output four bytes, |
| * which we didn't account for with computing count, |
| * so we adjust it here. |
| */ |
| if(--count == 0) { |
| --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ |
| break; /* recompute count */ |
| } |
| |
| if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { |
| ++pSrc; |
| ch=U16_GET_SUPPLEMENTARY(ch, ch2); |
| |
| /* writing 4 bytes per 2 UChars is ok */ |
| *pDest++=(uint8_t)((ch>>18)|0xf0); |
| *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); |
| *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
| *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| } else { |
| /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
| if(subchar>=0) { |
| ch=subchar; |
| ++numSubstitutions; |
| } else { |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } |
| |
| /* convert and append*/ |
| pDest=_appendUTF8(pDest, ch); |
| } |
| } |
| } while(--count > 0); |
| } |
| |
| while(pSrc<pSrcLimit) { |
| ch=*pSrc++; |
| if(ch <= 0x7f) { |
| if(pDest<pDestLimit) { |
| *pDest++ = (uint8_t)ch; |
| } else { |
| reqLength = 1; |
| break; |
| } |
| } else if(ch <= 0x7ff) { |
| if((pDestLimit - pDest) >= 2) { |
| *pDest++=(uint8_t)((ch>>6)|0xc0); |
| *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| } else { |
| reqLength = 2; |
| break; |
| } |
| } else if(ch <= 0xd7ff || ch >= 0xe000) { |
| if((pDestLimit - pDest) >= 3) { |
| *pDest++=(uint8_t)((ch>>12)|0xe0); |
| *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
| *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| } else { |
| reqLength = 3; |
| break; |
| } |
| } else /* ch is a surrogate */ { |
| int32_t length; |
| |
| if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { |
| ++pSrc; |
| ch=U16_GET_SUPPLEMENTARY(ch, ch2); |
| } else if(subchar>=0) { |
| ch=subchar; |
| ++numSubstitutions; |
| } else { |
| /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } |
| |
| length = U8_LENGTH(ch); |
| if((pDestLimit - pDest) >= length) { |
| /* convert and append*/ |
| pDest=_appendUTF8(pDest, ch); |
| } else { |
| reqLength = length; |
| break; |
| } |
| } |
| } |
| while(pSrc<pSrcLimit) { |
| ch=*pSrc++; |
| if(ch<=0x7f) { |
| ++reqLength; |
| } else if(ch<=0x7ff) { |
| reqLength+=2; |
| } else if(!U16_IS_SURROGATE(ch)) { |
| reqLength+=3; |
| } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { |
| ++pSrc; |
| reqLength+=4; |
| } else if(subchar>=0) { |
| reqLength+=U8_LENGTH(subchar); |
| ++numSubstitutions; |
| } else { |
| /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } |
| } |
| } |
| |
| reqLength+=(int32_t)(pDest - (uint8_t *)dest); |
| |
| if(pNumSubstitutions!=NULL) { |
| *pNumSubstitutions=numSubstitutions; |
| } |
| |
| if(pDestLength){ |
| *pDestLength = reqLength; |
| } |
| |
| /* Terminate the buffer */ |
| u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
| return dest; |
| } |
| |
| U_CAPI char* U_EXPORT2 |
| u_strToUTF8(char *dest, |
| int32_t destCapacity, |
| int32_t *pDestLength, |
| const UChar *pSrc, |
| int32_t srcLength, |
| UErrorCode *pErrorCode){ |
| return u_strToUTF8WithSub( |
| dest, destCapacity, pDestLength, |
| pSrc, srcLength, |
| U_SENTINEL, NULL, |
| pErrorCode); |
| } |
| |
| U_CAPI UChar* U_EXPORT2 |
| u_strFromJavaModifiedUTF8WithSub( |
| UChar *dest, |
| int32_t destCapacity, |
| int32_t *pDestLength, |
| const char *src, |
| int32_t srcLength, |
| UChar32 subchar, int32_t *pNumSubstitutions, |
| UErrorCode *pErrorCode) { |
| /* args check */ |
| if(U_FAILURE(*pErrorCode)) { |
| return NULL; |
| } |
| if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| (dest==NULL && destCapacity!=0) || destCapacity<0 || |
| subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
| ) { |
| *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| |
| if(pNumSubstitutions!=NULL) { |
| *pNumSubstitutions=0; |
| } |
| UChar *pDest = dest; |
| UChar *pDestLimit = dest+destCapacity; |
| int32_t reqLength = 0; |
| int32_t numSubstitutions=0; |
| |
| if(srcLength < 0) { |
| /* |
| * Transform a NUL-terminated ASCII string. |
| * Handle non-ASCII strings with slower code. |
| */ |
| UChar32 c; |
| while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) { |
| *pDest++=(UChar)c; |
| ++src; |
| } |
| if(c == 0) { |
| reqLength=(int32_t)(pDest - dest); |
| if(pDestLength) { |
| *pDestLength = reqLength; |
| } |
| |
| /* Terminate the buffer */ |
| u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
| return dest; |
| } |
| srcLength = static_cast<int32_t>(uprv_strlen(src)); |
| } |
| |
| /* Faster loop without ongoing checking for srcLength and pDestLimit. */ |
| UChar32 ch; |
| uint8_t t1, t2; |
| int32_t i = 0; |
| for(;;) { |
| int32_t count = (int32_t)(pDestLimit - pDest); |
| int32_t count2 = srcLength - i; |
| if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) { |
| /* fast ASCII loop */ |
| int32_t start = i; |
| uint8_t b; |
| while(i < srcLength && U8_IS_SINGLE(b = src[i])) { |
| *pDest++=b; |
| ++i; |
| } |
| int32_t delta = i - start; |
| count -= delta; |
| count2 -= delta; |
| } |
| /* |
| * Each iteration of the inner loop progresses by at most 3 UTF-8 |
| * bytes and one UChar. |
| */ |
| if(subchar > 0xFFFF) { |
| break; |
| } |
| count2 /= 3; |
| if(count > count2) { |
| count = count2; /* min(remaining dest, remaining src/3) */ |
| } |
| if(count < 3) { |
| /* |
| * Too much overhead if we get near the end of the string, |
| * continue with the next loop. |
| */ |
| break; |
| } |
| do { |
| ch = (uint8_t)src[i++]; |
| if(U8_IS_SINGLE(ch)) { |
| *pDest++=(UChar)ch; |
| } else { |
| if(ch >= 0xe0) { |
| if( /* handle U+0000..U+FFFF inline */ |
| ch <= 0xef && |
| (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f && |
| (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f |
| ) { |
| /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
| *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
| i += 2; |
| continue; |
| } |
| } else { |
| if( /* handle U+0000..U+07FF inline */ |
| ch >= 0xc0 && |
| (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f |
| ) { |
| *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
| ++i; |
| continue; |
| } |
| } |
| |
| if(subchar < 0) { |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } else if(subchar > 0xffff && --count == 0) { |
| /* |
| * We need to write two UChars, adjusted count for that, |
| * and ran out of space. |
| */ |
| --i; // back out byte ch |
| break; |
| } else { |
| /* function call for error cases */ |
| utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); |
| ++numSubstitutions; |
| *(pDest++)=(UChar)subchar; |
| } |
| } |
| } while(--count > 0); |
| } |
| |
| while(i < srcLength && (pDest < pDestLimit)) { |
| ch = (uint8_t)src[i++]; |
| if(U8_IS_SINGLE(ch)){ |
| *pDest++=(UChar)ch; |
| } else { |
| if(ch >= 0xe0) { |
| if( /* handle U+0000..U+FFFF inline */ |
| ch <= 0xef && |
| (i+1) < srcLength && |
| (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f && |
| (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f |
| ) { |
| /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
| *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
| i += 2; |
| continue; |
| } |
| } else { |
| if( /* handle U+0000..U+07FF inline */ |
| ch >= 0xc0 && |
| i < srcLength && |
| (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f |
| ) { |
| *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
| ++i; |
| continue; |
| } |
| } |
| |
| if(subchar < 0) { |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } else { |
| /* function call for error cases */ |
| utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); |
| ++numSubstitutions; |
| if(subchar<=0xFFFF) { |
| *(pDest++)=(UChar)subchar; |
| } else { |
| *(pDest++)=U16_LEAD(subchar); |
| if(pDest<pDestLimit) { |
| *(pDest++)=U16_TRAIL(subchar); |
| } else { |
| reqLength++; |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| /* Pre-flight the rest of the string. */ |
| while(i < srcLength) { |
| ch = (uint8_t)src[i++]; |
| if(U8_IS_SINGLE(ch)) { |
| reqLength++; |
| } else { |
| if(ch >= 0xe0) { |
| if( /* handle U+0000..U+FFFF inline */ |
| ch <= 0xef && |
| (i+1) < srcLength && |
| (uint8_t)(src[i] - 0x80) <= 0x3f && |
| (uint8_t)(src[i+1] - 0x80) <= 0x3f |
| ) { |
| reqLength++; |
| i += 2; |
| continue; |
| } |
| } else { |
| if( /* handle U+0000..U+07FF inline */ |
| ch >= 0xc0 && |
| i < srcLength && |
| (uint8_t)(src[i] - 0x80) <= 0x3f |
| ) { |
| reqLength++; |
| ++i; |
| continue; |
| } |
| } |
| |
| if(subchar < 0) { |
| *pErrorCode = U_INVALID_CHAR_FOUND; |
| return NULL; |
| } else { |
| /* function call for error cases */ |
| utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); |
| ++numSubstitutions; |
| reqLength+=U16_LENGTH(ch); |
| } |
| } |
| } |
| |
| if(pNumSubstitutions!=NULL) { |
| *pNumSubstitutions=numSubstitutions; |
| } |
| |
| reqLength+=(int32_t)(pDest - dest); |
| if(pDestLength) { |
| *pDestLength = reqLength; |
| } |
| |
| /* Terminate the buffer */ |
| u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
| return dest; |
| } |
| |
| /**
|  * Converts a UTF-16 string to Java "Modified UTF-8".
|  *
|  * Encoding rules, as implemented below:
|  *   - U+0001..U+007F              -> 1 byte
|  *   - U+0000 and U+0080..U+07FF   -> 2 bytes (so U+0000 is written as C0 80)
|  *   - every other 16-bit unit     -> 3 bytes; surrogates are NOT combined
|  *                                    into pairs, each unit is encoded alone
|  *
|  * The function works in up to four phases: an ASCII prefix copy for
|  * NUL-terminated input, a fast block loop with amortized bounds checks,
|  * a careful per-unit loop with exact bounds checks, and finally a
|  * pre-flight loop that only counts output bytes that no longer fit.
|  *
|  * @param dest          Output buffer. May be NULL only if destCapacity is 0.
|  * @param destCapacity  Capacity of dest in bytes. Must be >= 0.
|  * @param pDestLength   If not NULL, receives the number of bytes needed for
|  *                      the complete output, even when that exceeds
|  *                      destCapacity.
|  * @param src           Input UTF-16 text. May be NULL only if srcLength is 0.
|  * @param srcLength     Number of UChars in src, or -1 if src is
|  *                      NUL-terminated.
|  * @param pErrorCode    In/out error code. If it already indicates a failure
|  *                      on entry, nothing is done. Set to
|  *                      U_ILLEGAL_ARGUMENT_ERROR for invalid arguments. It is
|  *                      also handed to u_terminateChars() for the final
|  *                      buffer-termination step.
|  * @return dest, or NULL if the function bailed out during the argument checks.
|  */
| U_CAPI char* U_EXPORT2
| u_strToJavaModifiedUTF8(
|         char *dest,
|         int32_t destCapacity,
|         int32_t *pDestLength,
|         const UChar *src,
|         int32_t srcLength,
|         UErrorCode *pErrorCode) {
|     int32_t reqLength=0;
|     uint32_t ch=0;
|     uint8_t *pDest;
|     uint8_t *pDestLimit;
|     const UChar *pSrcLimit;
|     int32_t count;
|
|     /* args check */
|     if(U_FAILURE(*pErrorCode)){
|         return NULL;
|     }
|     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
|         (dest==NULL && destCapacity!=0) || destCapacity<0
|     ) {
|         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
|         return NULL;
|     }
|
|     /*
|      * Derive the output pointers only after the arguments are known to be
|      * valid: offsetting a NULL pointer by a non-zero amount, or offsetting
|      * dest by a negative capacity, is undefined behavior even if the result
|      * is never dereferenced. The NULL guard mirrors the one on pSrcLimit.
|      */
|     pDest = (uint8_t *)dest;
|     pDestLimit = (dest!=NULL) ? (pDest + destCapacity) : NULL;
|
|     if(srcLength==-1) {
|         /* Convert NUL-terminated ASCII, then find the string length. */
|         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
|             *pDest++ = (uint8_t)ch;
|             ++src;
|         }
|         if(ch == 0) {
|             /* The whole input was ASCII and fit into dest: done. */
|             reqLength=(int32_t)(pDest - (uint8_t *)dest);
|             if(pDestLength) {
|                 *pDestLength = reqLength;
|             }
|
|             /* Terminate the buffer */
|             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
|             return dest;
|         }
|         /* Stopped on a non-ASCII unit or a full buffer: measure the rest. */
|         srcLength = u_strlen(src);
|     }
|
|     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
|     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
|     for(;;) {
|         count = (int32_t)(pDestLimit - pDest);
|         srcLength = (int32_t)(pSrcLimit - src);
|         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
|             /*
|              * fast ASCII loop: dest has room for at least one byte per
|              * remaining UChar, so only the source limit is checked.
|              * U+0000 is excluded because it needs the two-byte form.
|              */
|             const UChar *prevSrc = src;
|             int32_t delta;
|             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
|                 *pDest++=(uint8_t)ch;
|                 ++src;
|             }
|             delta = (int32_t)(src - prevSrc);
|             count -= delta;
|             srcLength -= delta;
|         }
|         /*
|          * Each iteration of the inner loop progresses by at most 3 UTF-8
|          * bytes and one UChar.
|          */
|         count /= 3;
|         if(count > srcLength) {
|             count = srcLength; /* min(remaining dest/3, remaining src) */
|         }
|         if(count < 3) {
|             /*
|              * Too much overhead if we get near the end of the string,
|              * continue with the next loop.
|              */
|             break;
|         }
|         /* count units can be converted without any per-unit bounds check. */
|         do {
|             ch=*src++;
|             if(ch <= 0x7f && ch != 0) {
|                 *pDest++ = (uint8_t)ch;
|             } else if(ch <= 0x7ff) {
|                 /* two-byte form; also used for U+0000 (C0 80) */
|                 *pDest++=(uint8_t)((ch>>6)|0xc0);
|                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
|             } else {
|                 /* three-byte form, including unpaired/paired surrogates */
|                 *pDest++=(uint8_t)((ch>>12)|0xe0);
|                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
|                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
|             }
|         } while(--count > 0);
|     }
|
|     /*
|      * Careful loop near the end of either buffer: check the remaining
|      * capacity for every unit. When a unit does not fit, seed reqLength
|      * with its encoded size and let the pre-flight loop count the rest.
|      */
|     while(src<pSrcLimit) {
|         ch=*src++;
|         if(ch <= 0x7f && ch != 0) {
|             if(pDest<pDestLimit) {
|                 *pDest++ = (uint8_t)ch;
|             } else {
|                 reqLength = 1;
|                 break;
|             }
|         } else if(ch <= 0x7ff) {
|             if((pDestLimit - pDest) >= 2) {
|                 *pDest++=(uint8_t)((ch>>6)|0xc0);
|                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
|             } else {
|                 reqLength = 2;
|                 break;
|             }
|         } else {
|             if((pDestLimit - pDest) >= 3) {
|                 *pDest++=(uint8_t)((ch>>12)|0xe0);
|                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
|                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
|             } else {
|                 reqLength = 3;
|                 break;
|             }
|         }
|     }
|     /* Pre-flight the rest of the string: count bytes without writing. */
|     while(src<pSrcLimit) {
|         ch=*src++;
|         if(ch <= 0x7f && ch != 0) {
|             ++reqLength;
|         } else if(ch<=0x7ff) {
|             reqLength+=2;
|         } else {
|             reqLength+=3;
|         }
|     }
|
|     /* Total = bytes actually written + bytes that did not fit. */
|     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
|     if(pDestLength){
|         *pDestLength = reqLength;
|     }
|
|     /* Terminate the buffer */
|     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
|     return dest;
| }