| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 1998-2001, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * |
| * File ucbuf.c |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * 05/10/01 Ram Creation. |
| ******************************************************************************* |
| */ |
| |
| #include "unicode/utypes.h" |
| #include "unicode/ucnv.h" |
| #include "unicode/ucnv_err.h" |
| #include "filestrm.h" |
| #include "cmemory.h" |
| #include "unicode/ustring.h" |
| #include "ucbuf.h" |
| #include <stdio.h> |
| |
| #define MAX_IN_BUF 1000 |
| #define MAX_U_BUF 1500 |
| #define CONTEXT_LEN 15 |
| |
| struct UCHARBUF { |
| UChar* buffer; |
| UChar* currentPos; |
| UChar* bufLimit; |
| int32_t remaining; |
| FileStream* in; |
| UConverter* conv; |
| UBool showWarning; /* makes this API not produce any errors */ |
| }; |
| |
| static UBool ucbuf_autodetect_nrw(FileStream* in, const char** cp,int* numRead){ |
| /* initial 0xa5 bytes: make sure that if we read <4 bytes we don't misdetect something */ |
| char start[4]={ '\xa5', '\xa5', '\xa5', '\xa5' }; |
| int cap =T_FileStream_size(in); |
| UBool autodetect; |
| int signatureLength; |
| |
| *numRead=0; |
| *cp=""; |
| |
| if(cap<=0) { |
| return FALSE; |
| } |
| |
| autodetect = TRUE; |
| *numRead=T_FileStream_read(in, start, 4); /* *numRead might be <4 */ |
| if(start[0] == '\xFE' && start[1] == '\xFF') { |
| *cp = "UTF-16BE"; |
| signatureLength=2; |
| } else if(start[0] == '\xFF' && start[1] == '\xFE') { |
| if(start[2] == '\x00' && start[3] =='\x00'){ |
| *cp="UTF-32LE"; |
| signatureLength=4; |
| } else { |
| *cp = "UTF-16LE"; |
| signatureLength=2; |
| } |
| } else if(start[0] == '\xEF' && start[1] == '\xBB' && start[2] == '\xBF') { |
| *cp = "UTF-8"; |
| signatureLength=3; |
| }else if(start[0] == '\x0E' && start[1] == '\xFE' && start[2] == '\xFF'){ |
| *cp ="SCSU"; |
| signatureLength=3; |
| }else if(start[0] == '\x00' && start[1] == '\x00' && |
| start[2] == '\xFE' && start[3]=='\xFF'){ |
| *cp = "UTF-32BE"; |
| signatureLength=4; |
| }else{ |
| signatureLength=0; |
| autodetect=FALSE; |
| } |
| while(signatureLength<*numRead) { |
| T_FileStream_ungetc(start[--*numRead], in); |
| } |
| return autodetect; |
| } |
| |
| /* Autodetects UTF8, UTF-16-BigEndian and UTF-16-LittleEndian BOMs*/ |
| U_CAPI UBool U_EXPORT2 |
| ucbuf_autodetect(FileStream* in,const char** cp){ |
| UBool autodetect = FALSE; |
| int numRead =0; |
| const char* tcp; |
| autodetect=ucbuf_autodetect_nrw(in,&tcp, &numRead); |
| *cp =tcp; |
| /* rewind the file Stream */ |
| T_FileStream_rewind(in); |
| return autodetect; |
| } |
| |
| /* fill the uchar buffer */ |
| static UCHARBUF* |
| ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* err){ |
| UChar* pTarget=NULL; |
| UChar* target=NULL; |
| const char* source=NULL; |
| char cbuf[MAX_IN_BUF] = {'\0'}; |
| int numRead=0; |
| int offset=0; |
| const char* sourceLimit =NULL; |
| pTarget = buf->buffer; |
| /* check if we arrived here without exhausting the buffer*/ |
| if(buf->currentPos<buf->bufLimit){ |
| offset= buf->bufLimit-buf->currentPos; |
| memmove(buf->buffer,buf->currentPos,offset* sizeof(UChar)); |
| } |
| |
| #if DEBUG |
| memset(pTarget+offset,0xff,sizeof(UChar)*(MAX_IN_BUF-offset)); |
| #endif |
| |
| /* read the file */ |
| numRead=T_FileStream_read(buf->in,cbuf,MAX_IN_BUF-offset); |
| buf->remaining-=numRead; |
| |
| target=pTarget; |
| /* convert the bytes */ |
| if(buf->conv){ |
| /* set the callback to stop */ |
| UConverterToUCallback toUOldAction ; |
| void* toUOldContext; |
| void* toUNewContext=NULL; |
| ucnv_setToUCallBack(buf->conv, |
| UCNV_TO_U_CALLBACK_STOP, |
| toUNewContext, |
| &toUOldAction, |
| (const void**)&toUOldContext, |
| err); |
| /* since state is saved in the converter we add offset to source*/ |
| target = pTarget+offset; |
| source = cbuf; |
| sourceLimit = source + numRead; |
| ucnv_toUnicode(buf->conv,&target,target+(MAX_U_BUF-offset), |
| &source,source+numRead,NULL, |
| (UBool)(buf->remaining==0),err); |
| |
| if(U_FAILURE(*err)){ |
| char context[CONTEXT_LEN]; |
| char preContext[CONTEXT_LEN]; |
| char postContext[CONTEXT_LEN]; |
| int8_t len = CONTEXT_LEN; |
| int32_t start=0; |
| int32_t stop =0; |
| int32_t pos =0; |
| |
| if( buf->showWarning==TRUE){ |
| fprintf(stderr,"\n###WARNING: Encountered abnormal bytes while" |
| " converting input stream to target encoding: %s\n", |
| u_errorName(*err)); |
| } |
| |
| *err = U_ZERO_ERROR; |
| |
| /* now get the context chars */ |
| ucnv_getInvalidChars(buf->conv,context,&len,err); |
| context[len]= 0 ; /* null terminate the buffer */ |
| |
| pos = source-cbuf-len; |
| |
| /* for pre-context */ |
| start = (pos <=CONTEXT_LEN)? 0 : (pos - (CONTEXT_LEN-1)); |
| stop = pos-len; |
| |
| memcpy(preContext,cbuf+start,stop-start); |
| /* null terminate the buffer */ |
| preContext[stop-start] = 0; |
| |
| /* for post-context */ |
| start = pos+len; |
| stop = ((pos+CONTEXT_LEN)<= (sourceLimit-cbuf) )? (pos+(CONTEXT_LEN-1)) : (sourceLimit-cbuf); |
| |
| memcpy(postContext,source,stop-start); |
| /* null terminate the buffer */ |
| postContext[stop-start] = 0; |
| |
| if(buf->showWarning ==TRUE){ |
| /* print out the context */ |
| fprintf(stderr,"\tPre-context: %s\n",preContext); |
| fprintf(stderr,"\tContext: %s\n",context); |
| fprintf(stderr,"\tPost-context: %s\n", postContext); |
| } |
| |
| /* reset the converter */ |
| ucnv_reset(buf->conv); |
| |
| /* set the call back to substiture |
| * and restart conversion |
| */ |
| ucnv_setToUCallBack(buf->conv, |
| UCNV_TO_U_CALLBACK_SUBSTITUTE, |
| toUNewContext, |
| &toUOldAction, |
| (const void**)&toUOldContext, |
| err); |
| |
| /* reset source and target start positions */ |
| target = pTarget+offset; |
| source = cbuf; |
| |
| /* re convert */ |
| ucnv_toUnicode(buf->conv,&target,target+(MAX_U_BUF-offset), |
| &source,sourceLimit,NULL, |
| (UBool)(buf->remaining==0),err); |
| |
| } |
| numRead= target-pTarget; |
| |
| |
| #if DEBUG |
| { |
| int i; |
| target = pTarget; |
| for(i=0;i<numRead;i++){ |
| /* printf("%c", (char)(*target++));*/ |
| } |
| } |
| #endif |
| |
| }else{ |
| u_charsToUChars(cbuf,target+offset,numRead); |
| numRead=((buf->remaining>MAX_IN_BUF)? MAX_IN_BUF:numRead+offset); |
| } |
| buf->currentPos = pTarget; |
| buf->bufLimit=pTarget+numRead; |
| return buf; |
| } |
| |
| /* get a UChar from the stream*/ |
| U_CAPI UChar32 U_EXPORT2 |
| ucbuf_getc(UCHARBUF* buf,UErrorCode* err){ |
| if(buf->currentPos>=buf->bufLimit){ |
| if(buf->remaining==0){ |
| return U_EOF; |
| } |
| buf=ucbuf_fillucbuf(buf,err); |
| if(U_FAILURE(*err)){ |
| return U_EOF; |
| } |
| } |
| |
| return *(buf->currentPos++); |
| } |
| |
| |
| /* u_unescapeAt() callback to return a UChar*/ |
| static UChar |
| _charAt(int32_t offset, void *context) { |
| return ((UCHARBUF*) context)->currentPos[offset]; |
| } |
| |
| /* getc and escape it */ |
| U_CAPI UChar32 U_EXPORT2 |
| ucbuf_getcx(UCHARBUF* buf,UErrorCode* err) { |
| int32_t length; |
| int32_t offset; |
| UChar32 c32,c1,c2; |
| |
| /* Fill the buffer if it is empty */ |
| if (buf->currentPos >=buf->bufLimit-2) { |
| ucbuf_fillucbuf(buf,err); |
| } |
| |
| /* Get the next character in the buffer */ |
| if (buf->currentPos < buf->bufLimit) { |
| c1 = *(buf->currentPos)++; |
| } else { |
| c1 = U_EOF; |
| } |
| |
| c2 = *(buf->currentPos); |
| |
| /* If it isn't a backslash, return it */ |
| if (c1 != 0x005C) { |
| return c1; |
| } |
| |
| /* Determine the amount of data in the buffer */ |
| length = buf->bufLimit-buf->currentPos; |
| |
| /* The longest escape sequence is \Uhhhhhhhh; make sure |
| we have at least that many characters */ |
| if (length < 10) { |
| |
| /* fill the buffer */ |
| ucbuf_fillucbuf(buf,err); |
| length = buf->bufLimit-buf->buffer; |
| } |
| |
| /* Process the escape */ |
| offset = 0; |
| c32 = u_unescapeAt(_charAt, &offset, length, (void*)buf); |
| |
| /* check if u_unescapeAt unescaped and converted |
| * to c32 or not |
| */ |
| if(c32!=c2){ |
| /* Update the current buffer position */ |
| buf->currentPos += offset; |
| }else{ |
| /* unescaping failed so we just return |
| * c1 and not consume the buffer |
| * this is useful for rules with escapes |
| * in resouce bundles |
| * eg: \' \\ \" |
| */ |
| return c1; |
| } |
| |
| return c32; |
| } |
| |
| /* open a UCHARBUF */ |
| U_CAPI UCHARBUF* U_EXPORT2 |
| ucbuf_open(FileStream* in,const char* cp, UBool showWarning, UErrorCode* err){ |
| |
| UCHARBUF* buf =(UCHARBUF*) uprv_malloc(sizeof(UCHARBUF)); |
| int numRead =0; |
| if(U_FAILURE(*err)){ |
| return NULL; |
| } |
| if(buf){ |
| buf->in=in; |
| buf->conv=NULL; |
| buf->showWarning = showWarning; |
| if(!cp ||(cp && *cp=='\0')){ |
| /* don't have code page name... try to autodetect */ |
| if(ucbuf_autodetect_nrw(in,&cp,&numRead)){ |
| buf->conv=ucnv_open(cp,err); |
| } |
| }else{ |
| buf->conv=ucnv_open(cp,err); |
| } |
| |
| if((buf->conv==NULL) && (buf->showWarning==TRUE)){ |
| fprintf(stderr,"###WARNING: No converter defined. Using codepage of system.\n"); |
| } |
| buf->remaining=T_FileStream_size(in)-numRead; |
| buf->buffer=(UChar*) uprv_malloc(sizeof(UChar)* MAX_U_BUF); |
| if (buf->buffer == NULL) { |
| *err = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| buf->currentPos=buf->buffer; |
| buf->bufLimit=buf->buffer; |
| if(U_FAILURE(*err)){ |
| fprintf(stderr, "Could not open codepage [%s]: %s\n", cp, u_errorName(*err)); |
| return NULL; |
| } |
| buf=ucbuf_fillucbuf(buf,err); |
| return buf; |
| }else{ |
| *err = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| } |
| |
| /* TODO: this method will fail if at the |
| * begining of buffer and the uchar to unget |
| * is from the previous buffer. Need to implement |
| * system to take care of that situation. |
| */ |
| U_CAPI void U_EXPORT2 |
| ucbuf_ungetc(UChar32 c,UCHARBUF* buf){ |
| /* decrement currentPos pointer |
| * if not at the begining of buffer |
| */ |
| if(buf->currentPos!=buf->buffer){ |
| buf->currentPos--; |
| } |
| } |
| |
| /* frees the resources of UChar* buffer */ |
| static void |
| ucbuf_closebuf(UCHARBUF* buf){ |
| uprv_free(buf->buffer); |
| buf->buffer = NULL; |
| } |
| |
| /* close the buf and release resources*/ |
| U_CAPI void U_EXPORT2 |
| ucbuf_close(UCHARBUF* buf){ |
| if(buf->conv){ |
| ucnv_close(buf->conv); |
| } |
| buf->in=NULL; |
| buf->currentPos=NULL; |
| buf->bufLimit=NULL; |
| ucbuf_closebuf(buf); |
| uprv_free(buf); |
| } |
| |
| /* rewind the buf and file stream */ |
| U_CAPI void U_EXPORT2 |
| ucbuf_rewind(UCHARBUF* buf){ |
| if(buf){ |
| const char* cp=""; |
| buf->currentPos=buf->buffer; |
| buf->bufLimit=buf->buffer; |
| ucnv_reset(buf->conv); |
| T_FileStream_rewind(buf->in); |
| ucbuf_autodetect(buf->in,&cp); |
| buf->remaining=T_FileStream_size(buf->in); |
| } |
| } |