blob: 9dd32dc6c0f22e9f227d565b5aa29b3a554c4296 [file] [log] [blame]
/*
*******************************************************************************
*
* Copyright (C) 1998-2001, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File ucbuf.c
*
* Modification History:
*
* Date Name Description
* 05/10/01 Ram Creation.
*******************************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "filestrm.h"
#include "cmemory.h"
#include "unicode/ustring.h"
#include "ucbuf.h"
#include <stdio.h>
/* Capacity, in bytes, of the raw input buffer used for a single refill. */
#define MAX_IN_BUF 1000
/* Number of UChars allocated for the decoded character buffer. */
#define MAX_U_BUF 1500
/* Upper bound on the number of context bytes captured around an invalid
 * byte sequence when a conversion warning is reported.
 */
#define CONTEXT_LEN 15
/* State of a buffered UChar reader layered on top of a FileStream. */
struct UCHARBUF {
UChar* buffer; /* start of the UChar storage; ucbuf_open allocates MAX_U_BUF elements */
UChar* currentPos; /* next UChar that will be handed out */
UChar* bufLimit; /* one past the last valid UChar currently held in buffer */
int32_t remaining; /* bytes of the input file that have not been read yet */
FileStream* in; /* stream the raw bytes are read from */
UConverter* conv; /* converter for the input bytes; NULL means u_charsToUChars() is used */
UBool showWarning; /* when TRUE, conversion/converter warnings are printed to stderr */
};
/*
 * Detects a Unicode signature at the current position of the stream without
 * rewinding ("nrw" = no rewind).
 *
 * in      - stream to inspect
 * cp      - receives the detected encoding name, or "" when nothing matched
 * numRead - receives the number of bytes left consumed from the stream,
 *           i.e. the signature length (0 when nothing matched)
 *
 * Returns TRUE when a signature was recognised, FALSE otherwise (including
 * when the stream size is not positive).
 */
static UBool ucbuf_autodetect_nrw(FileStream* in, const char** cp,int* numRead){
    /* pre-filled with 0xa5 so that a short read cannot look like a signature */
    char sig[4]={ '\xa5', '\xa5', '\xa5', '\xa5' };
    int fileSize =T_FileStream_size(in);
    int bomLength=0;
    UBool found=TRUE;

    *numRead=0;
    *cp="";
    if(fileSize<=0) {
        return FALSE;
    }

    /* fewer than 4 bytes may come back for very small files */
    *numRead=T_FileStream_read(in, sig, 4);

    if(sig[0] == '\xFE' && sig[1] == '\xFF') {
        *cp = "UTF-16BE";
        bomLength=2;
    } else if(sig[0] == '\xFF' && sig[1] == '\xFE') {
        /* FF FE 00 00 is UTF-32LE; it must be tested before plain UTF-16LE */
        UBool isUtf32 = (UBool)(sig[2] == '\x00' && sig[3] =='\x00');
        if(isUtf32){
            *cp="UTF-32LE";
            bomLength=4;
        } else {
            *cp = "UTF-16LE";
            bomLength=2;
        }
    } else if(sig[0] == '\xEF' && sig[1] == '\xBB' && sig[2] == '\xBF') {
        *cp = "UTF-8";
        bomLength=3;
    } else if(sig[0] == '\x0E' && sig[1] == '\xFE' && sig[2] == '\xFF') {
        *cp ="SCSU";
        bomLength=3;
    } else if(sig[0] == '\x00' && sig[1] == '\x00' &&
              sig[2] == '\xFE' && sig[3]=='\xFF') {
        *cp = "UTF-32BE";
        bomLength=4;
    } else {
        found=FALSE;
    }

    /* push back, last byte first, everything that is not part of the signature */
    while(bomLength<*numRead) {
        T_FileStream_ungetc(sig[--*numRead], in);
    }
    return found;
}
/*
 * Detects a UTF-8, UTF-16 (BE/LE), UTF-32 (BE/LE) or SCSU signature at the
 * current position of the stream, then rewinds the stream.
 *
 * in - stream to inspect
 * cp - receives the detected encoding name, or "" when nothing matched
 *
 * Returns TRUE when a signature was recognised.
 */
U_CAPI UBool U_EXPORT2
ucbuf_autodetect(FileStream* in,const char** cp){
    const char* detected;
    int consumed =0;
    UBool found = ucbuf_autodetect_nrw(in,&detected, &consumed);

    *cp =detected;
    /* leave the stream positioned at its beginning */
    T_FileStream_rewind(in);
    return found;
}
/*
 * Refills the UChar buffer of buf from its file stream.
 *
 * Any UChars that have not been consumed yet are moved to the front of the
 * buffer, then up to MAX_IN_BUF minus that count bytes are read and either
 * converted with buf->conv or, when there is no converter, copied with
 * u_charsToUChars(). On return currentPos points at the start of the buffer
 * and bufLimit one past the last valid UChar.
 *
 * When the converter reports an error the invalid bytes and some surrounding
 * context are printed (if buf->showWarning is TRUE), the converter is reset
 * and the same input is converted again with the substitution callback.
 *
 * buf - buffer to refill
 * err - in/out ICU error code
 *
 * Returns buf.
 */
static UCHARBUF*
ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* err){
    UChar* pTarget=NULL;
    UChar* target=NULL;
    const char* source=NULL;
    char cbuf[MAX_IN_BUF] = {'\0'};
    int numRead=0;
    int offset=0;
    const char* sourceLimit =NULL;

    pTarget = buf->buffer;
    /* check if we arrived here without exhausting the buffer */
    if(buf->currentPos<buf->bufLimit){
        offset= buf->bufLimit-buf->currentPos;
        memmove(buf->buffer,buf->currentPos,offset* sizeof(UChar));
    }
#if DEBUG
    memset(pTarget+offset,0xff,sizeof(UChar)*(MAX_IN_BUF-offset));
#endif
    /* read the file */
    numRead=T_FileStream_read(buf->in,cbuf,MAX_IN_BUF-offset);
    buf->remaining-=numRead;
    target=pTarget;
    /* convert the bytes */
    if(buf->conv){
        /* set the callback to stop */
        UConverterToUCallback toUOldAction ;
        void* toUOldContext;
        void* toUNewContext=NULL;
        ucnv_setToUCallBack(buf->conv,
                            UCNV_TO_U_CALLBACK_STOP,
                            toUNewContext,
                            &toUOldAction,
                            (const void**)&toUOldContext,
                            err);
        /* since state is saved in the converter we add offset to source */
        target = pTarget+offset;
        source = cbuf;
        sourceLimit = source + numRead;
        ucnv_toUnicode(buf->conv,&target,target+(MAX_U_BUF-offset),
                       &source,source+numRead,NULL,
                       (UBool)(buf->remaining==0),err);
        if(U_FAILURE(*err)){
            /* one extra byte in each buffer for the terminating NUL */
            char context[CONTEXT_LEN+1] = {'\0'};
            char preContext[CONTEXT_LEN+1];
            char postContext[CONTEXT_LEN+1];
            int8_t len = CONTEXT_LEN;
            int32_t start=0;
            int32_t stop =0;
            int32_t pos =0;
            int32_t copyLen=0;
            if( buf->showWarning==TRUE){
                fprintf(stderr,"\n###WARNING: Encountered abnormal bytes while"
                        " converting input stream to target encoding: %s\n",
                        u_errorName(*err));
            }
            *err = U_ZERO_ERROR;
            /* now get the context chars */
            ucnv_getInvalidChars(buf->conv,context,&len,err);
            /* never index outside of context, whatever length came back */
            if(len<0 || len>CONTEXT_LEN){
                len=0;
            }
            context[len]= 0 ; /* null terminate the buffer */
            pos = (int32_t)(source-cbuf-len);
            /* for pre-context */
            start = (pos <=CONTEXT_LEN)? 0 : (pos - (CONTEXT_LEN-1));
            stop = pos-len;
            /* stop can lie before start, e.g. when the invalid sequence began
             * in the previous read; copy nothing in that case instead of
             * passing a negative length to memcpy
             */
            copyLen = stop-start;
            if(copyLen<0){
                copyLen=0;
            }
            memcpy(preContext,cbuf+start,(size_t)copyLen);
            /* null terminate the buffer */
            preContext[copyLen] = 0;
            /* for post-context */
            start = pos+len;
            stop = ((pos+CONTEXT_LEN)<= (sourceLimit-cbuf) )? (pos+(CONTEXT_LEN-1)) : (sourceLimit-cbuf);
            copyLen = stop-start;
            if(copyLen<0){
                copyLen=0;
            }
            memcpy(postContext,source,(size_t)copyLen);
            /* null terminate the buffer */
            postContext[copyLen] = 0;
            if(buf->showWarning ==TRUE){
                /* print out the context */
                fprintf(stderr,"\tPre-context: %s\n",preContext);
                fprintf(stderr,"\tContext: %s\n",context);
                fprintf(stderr,"\tPost-context: %s\n", postContext);
            }
            /* reset the converter */
            ucnv_reset(buf->conv);
            /* set the callback to substitute
             * and restart conversion
             */
            ucnv_setToUCallBack(buf->conv,
                                UCNV_TO_U_CALLBACK_SUBSTITUTE,
                                toUNewContext,
                                &toUOldAction,
                                (const void**)&toUOldContext,
                                err);
            /* reset source and target start positions */
            target = pTarget+offset;
            source = cbuf;
            /* re convert */
            ucnv_toUnicode(buf->conv,&target,target+(MAX_U_BUF-offset),
                           &source,sourceLimit,NULL,
                           (UBool)(buf->remaining==0),err);
        }
        /* from here on numRead counts valid UChars, carried-over ones included */
        numRead= target-pTarget;
#if DEBUG
        {
            int i;
            target = pTarget;
            for(i=0;i<numRead;i++){
                /* printf("%c", (char)(*target++));*/
            }
        }
#endif
    }else{
        u_charsToUChars(cbuf,target+offset,numRead);
        numRead=((buf->remaining>MAX_IN_BUF)? MAX_IN_BUF:numRead+offset);
    }
    buf->currentPos = pTarget;
    buf->bufLimit=pTarget+numRead;
    return buf;
}
/*
 * Returns the next UChar of the stream and advances past it.
 *
 * The buffer is refilled when it has been used up. U_EOF is returned when
 * the buffer is used up and no bytes remain in the file, or when the refill
 * leaves a failure code in *err.
 */
U_CAPI UChar32 U_EXPORT2
ucbuf_getc(UCHARBUF* buf,UErrorCode* err){
    /* fast path: something is still buffered */
    if(buf->currentPos<buf->bufLimit){
        return *(buf->currentPos++);
    }
    if(buf->remaining==0){
        return U_EOF;
    }
    buf=ucbuf_fillucbuf(buf,err);
    if(U_FAILURE(*err)){
        return U_EOF;
    }
    return *(buf->currentPos++);
}
/*
 * Callback handed to u_unescapeAt(): context is a UCHARBUF*, and the UChar
 * located offset positions after its current read position is returned.
 */
static UChar
_charAt(int32_t offset, void *context) {
    UCHARBUF* ucbuf = (UCHARBUF*) context;
    return ucbuf->currentPos[offset];
}
/*
 * Returns the next code point of the stream, interpreting backslash escapes.
 *
 * A character other than a backslash (0x005C) is returned unchanged. After a
 * backslash the following characters are passed to u_unescapeAt(). If the
 * result differs from the character that follows the backslash, the escape
 * is consumed and the result returned; otherwise only the backslash is
 * consumed and returned, so that sequences such as \' \\ \" used in resource
 * bundle rules reach the caller untouched.
 */
U_CAPI UChar32 U_EXPORT2
ucbuf_getcx(UCHARBUF* buf,UErrorCode* err) {
    UChar32 first;
    UChar32 lookAhead;
    UChar32 unescaped;
    int32_t available;
    int32_t consumed = 0;

    /* refill when fewer than three UChars are left */
    if (buf->currentPos >= buf->bufLimit-2) {
        ucbuf_fillucbuf(buf,err);
    }

    /* take the next character, if there is one */
    if (buf->currentPos < buf->bufLimit) {
        first = *(buf->currentPos)++;
    } else {
        first = U_EOF;
    }
    /* remember what follows; needed to tell whether unescaping did anything */
    lookAhead = *(buf->currentPos);

    /* anything but a backslash is returned as is */
    if (first != 0x005C) {
        return first;
    }

    /* the longest escape sequence is \Uhhhhhhhh; make sure that many
     * characters are buffered
     */
    available = buf->bufLimit-buf->currentPos;
    if (available < 10) {
        ucbuf_fillucbuf(buf,err);
        available = buf->bufLimit-buf->buffer;
    }

    /* process the escape */
    unescaped = u_unescapeAt(_charAt, &consumed, available, (void*)buf);

    if (unescaped == lookAhead) {
        /* nothing was unescaped: hand back the backslash and leave the
         * following character in the buffer
         */
        return first;
    }

    /* skip the characters that made up the escape */
    buf->currentPos += consumed;
    return unescaped;
}
/*
 * Opens a UCHARBUF on top of a file stream and fills it for the first time.
 *
 * in          - stream to read from
 * cp          - name of the encoding of the stream; when NULL or empty the
 *               encoding is auto-detected from a signature at the start of
 *               the stream, and without a signature no converter is used
 * showWarning - when TRUE, warnings are printed to stderr
 * err         - in/out ICU error code; nothing is done when it already
 *               holds a failure
 *
 * Returns the new buffer, or NULL on failure. On every failure path all
 * resources acquired so far are released. The caller releases a returned
 * buffer with ucbuf_close().
 */
U_CAPI UCHARBUF* U_EXPORT2
ucbuf_open(FileStream* in,const char* cp, UBool showWarning, UErrorCode* err){
    UCHARBUF* buf = NULL;
    int numRead =0;

    /* check the incoming status before allocating so nothing can leak */
    if(U_FAILURE(*err)){
        return NULL;
    }
    buf =(UCHARBUF*) uprv_malloc(sizeof(UCHARBUF));
    if(buf == NULL){
        *err = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    buf->in=in;
    buf->conv=NULL;
    buf->buffer=NULL;
    buf->showWarning = showWarning;
    if(cp == NULL || *cp=='\0'){
        /* don't have code page name... try to autodetect */
        if(ucbuf_autodetect_nrw(in,&cp,&numRead)){
            buf->conv=ucnv_open(cp,err);
        }
    }else{
        buf->conv=ucnv_open(cp,err);
    }
    if((buf->conv==NULL) && (buf->showWarning==TRUE)){
        fprintf(stderr,"###WARNING: No converter defined. Using codepage of system.\n");
    }
    /* signature bytes consumed by the auto-detection are not read again */
    buf->remaining=T_FileStream_size(in)-numRead;
    buf->buffer=(UChar*) uprv_malloc(sizeof(UChar)* MAX_U_BUF);
    if (buf->buffer == NULL) {
        *err = U_MEMORY_ALLOCATION_ERROR;
        if(buf->conv){
            ucnv_close(buf->conv);
        }
        uprv_free(buf);
        return NULL;
    }
    buf->currentPos=buf->buffer;
    buf->bufLimit=buf->buffer;
    if(U_FAILURE(*err)){
        fprintf(stderr, "Could not open codepage [%s]: %s\n", cp, u_errorName(*err));
        if(buf->conv){
            ucnv_close(buf->conv);
        }
        uprv_free(buf->buffer);
        uprv_free(buf);
        return NULL;
    }
    buf=ucbuf_fillucbuf(buf,err);
    return buf;
}
/*
 * Steps the read position back by one UChar. The value of c is not used.
 *
 * TODO: this cannot undo a read when the read position is at the beginning
 * of the buffer and the UChar to unget belongs to the previous buffer
 * content; in that case nothing happens. A mechanism to handle that
 * situation still needs to be implemented.
 */
U_CAPI void U_EXPORT2
ucbuf_ungetc(UChar32 c,UCHARBUF* buf){
    (void)c;
    /* nothing to step back to at the beginning of the buffer */
    if(buf->currentPos==buf->buffer){
        return;
    }
    buf->currentPos--;
}
/*
 * Releases the UChar storage owned by buf and clears the pointer to it.
 * The UCHARBUF structure itself is not freed.
 */
static void
ucbuf_closebuf(UCHARBUF* buf){
    UChar* storage = buf->buffer;

    buf->buffer = NULL;
    uprv_free(storage);
}
/*
 * Closes buf and releases everything it owns: the converter (if any), the
 * UChar storage and the structure itself. The file stream is not closed.
 *
 * Passing NULL is allowed and does nothing.
 */
U_CAPI void U_EXPORT2
ucbuf_close(UCHARBUF* buf){
    if(buf==NULL){
        return;
    }
    if(buf->conv){
        ucnv_close(buf->conv);
    }
    buf->in=NULL;
    buf->currentPos=NULL;
    buf->bufLimit=NULL;
    ucbuf_closebuf(buf);
    uprv_free(buf);
}
/*
 * Rewinds buf and its file stream so that reading starts over from the
 * beginning of the file. Does nothing when buf is NULL.
 *
 * NOTE(review): ucbuf_autodetect() rewinds the stream again and remaining is
 * set to the full file size, so a signature that ucbuf_open() skipped does
 * not appear to be skipped here - confirm whether that is intended.
 */
U_CAPI void U_EXPORT2
ucbuf_rewind(UCHARBUF* buf){
    const char* detected="";

    if(buf==NULL){
        return;
    }
    /* mark the buffer as empty so that the next read refills it */
    buf->currentPos=buf->buffer;
    buf->bufLimit=buf->buffer;
    ucnv_reset(buf->conv);
    T_FileStream_rewind(buf->in);
    ucbuf_autodetect(buf->in,&detected);
    buf->remaining=T_FileStream_size(buf->in);
}