blob: fd1e54106addf245517543525fbcd96a59320f7a [file] [log] [blame]
#include "iconv_string.h"
#include <iconv.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#define tmpbufsize 4096
int iconv_string (const char* tocode, const char* fromcode,
const char* start, const char* end,
char** resultp, size_t* lengthp)
{
iconv_t cd = iconv_open(tocode,fromcode);
size_t length;
char* result;
if (cd == (iconv_t)(-1)) {
if (errno != EINVAL)
return -1;
/* Unsupported fromcode or tocode. Check whether the caller requested
autodetection. */
if (!strcmp(fromcode,"autodetect_utf8")) {
int ret;
/* Try UTF-8 first. There are very few ISO-8859-1 inputs that would
be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1. */
ret = iconv_string(tocode,"UTF-8",start,end,resultp,lengthp);
if (!(ret < 0 && errno == EILSEQ))
return -1;
ret = iconv_string(tocode,"ISO-8859-1",start,end,resultp,lengthp);
return ret;
}
if (!strcmp(fromcode,"autodetect_jp")) {
int ret;
/* Try 7-bit encoding first. If the input contains bytes >= 0x80,
it will fail. */
ret = iconv_string(tocode,"ISO-2022-JP-2",start,end,resultp,lengthp);
if (!(ret < 0 && errno == EILSEQ))
return -1;
/* Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This
is unavoidable. People will condemn SHIFT_JIS.
If we tried SHIFT_JIS first, then some short EUC-JP inputs would
come out wrong, and people would condemn EUC-JP and Unix, which
would not be good. */
ret = iconv_string(tocode,"EUC-KR",start,end,resultp,lengthp);
if (!(ret < 0 && errno == EILSEQ))
return -1;
/* Finally try SHIFT_JIS. */
ret = iconv_string(tocode,"SHIFT_JIS",start,end,resultp,lengthp);
return ret;
}
if (!strcmp(fromcode,"autodetect_kr")) {
int ret;
/* Try 7-bit encoding first. If the input contains bytes >= 0x80,
it will fail. */
ret = iconv_string(tocode,"ISO-2022-KR",start,end,resultp,lengthp);
if (!(ret < 0 && errno == EILSEQ))
return -1;
/* Try EUC-KR next, because it is more frequently used. It may succeed
on some short JOHAB inputs. This is unavoidable. */
ret = iconv_string(tocode,"EUC-KR",start,end,resultp,lengthp);
if (!(ret < 0 && errno == EILSEQ))
return -1;
/* Finally try JOHAB. */
ret = iconv_string(tocode,"JOHAB",start,end,resultp,lengthp);
return ret;
}
errno = EINVAL;
return -1;
}
/* Determine the length we need. */
{
size_t count = 0;
char tmpbuf[tmpbufsize];
const char* inptr = start;
size_t insize = end-start;
while (insize > 0) {
char* outptr = tmpbuf;
size_t outsize = tmpbufsize;
size_t res = iconv(cd,&inptr,&insize,&outptr,&outsize);
if (res == (size_t)(-1)) {
if (errno == EINVAL)
break;
else {
int saved_errno = errno;
iconv_close(cd);
errno = saved_errno;
return -1;
}
}
count += outptr-tmpbuf;
}
{
char* outptr = tmpbuf;
size_t outsize = tmpbufsize;
size_t res = iconv(cd,NULL,NULL,&outptr,&outsize);
if (res == (size_t)(-1)) {
int saved_errno = errno;
iconv_close(cd);
errno = saved_errno;
return -1;
}
count += outptr-tmpbuf;
}
length = count;
}
if (lengthp != NULL)
*lengthp = length;
if (resultp == NULL) {
iconv_close(cd);
return 0;
}
result = (*resultp == NULL ? malloc(length) : realloc(*resultp,length));
*resultp = result;
if (length == 0) {
iconv_close(cd);
return 0;
}
if (result == NULL) {
iconv_close(cd);
errno = ENOMEM;
return -1;
}
iconv(cd,NULL,NULL,NULL,NULL); /* return to the initial state */
/* Do the conversion for real. */
{
const char* inptr = start;
size_t insize = end-start;
char* outptr = result;
size_t outsize = length;
while (insize > 0) {
size_t res = iconv(cd,&inptr,&insize,&outptr,&outsize);
if (res == (size_t)(-1)) {
if (errno == EINVAL)
break;
else {
int saved_errno = errno;
iconv_close(cd);
errno = saved_errno;
return -1;
}
}
}
{
size_t res = iconv(cd,NULL,NULL,&outptr,&outsize);
if (res == (size_t)(-1)) {
int saved_errno = errno;
iconv_close(cd);
errno = saved_errno;
return -1;
}
}
if (outsize != 0) abort();
}
iconv_close(cd);
return 0;
}