iconv clucene

/////////////////////////////////////////////////////////////////////////////////////
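/// The following is excerpted from the source code of the iconv command-line program on Linux; it is needed here to handle invalid characters encountered during conversion.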
////////////////////////////////////////////////////////////////////////////////////
/*
 * Placeholder declarations required by struct conv_struct.
 * Both structs are declared without members; in this file they are only used
 * as the trailing `fallbacks` / `hooks` members of struct conv_struct.
 * NOTE(review): presumably the real libiconv definitions carry members; an
 * empty stand-in only keeps the offsets of the earlier conv_struct fields
 * intact -- confirm against the libiconv version actually linked.
 */
struct iconv_hooks {};
struct iconv_fallbacks {};
/* Wide-character type passed to the mbtowc/wctomb callbacks. */
typedef unsigned int ucs4_t;
/* Handle type: pointer to a struct conv_struct. */
typedef struct conv_struct * conv_t;
/*
 * Two function pointers kept together; struct conv_struct stores one of
 * these as its first member.  Member order must not change.
 */
struct loop_funcs {
    /* Receives the descriptor plus the input and output cursor/remaining pairs. */
    size_t (*loop_convert)(iconv_t icd,
                           const char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft);

    /* Receives the descriptor plus the output cursor/remaining pair only. */
    size_t (*loop_reset)(iconv_t icd,
                         char **outbuf, size_t *outbytesleft);
};
/*
 * Callbacks for the multibyte -> wide-character direction.
 * Member order must not change.
 */
struct mbtowc_funcs {
    /*
     * xxx_mbtowc(conv, pwc, s, n)
     * Converts the byte sequence that starts at s into one wide character.
     * At most n bytes may be read from s; n is >= 1.
     * Returns the number of bytes consumed when a wide character was read,
     * -1 when the input is invalid, -2 when n is too small, or
     * -2-(number of bytes consumed) when only a shift sequence was read.
     */
    int (*xxx_mbtowc)(conv_t conv, ucs4_t *pwc, const unsigned char *s, int n);

    /*
     * xxx_flushwc(conv, pwc)
     * Returns to the initial state and stores the pending wide character,
     * if there is one.
     * Returns 1 when a wide character was stored, 0 when none was pending.
     */
    int (*xxx_flushwc)(conv_t conv, ucs4_t *pwc);
};
/*
 * Callbacks for the wide-character -> multibyte direction.
 * Member order must not change.
 */
struct wctomb_funcs {
    /*
     * xxx_wctomb(conv, r, wc, n)
     * Converts the wide character wc to character set xxx and stores the
     * result starting at r.  At most n bytes may be written; n is >= 1.
     * Returns the number of bytes written, -1 when wc cannot be converted,
     * or -2 when n is too small.
     */
    int (*xxx_wctomb)(conv_t conv, unsigned char *r, ucs4_t wc, int n);

    /*
     * xxx_reset(conv, r, n)
     * Stores, starting at r, a shift sequence that returns to the initial
     * state.  At most n bytes may be written; n is >= 0.
     * Returns the number of bytes written, or -2 when n is too small.
     */
    int (*xxx_reset)(conv_t conv, unsigned char *r, int n);
};
/* Type of the istate / ostate fields of struct conv_struct. */
typedef unsigned int state_t;

/*
 * Structure that a conv_t points to.  __charcode_convert__ casts an iconv_t
 * to conv_t and writes the discard_ilseq field, so the order and types of
 * the members must stay exactly as declared here.
 */
struct conv_struct {
    struct loop_funcs lfuncs;

    /* Input side (conversion multibyte -> unicode). */
    int iindex;
    struct mbtowc_funcs ifuncs;
    state_t istate;

    /* Output side (conversion unicode -> multibyte). */
    int oindex;
    struct wctomb_funcs ofuncs;
    int oflags;
    state_t ostate;

    /* Operation flags. */
    int transliterate;
    int discard_ilseq;

#ifndef LIBICONV_PLUG
    struct iconv_fallbacks fallbacks;
    struct iconv_hooks hooks;
#endif
};

////////////////////////////////////////////////////////////
/// End of excerpt
////////////////////////////////////////////////////////////

/*
 * Convert srclen bytes at src from charset `from` to charset `to`, writing
 * the result into save (capacity savelen bytes).
 *
 * When ignore_invalid_sequence is true the target charset is opened with the
 * "//IGNORE" suffix.  Independently of that flag, the first EILSEQ reported by
 * iconv() switches the descriptor into discard mode and the call is retried
 * once; a second EILSEQ without progress in between is reported as -3.
 *
 * If at least one byte of room is left after the converted data, a single
 * '\0' byte is appended (it is not counted as output).
 *
 * Returns 0 when at least one output byte was produced, otherwise:
 *   -1  NULL argument, empty input, unusable charset name, iconv_open()
 *       failure, or nothing was produced
 *   -3  invalid input sequence that could not be skipped
 *   -5  output buffer too small (including savelen <= 0)
 *   -6  any other iconv() failure
 */
int __charcode_convert__(LPCSTR from, LPCSTR to, LPSTR save, int savelen, LPSTR src, int srclen, bool ignore_invalid_sequence)
{
    iconv_t cd;
    const char *inptr = src;
    char *outptr = save;
    size_t insize;
    size_t outsize;
    int status = 0;
    int ilseq_retried = 0;

    /* Reject arguments that would otherwise be dereferenced or turned into
     * huge size_t values. */
    if (from == NULL || to == NULL || save == NULL || src == NULL || srclen <= 0)
    {
        return -1;
    }
    if (savelen <= 0)
    {
        return -5;
    }
    insize = (size_t)srclen;
    outsize = (size_t)savelen;

    if (!ignore_invalid_sequence)
    {
        cd = iconv_open(to, from);
    }
    else
    {
        char tocharset[64];
        int written = snprintf(tocharset, sizeof tocharset, "%s//IGNORE", to);

        /* A charset name that does not fit cannot be opened correctly. */
        if (written < 0 || (size_t)written >= sizeof tocharset)
        {
            return -1;
        }
        cd = iconv_open(tocharset, from);
    }

    if (cd == (iconv_t)(-1))
    {
        /* Nothing was opened, so there is nothing to close. */
        printf("iconv_open oper error!\n");
        return -1;
    }

    /* Put the descriptor into its initial state before converting. */
    iconv(cd, NULL, NULL, NULL, NULL);

    while (insize > 0)
    {
        size_t res = iconv(cd, (char **)&inptr, &insize, &outptr, &outsize);
        if (res == (size_t)(-1))
        {
            int err = errno; /* capture before another call can overwrite it */

            if (err == EINVAL)
            {
                /* Incomplete multibyte sequence at the end of the input:
                 * keep what has been converted so far. */
                break;
            }
            if (err == EILSEQ && !ilseq_retried)
            {
                /*
                 * NOTE(review): this writes a private field through the
                 * conv_struct mirror declared in this file.  It is only valid
                 * when iconv_t really is libiconv's conv_t with exactly this
                 * layout; with any other iconv implementation it corrupts
                 * memory.  libiconv offers
                 * iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, ...) for the same
                 * purpose -- consider switching to it.
                 */
                ((conv_t)cd)->discard_ilseq = 1;
                ilseq_retried = 1;
                continue;
            }

            if (err == EILSEQ)
            {
                status = -3;
            }
            else if (err == E2BIG)
            {
                status = -5;
            }
            else
            {
                status = -6;
            }
            goto done;
        }

        ilseq_retried = 0;
        lj_sleep(0, 1);
    }

    /* Flush: emit any pending shift sequence so stateful encodings end in
     * their initial state. */
    if (iconv(cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
    {
        status = (errno == E2BIG) ? -5 : -6;
        goto done;
    }

    /* Terminate the output when there is room for it. */
    if (outsize > 0)
    {
        *outptr = '\0';
    }

    /* Success means that at least one byte was written. */
    status = (outptr != save) ? 0 : -1;

done:
    iconv_close(cd);
    return status;
}

/*
 * Convert srclen bytes at src from charset `from` to charset `to` and return
 * the result in a newly allocated, zero-terminated buffer.
 *
 * The working buffer holds 4*srclen bytes of output plus one terminating
 * byte.  When the conversion fails, or srclen is negative or too large for
 * 4*srclen to fit in an int, an allocated empty string is returned instead.
 *
 * Returns NULL only when memory allocation fails.  The caller owns the
 * returned buffer and must release it.
 */
char *charcode_convert(LPCSTR from, LPCSTR to, LPSTR src, int srclen, bool ignore_invalid_sequence)
{
    /* Largest srclen for which 4*srclen still fits the int savelen argument
     * of __charcode_convert__. */
    const int max_srclen = (int)(((unsigned int)-1 >> 1) / 4);
    size_t capacity;
    char *outbuf;

    if (srclen < 0 || srclen > max_srclen)
    {
        return (char *)calloc(1, sizeof(char));
    }

    capacity = 4 * (size_t)srclen + sizeof(char);
    outbuf = (char *)calloc(capacity, sizeof(char));
    if (outbuf == NULL)
    {
        return NULL;
    }

    if (__charcode_convert__(from, to, outbuf, 4 * srclen, src, srclen, ignore_invalid_sequence) != 0)
    {
        /* Drop the partial output and hand back an empty string. */
        LJFREE(outbuf);
        outbuf = (char *)calloc(1, sizeof(char));
    }
    return outbuf;
}

/*
 * Convert the NUL-terminated UTF-8 string utf8buf to the charset named by
 * to_chna_charset via charcode_convert() and return its result (an allocated
 * buffer owned by the caller).
 *
 * A NULL utf8buf is treated as an input of length 0 instead of being passed
 * to strlen().
 */
char* utf8_to_chna(char *utf8buf, bool ignore_invalid_sequence, LPCSTR to_chna_charset)
{
    int srclen = (utf8buf != NULL) ? (int)strlen(utf8buf) : 0;

    return charcode_convert("UTF-8", to_chna_charset, utf8buf, srclen, ignore_invalid_sequence);
}

/*
 * Convert the NUL-terminated string chnabuf, encoded in the charset named by
 * frm_chna_charset, to UTF-8 via charcode_convert() and return its result (an
 * allocated buffer owned by the caller).
 *
 * A NULL chnabuf is treated as an input of length 0 instead of being passed
 * to strlen().
 */
char* chna_to_utf8(char *chnabuf, bool ignore_invalid_sequence, LPCSTR frm_chna_charset)
{
    int srclen = (chnabuf != NULL) ? (int)strlen(chnabuf) : 0;

    return charcode_convert(frm_chna_charset, "UTF-8", chnabuf, srclen, ignore_invalid_sequence);
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章