acl/app/iconv/charset.cpp

208 lines
3.9 KiB
C++

/**
* writen by yan.zhang
*/
#include "stdafx.h"
#include "charset.h"
#include "chinese_utf8.h"
static inline int utf8_len(char *buf)
{
unsigned char *ptr;
int ret;
ptr = (unsigned char *)buf;
if (((*ptr) <= 0x7F))
ret = 1;
else if (((*ptr) & 0xF0) == 0xF0)
ret = 4;
else if (((*ptr) & 0xE0) == 0xE0)
ret = 3;
else if (((*ptr) & 0xC0) == 0xC0)
ret = 2;
else
ret = 5;
return ret;
}
static inline int chinese_word_find(unsigned char *word)
{
int start = 0, middle, end = chinese_utf8_list_count - 1;
unsigned int mint, wint;
unsigned char *wp;
wint = (word[0] << 16) | (word[1] << 8) | (word[2]);
while (1)
{
if (start > end)
return 0;
middle = (start + end) / 2;
wp = chinese_utf8_list + middle * 3;
mint = (wp[0] << 16) | (wp[1] << 8) | (wp[2]);
if (wint < mint)
end = middle - 1;
else if (mint < wint)
start = middle + 1;
else
return 1;
}
return 0;
}
static int chinese_word_count(char *str, int len)
{
int i = 0, ulen, count = 0;
while (i + 2 < len)
{
ulen = utf8_len(str + i);
if (ulen != 3)
{
i += ulen;
continue;
}
count += chinese_word_find((unsigned char *) str + i);
i += ulen;
}
return count;
}
#define detact_debug(fmt, args...) { if (debug_mode_) { \
printf(fmt, ##args); \
} }
charset_radar::charset_radar(void)
: debug_mode_(false)
{
}
charset_radar::~charset_radar(void)
{
}
bool charset_radar::detact(const char *data, int len, acl::string &result_charset)
{
typedef struct {
int group_id; /* 字符集分组id */
const char *charset;
int result_len;
} eas_charset;
eas_charset eas_cs[] = {
{1, "UTF-8", 0},
{2, "GB18030", 0},
{2, "BIG5", 0},
{0, 0, 0}
};
eas_charset *csp, *csp_find = 0;
acl::charset_conv conv;
acl::string data2;
acl::string toCharset;
int max_len = 0;
int count = 0;
int w_count;
int w_count_max = 0;
const char *result_cc = 0;
if (len < 2)
return false;
for (csp = eas_cs; csp->group_id; csp++)
{
conv.reset();
conv.set_add_invalid(false);
data2 = "";
toCharset = csp->charset;
toCharset += "//IGNORE";
if (!conv.convert(csp->charset, toCharset.c_str(), data, len, &data2))
continue;
csp->result_len = data2.length();
detact_debug("%-10s:\t%d\n", csp->charset, csp->result_len);
if(csp->result_len > max_len)
{
max_len = csp->result_len;
csp_find = csp;
}
else if (csp->result_len == max_len && csp_find
&& csp->group_id != csp_find->group_id)
{
result_charset = "UTF-8";
return true;
}
}
if (!csp_find)
{
result_charset = "UTF-8";
return true;
}
for (csp = eas_cs; csp->group_id; csp++)
{
if (csp_find->group_id != csp->group_id)
continue;
if (csp_find->result_len != csp->result_len)
continue;
count++;
}
detact_debug("count: %d\n", count);
if (count==1)
{
result_charset = csp_find->charset;
return true;
}
#if 1
if (csp_find->group_id !=2 )
{
result_charset = csp_find->charset;
return true;
}
#endif
for (csp = eas_cs; csp->group_id; csp++)
{
if (csp_find->group_id != csp->group_id)
continue;
if (csp_find->result_len != csp->result_len)
continue;
conv.reset();
conv.set_add_invalid(false);
data2 = "";
if (!conv.convert(csp->charset, "UTF-8//IGNORE", data, len, &data2))
continue;
w_count = chinese_word_count(data2.c_str(), data2.length());
detact_debug("%s, %zd, %d\n", csp->charset, data2.length(), w_count);
if ((w_count > w_count_max))
{
w_count_max = w_count;
result_cc = csp->charset;
}
}
if (!result_cc)
return false;
result_charset = result_cc;
return true;
}
bool charset_radar::detact(const acl::string &data, acl::string &result_charset)
{
return detact(data.c_str(), data.length(), result_charset);
}