mirror of
https://gitee.com/acl-dev/acl.git
synced 2024-12-05 05:18:53 +08:00
208 lines
3.9 KiB
C++
208 lines
3.9 KiB
C++
/**
|
|
 * written by yan.zhang
|
|
*/
|
|
|
|
#include "stdafx.h"
|
|
#include "charset.h"
|
|
#include "chinese_utf8.h"
|
|
|
|
/**
 * Return the byte length of the UTF-8 sequence that starts at buf[0],
 * judged from that first byte alone (the following bytes are not read).
 *
 * @param buf pointer to at least one readable byte
 * @return 1 for an ASCII byte, 2, 3 or 4 for a multi-byte lead byte, and 1
 *  for a byte that cannot start a sequence (a continuation byte 0x80-0xBF
 *  or 0xF8-0xFF). Returning 1 for such bytes lets a caller that advances
 *  by the result re-synchronise on the very next byte instead of jumping
 *  over characters that may be valid.
 */
static inline int utf8_len(const char *buf)
{
	const unsigned char lead = (unsigned char) *buf;

	if (lead <= 0x7F)
		return 1;	/* 0xxxxxxx */
	if ((lead & 0xE0) == 0xC0)
		return 2;	/* 110xxxxx */
	if ((lead & 0xF0) == 0xE0)
		return 3;	/* 1110xxxx */
	if ((lead & 0xF8) == 0xF0)
		return 4;	/* 11110xxx */

	/* not a lead byte: step over just this one byte */
	return 1;
}
|
|
|
|
static inline int chinese_word_find(unsigned char *word)
|
|
{
|
|
int start = 0, middle, end = chinese_utf8_list_count - 1;
|
|
unsigned int mint, wint;
|
|
unsigned char *wp;
|
|
|
|
wint = (word[0] << 16) | (word[1] << 8) | (word[2]);
|
|
|
|
while (1)
|
|
{
|
|
if (start > end)
|
|
return 0;
|
|
|
|
middle = (start + end) / 2;
|
|
wp = chinese_utf8_list + middle * 3;
|
|
mint = (wp[0] << 16) | (wp[1] << 8) | (wp[2]);
|
|
|
|
if (wint < mint)
|
|
end = middle - 1;
|
|
else if (mint < wint)
|
|
start = middle + 1;
|
|
else
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int chinese_word_count(char *str, int len)
|
|
{
|
|
int i = 0, ulen, count = 0;
|
|
|
|
while (i + 2 < len)
|
|
{
|
|
ulen = utf8_len(str + i);
|
|
if (ulen != 3)
|
|
{
|
|
i += ulen;
|
|
continue;
|
|
}
|
|
|
|
count += chinese_word_find((unsigned char *) str + i);
|
|
i += ulen;
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
/*
 * Print a printf-style message when debug_mode_ is true. The name
 * debug_mode_ is resolved at the point of use, so the macro can only be
 * used where that name is in scope.
 *
 * The do { } while (0) wrapper makes an invocation followed by ';' behave
 * as exactly one statement, so it is safe inside an unbraced if/else.
 * The arguments (format string first) are forwarded through the standard
 * __VA_ARGS__ rather than a compiler-specific named variadic parameter.
 */
#define detact_debug(...) do { \
	if (debug_mode_) { \
		printf(__VA_ARGS__); \
	} \
} while (0)
|
|
|
|
/* Construct a detector with debug output switched off. */
charset_radar::charset_radar()
	: debug_mode_(false) {}
|
|
|
|
/* The destructor body is empty: nothing is released explicitly here. */
charset_radar::~charset_radar() {}
|
|
|
|
bool charset_radar::detact(const char *data, int len, acl::string &result_charset)
|
|
{
|
|
typedef struct {
|
|
int group_id; /* 瀛楃闆嗗垎缁刬d */
|
|
const char *charset;
|
|
int result_len;
|
|
} eas_charset;
|
|
|
|
eas_charset eas_cs[] = {
|
|
{1, "UTF-8", 0},
|
|
{2, "GB18030", 0},
|
|
{2, "BIG5", 0},
|
|
{0, 0, 0}
|
|
};
|
|
eas_charset *csp, *csp_find = 0;
|
|
acl::charset_conv conv;
|
|
acl::string data2;
|
|
acl::string toCharset;
|
|
int max_len = 0;
|
|
int count = 0;
|
|
int w_count;
|
|
int w_count_max = 0;
|
|
const char *result_cc = 0;
|
|
|
|
if (len < 2)
|
|
return false;
|
|
|
|
for (csp = eas_cs; csp->group_id; csp++)
|
|
{
|
|
conv.reset();
|
|
conv.set_add_invalid(false);
|
|
data2 = "";
|
|
|
|
toCharset = csp->charset;
|
|
toCharset += "//IGNORE";
|
|
if (!conv.convert(csp->charset, toCharset.c_str(), data, len, &data2))
|
|
continue;
|
|
|
|
csp->result_len = data2.length();
|
|
detact_debug("%-10s:\t%d\n", csp->charset, csp->result_len);
|
|
|
|
if(csp->result_len > max_len)
|
|
{
|
|
max_len = csp->result_len;
|
|
csp_find = csp;
|
|
}
|
|
else if (csp->result_len == max_len && csp_find
|
|
&& csp->group_id != csp_find->group_id)
|
|
{
|
|
result_charset = "UTF-8";
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (!csp_find)
|
|
{
|
|
result_charset = "UTF-8";
|
|
return true;
|
|
}
|
|
|
|
for (csp = eas_cs; csp->group_id; csp++)
|
|
{
|
|
if (csp_find->group_id != csp->group_id)
|
|
continue;
|
|
if (csp_find->result_len != csp->result_len)
|
|
continue;
|
|
count++;
|
|
}
|
|
|
|
detact_debug("count: %d\n", count);
|
|
|
|
if (count==1)
|
|
{
|
|
result_charset = csp_find->charset;
|
|
return true;
|
|
}
|
|
|
|
#if 1
|
|
if (csp_find->group_id !=2 )
|
|
{
|
|
|
|
result_charset = csp_find->charset;
|
|
return true;
|
|
}
|
|
#endif
|
|
for (csp = eas_cs; csp->group_id; csp++)
|
|
{
|
|
if (csp_find->group_id != csp->group_id)
|
|
continue;
|
|
|
|
if (csp_find->result_len != csp->result_len)
|
|
continue;
|
|
|
|
conv.reset();
|
|
conv.set_add_invalid(false);
|
|
data2 = "";
|
|
if (!conv.convert(csp->charset, "UTF-8//IGNORE", data, len, &data2))
|
|
continue;
|
|
|
|
w_count = chinese_word_count(data2.c_str(), data2.length());
|
|
detact_debug("%s, %zd, %d\n", csp->charset, data2.length(), w_count);
|
|
if ((w_count > w_count_max))
|
|
{
|
|
w_count_max = w_count;
|
|
result_cc = csp->charset;
|
|
}
|
|
}
|
|
|
|
if (!result_cc)
|
|
return false;
|
|
|
|
result_charset = result_cc;
|
|
return true;
|
|
}
|
|
|
|
/**
 * Convenience overload: run detection on the contents of an acl::string by
 * forwarding its buffer and length to the pointer/length overload.
 *
 * @param data bytes to inspect
 * @param result_charset passed through to the pointer/length overload
 * @return whatever the pointer/length overload returns
 */
bool charset_radar::detact(const acl::string &data, acl::string &result_charset)
{
	const char *bytes = data.c_str();
	const int nbytes = (int) data.length();

	return detact(bytes, nbytes, result_charset);
}
|