Add iconv app for converting the charset of files.

This commit is contained in:
zhengshuxin 2017-03-13 22:57:46 +08:00
parent e2ec8096f5
commit d2128e6f70
12 changed files with 788 additions and 5 deletions

2
app/iconv/Makefile Normal file
View File

@ -0,0 +1,2 @@
include ../Makefile.in
PROG = iconv

207
app/iconv/charset.cpp Normal file
View File

@ -0,0 +1,207 @@
/**
* written by yan.zhang
*/
#include "stdafx.h"
#include "charset.h"
#include "chinese_utf8.h"
/**
 * Return how many bytes to advance for the UTF-8 sequence starting at buf[0],
 * judged from the lead byte only (trailing bytes are not validated).
 * @param buf pointer to at least one readable byte
 * @return 1..4 for a valid lead byte; 1 for a byte that cannot start a
 *  sequence (a continuation byte 0x80-0xBF or 0xF8-0xFF), so that the
 *  caller resynchronises on the very next byte instead of jumping over
 *  several bytes that may hold valid characters
 */
static inline int utf8_len(char *buf)
{
	const unsigned char lead = *(unsigned char *) buf;

	if (lead <= 0x7F)
		return 1;		/* ASCII */
	if ((lead & 0xF8) == 0xF0)
		return 4;		/* 11110xxx */
	if ((lead & 0xF0) == 0xE0)
		return 3;		/* 1110xxxx */
	if ((lead & 0xE0) == 0xC0)
		return 2;		/* 110xxxxx */

	/* not a lead byte: step over it alone */
	return 1;
}
/**
 * Binary-search the chinese_utf8_list table for a 3-byte UTF-8 sequence.
 * Table entries are read as consecutive 3-byte records and compared as
 * 24-bit big-endian integers; the search presumably relies on the table
 * being sorted ascending in that order -- TODO confirm against
 * chinese_utf8.h.
 * @param word pointer to at least 3 readable bytes
 * @return 1 when the sequence is in the table, 0 otherwise
 */
static inline int chinese_word_find(unsigned char *word)
{
	const unsigned int key = (word[0] << 16) | (word[1] << 8) | (word[2]);
	int lo = 0;
	int hi = chinese_utf8_list_count - 1;

	while (lo <= hi) {
		const int mid = (lo + hi) / 2;
		const unsigned char *entry = chinese_utf8_list + mid * 3;
		const unsigned int probe =
			(entry[0] << 16) | (entry[1] << 8) | (entry[2]);

		if (key < probe)
			hi = mid - 1;
		else if (probe < key)
			lo = mid + 1;
		else
			return 1;
	}

	return 0;
}
/**
 * Count the 3-byte UTF-8 sequences in str that chinese_word_find() accepts.
 * Scanning stops as soon as fewer than 3 bytes remain.
 * @param str UTF-8 text to scan
 * @param len number of bytes in str
 * @return number of matching sequences
 */
static int chinese_word_count(char *str, int len)
{
	int hits = 0;
	int pos = 0;

	while (pos + 2 < len) {
		const int step = utf8_len(str + pos);

		/* only 3-byte sequences are looked up in the table */
		if (step == 3)
			hits += chinese_word_find((unsigned char *) str + pos);
		pos += step;
	}

	return hits;
}
/*
 * printf-style debug output that is emitted only when debug_mode_ is true.
 * It reads debug_mode_ directly, so it can only be used where that name is
 * in scope. The "args..." parameter form is the GNU named-variadic-macro
 * extension.
 */
#define detact_debug(fmt, args...) { if (debug_mode_) { \
printf(fmt, ##args); \
} }
/** Create a detector with debug output switched off. */
charset_radar::charset_radar()
	: debug_mode_(false)
{
}
/** Nothing to release. */
charset_radar::~charset_radar()
{
}
/**
 * Guess which of the candidate charsets (UTF-8, GB18030, BIG5) a byte buffer
 * is encoded in.
 *
 * Outline:
 *  1. Convert the data from each candidate charset to itself with
 *     "//IGNORE" appended to the target, and record the output length.
 *     The candidate with the longest output is the provisional winner.
 *  2. If only one candidate of the winner's group reached that length,
 *     or the winner is not in group 2, the winner is the answer.
 *  3. Otherwise convert the data from each tied candidate to UTF-8 and
 *     pick the one yielding the most characters found by
 *     chinese_word_count().
 *
 * @param data bytes to inspect
 * @param len number of bytes in data
 * @param result_charset receives the name of the guessed charset
 * @return true when result_charset was set; false when len < 2 or when no
 *  tied candidate produced a positive chinese_word_count()
 */
bool charset_radar::detact(const char *data, int len, acl::string &result_charset)
{
typedef struct {
int group_id; /* charset group id */
const char *charset;
int result_len;
} eas_charset;
/* candidate table; the entry with group_id 0 terminates the loops below */
eas_charset eas_cs[] = {
{1, "UTF-8", 0},
{2, "GB18030", 0},
{2, "BIG5", 0},
{0, 0, 0}
};
eas_charset *csp, *csp_find = 0;
acl::charset_conv conv;
acl::string data2;
acl::string toCharset;
int max_len = 0;
int count = 0;
int w_count;
int w_count_max = 0;
const char *result_cc = 0;
if (len < 2)
return false;
/* pass 1: self-conversion of every candidate, remember the longest output */
for (csp = eas_cs; csp->group_id; csp++)
{
conv.reset();
conv.set_add_invalid(false);
data2 = "";
toCharset = csp->charset;
toCharset += "//IGNORE";
if (!conv.convert(csp->charset, toCharset.c_str(), data, len, &data2))
continue;
csp->result_len = data2.length();
detact_debug("%-10s:\t%d\n", csp->charset, csp->result_len);
if(csp->result_len > max_len)
{
max_len = csp->result_len;
csp_find = csp;
}
/* a tie between candidates of different groups is reported as UTF-8 */
else if (csp->result_len == max_len && csp_find
&& csp->group_id != csp_find->group_id)
{
result_charset = "UTF-8";
return true;
}
}
/* no candidate produced any output: report UTF-8 */
if (!csp_find)
{
result_charset = "UTF-8";
return true;
}
/* pass 2: count the candidates of the winner's group that tie with it */
for (csp = eas_cs; csp->group_id; csp++)
{
if (csp_find->group_id != csp->group_id)
continue;
if (csp_find->result_len != csp->result_len)
continue;
count++;
}
detact_debug("count: %d\n", count);
if (count==1)
{
result_charset = csp_find->charset;
return true;
}
#if 1
/* only group 2 (GB18030 / BIG5) goes through the tie-break below */
if (csp_find->group_id !=2 )
{
result_charset = csp_find->charset;
return true;
}
#endif
/* pass 3: tie-break by the number of table characters after conversion to UTF-8 */
for (csp = eas_cs; csp->group_id; csp++)
{
if (csp_find->group_id != csp->group_id)
continue;
if (csp_find->result_len != csp->result_len)
continue;
conv.reset();
conv.set_add_invalid(false);
data2 = "";
if (!conv.convert(csp->charset, "UTF-8//IGNORE", data, len, &data2))
continue;
w_count = chinese_word_count(data2.c_str(), data2.length());
detact_debug("%s, %zd, %d\n", csp->charset, data2.length(), w_count);
/* strictly greater: on equal counts the earlier candidate is kept */
if ((w_count > w_count_max))
{
w_count_max = w_count;
result_cc = csp->charset;
}
}
/* no tied candidate had a positive count */
if (!result_cc)
return false;
result_charset = result_cc;
return true;
}
/**
 * Convenience overload: run the detection on the bytes held by an
 * acl::string.
 * @param data buffer to inspect
 * @param result_charset receives the name of the guessed charset
 * @return whatever the (const char*, int) overload returns
 */
bool charset_radar::detact(const acl::string &data, acl::string &result_charset)
{
	const char *bytes = data.c_str();
	const int nbytes = data.length();

	return detact(bytes, nbytes, result_charset);
}

31
app/iconv/charset.h Normal file
View File

@ -0,0 +1,31 @@
#pragma once
/**
 * Heuristic charset detector: guesses which charset a byte buffer is
 * encoded in.
 */
class charset_radar
{
public:
charset_radar(void);
~charset_radar(void);
/**
* Guess the charset of a raw byte buffer.
* @param data bytes to inspect
* @param len number of bytes in data
* @param charset_result receives the name of the guessed charset
* @return {bool} true when a guess was stored in charset_result
*/
bool detact(const char *data, int len, acl::string &charset_result);
/**
* Overload taking the bytes to inspect from an acl::string.
* @param data buffer to inspect
* @param charset_result receives the name of the guessed charset
* @return {bool} true when a guess was stored in charset_result
*/
bool detact(const acl::string &data, acl::string &charset_result);
/*
* Switch the detector's debug printing on or off.
*/
void setDebugMode(bool flag)
{
debug_mode_ = flag;
}
private:
// when true, detact() prints intermediate results
bool debug_mode_;
};
//bool format_utf8(const char *str, int len, acl::string &out);

View File

@ -0,0 +1,380 @@
#include "stdafx.h"
#include "charset.h"
#include "charset_transfer.h"
// Path separator used when building file paths: backslash on WIN32, slash elsewhere.
#ifdef WIN32
#define SEP '\\'
#else
#define SEP '/'
#endif
// Strip a leading "./" or ".\" from a path by advancing ptr two characters.
// (The original note gives WIN32 as the reason, but the sentence is cut off.)
// ptr must be a modifiable pointer variable; it is evaluated several times.
#define SKIP(ptr) do \
{ \
if (*ptr == '.' && *(ptr + 1) == '/') \
ptr += 2; \
else if (*ptr == '.' && *(ptr + 1) == '\\') \
ptr += 2; \
} while (0)
// The three bytes EF BB BF: the UTF-8 byte order mark looked for at the start of files and optionally written to converted output.
static const char UTF8_HEADER[] = { (char) 0xEF, (char) 0xBB, (char) 0xBF };
/**
 * Set the charset the source files are converted from.
 * @param charset charset name
 * @return this object, so that setters can be chained
 */
charset_transfer& charset_transfer::set_from_charset(const char* charset)
{
	this->from_charset_ = charset;
	return *this;
}
/**
 * Set the charset the files are converted to.
 * @param charset charset name
 * @return this object, so that setters can be chained
 */
charset_transfer& charset_transfer::set_to_charset(const char* charset)
{
	this->to_charset_ = charset;
	return *this;
}
/**
 * Set the directory that is scanned for source files.
 * @param path directory path
 * @return this object, so that setters can be chained
 */
charset_transfer& charset_transfer::set_from_path(const char* path)
{
	this->from_path_ = path;
	return *this;
}
/**
 * Set the directory under which converted files are written.
 * @param path directory path
 * @return this object, so that setters can be chained
 */
charset_transfer& charset_transfer::set_to_path(const char* path)
{
	this->to_path_ = path;
	return *this;
}
/**
 * Choose whether converted output gets a UTF-8 BOM; the flag is only
 * consulted when the target charset is utf-8 / utf8.
 * @param yes true to write the BOM
 * @return this object, so that setters can be chained
 */
charset_transfer& charset_transfer::set_utf8bom(bool yes)
{
	this->utf8_bom_ = yes;
	return *this;
}
/**
 * Verify that both charsets and both paths have been set.
 * Only the first missing setting is logged.
 * @return true when all four settings are non-empty
 */
bool charset_transfer::check_params()
{
	bool complete = false;

	if (from_charset_.empty()) {
		logger_error("call set_from_charset first!");
	} else if (to_charset_.empty()) {
		logger_error("call set_to_charset first!");
	} else if (from_path_.empty()) {
		logger_error("call set_from_path first!");
	} else if (to_path_.empty()) {
		logger_error("call set_to_path first!");
	} else {
		complete = true;
	}

	return complete;
}
/**
 * Build the source path, destination directory and destination path for the
 * file the scanner currently points at.
 * @param scan directory scanner; its curr_path() supplies the directory part
 * @param filename name of the current file
 * @param from_filepath appended with the source file path
 * @param to_filepath appended with the destination file path
 * @param to_path appended with the destination directory
 * @return false when the scanner has no current path or the file name does
 *  not end in one of the accepted extensions (from_filepath has already
 *  been filled in that second case); true otherwise
 */
bool charset_transfer::get_filepath(acl::scan_dir& scan, const char* filename,
	acl::string& from_filepath, acl::string& to_filepath,
	acl::string& to_path)
{
	const char* rpath = scan.curr_path();
	if (rpath == NULL) {
		logger_error("curr_path NULL, filename: %s", filename);
		return false;
	}

	/* drop a leading "./" or ".\" from both parts */
	SKIP(rpath);
	SKIP(filename);

	if (*rpath != 0)
		from_filepath << rpath << SEP;
	from_filepath << filename;

#if 0
	if (strstr(from_filepath.c_str(), ".svn") != NULL
		|| strstr(from_filepath.c_str(), ".git") != NULL
		|| strstr(from_filepath.c_str(), ".cvs") != NULL
		|| strstr(from_filepath.c_str(), ".inc") != NULL
		|| strstr(from_filepath.c_str(), ".exe") != NULL
		|| strstr(from_filepath.c_str(), ".class") != NULL
		|| strstr(from_filepath.c_str(), ".zip") != NULL
		|| strstr(from_filepath.c_str(), ".rar") != NULL
		|| strstr(from_filepath.c_str(), ".tar") != NULL
		|| strstr(from_filepath.c_str(), ".tar.gz") != NULL
		|| strstr(from_filepath.c_str(), ".tgz") != NULL
		|| strstr(from_filepath.c_str(), ".bzip2") != NULL
		|| strstr(from_filepath.c_str(), ".o") != NULL)
	{
		logger("skip %s", from_filepath.c_str());
		return false;
	}
#else
	/* only C/C++ sources and headers are accepted */
	static const char* files_ext[] = {
		".c", ".h", ".cpp", ".hpp", ".cxx", ".hxx", NULL,
	};

	const char** ext = files_ext;
	while (*ext != NULL
		&& from_filepath.rncompare(*ext, strlen(*ext), false) != 0)
	{
		++ext;
	}
	if (*ext == NULL)
		return false;
#endif

	to_path << to_path_ << SEP << rpath;
	to_filepath << to_path << SEP << filename;
	return true;
}
/**
 * Guess the charset of a buffer and compare it with the wanted one.
 *
 * A buffer starting with the UTF-8 BOM is taken to be "utf-8"; otherwise the
 * guess comes from charset_radar::detact().
 * @param buf file content to inspect
 * @param charset charset name the caller expects
 * @param res receives the guessed charset name, or "uknown" when detection
 *  fails
 * @return true when the guess matches charset (case-insensitively; a UTF-8
 *  guess also matches "utf8", a GB18030 guess also matches "gbk" and
 *  "gb2312"); false otherwise
 */
bool charset_transfer::check_buff(const acl::string& buf, const char* charset,
	acl::string& res)
{
	// Only look at the first three bytes when they exist: buf may be
	// empty or shorter than the BOM (e.g. an empty file from check_file).
	const bool has_bom = buf.length() >= sizeof(UTF8_HEADER)
		&& buf[0] == UTF8_HEADER[0]
		&& buf[1] == UTF8_HEADER[1]
		&& buf[2] == UTF8_HEADER[2];

	if (has_bom)
	{
		res = "utf-8";
	}
	else
	{
		charset_radar r;
		if (r.detact(buf, res) == false)
		{
			res = "uknown";
			return false;
		}
	}

#define EQ !strcasecmp

	if (res.equal("UTF-8", false)
		&& (EQ(charset, "utf-8") || EQ(charset, "utf8")))
	{
		return true;
	}
	if (res.equal("GB18030", false)
		&& (EQ(charset, "gbk") || EQ(charset, "gb2312")))
	{
		return true;
	}
	return res.equal(charset, false);
}
bool charset_transfer::check_file(const char* filepath,
const char* charset)
{
acl::string buf;
if (acl::ifstream::load(filepath, &buf) == false)
{
logger_error("load %s error %s", filepath, acl::last_serror());
return false;
}
acl::string res;
if (check_buff(buf, charset, res) == false)
{
logger("%s, guess: %s, want: %s",
filepath, res.c_str(), charset);
return false;
}
return true;
}
/**
 * Run check_file() on every file the scanner yields under path.
 * @param path directory to scan (opened with the recursive flag set)
 * @param charset charset name the files are expected to be in
 * @return number of files that passed check_file(), or -1 when the
 *  directory cannot be opened
 */
int charset_transfer::check_path(const char* path, const char* charset)
{
	acl::scan_dir scan;
	if (!scan.open(path, true)) {
		logger_error("open %s error %s", path, acl::last_serror());
		return -1;
	}

	int matched = 0;
	for (const char* filepath = scan.next_file(true); filepath != NULL;
		filepath = scan.next_file(true))
	{
		if (check_file(filepath, charset))
			++matched;
	}
	return matched;
}
/**
 * Convert every accepted file found under from_path_ and write the results
 * under to_path_, creating destination directories as needed.
 * @param recursive passed to the directory scanner when opening from_path_
 * @return number of files written successfully; 0 when source and target
 *  charsets are the same; -1 when a setting is missing or from_path_ cannot
 *  be opened
 */
int charset_transfer::transfer(bool recursive /* = true */)
{
	if (!check_params())
		return -1;

	if (from_charset_.equal(to_charset_, false)) {
		logger("to_charset_ is same as from_charset_(%s)",
			from_charset_.c_str());
		return 0;
	}

	acl::scan_dir scan;
	if (!scan.open(from_path_, recursive)) {
		logger_error("open dir %s error %s", from_path_.c_str(),
			acl::last_serror());
		return -1;
	}

	int converted = 0;
	for (;;) {
		const char* filename = scan.next_file(false);
		if (filename == NULL)
			break;

		acl::string from_filepath, to_filepath, to_path;
		if (!get_filepath(scan, filename, from_filepath,
			to_filepath, to_path))
		{
			continue;
		}

		/* create the destination directory only when it is missing */
		const bool dir_missing = access(to_path.c_str(), 0) != 0;
		if (dir_missing && acl_make_dirs(to_path.c_str(), 0755) == -1) {
			logger_error("acl_make_dirs %s error %s",
				to_path.c_str(), acl::last_serror());
			continue;
		}

		if (!transfer(from_filepath, to_filepath))
			continue;

		logger("transfer to %s OK!", to_filepath.c_str());
		++converted;
	}
	return converted;
}
/**
 * Convert one file from from_charset_ to to_charset_ and write it to
 * to_file.
 *
 * The content is copied unchanged (through save_to) when check_buff()
 * reports it is already in the target charset, when it starts with a UTF-8
 * BOM although from_charset_ is not UTF-8, or when the conversion fails.
 * @param from_file path of the file to read
 * @param to_file path of the file to write
 * @return false when a charset is unset, both charsets are equal, the source
 *  cannot be loaded or is empty, or the destination cannot be opened or
 *  written; otherwise true (or the result of save_to for the copy cases)
 */
bool charset_transfer::transfer(const char* from_file, const char* to_file)
{
	if (from_charset_.empty())
	{
		logger_error("from_charset_ empty, file_path: %s", from_file);
		return false;
	}
	if (to_charset_.empty())
	{
		logger_error("to_charset_ empty, file_path: %s", from_file);
		return false;
	}
	if (to_charset_.equal(from_charset_, false))
	{
		logger("charset is same: %s, file_path: %s",
			to_charset_.c_str(), from_file);
		return false;
	}

	acl::string buf;
	if (acl::ifstream::load(from_file, &buf) == false)
	{
		logger_error("load file %s error %s", from_file,
			acl::last_serror());
		return false;
	}
	if (buf.empty())
	{
		logger("file empty, file_path: %s", from_file);
		return false;
	}

	// already in the target charset: plain copy
	acl::string charset_res;
	if (check_buff(buf, to_charset_, charset_res))
		return save_to(buf, to_file);

	// printf("to_charset_: %s, charset_res: %s\r\n",
	//	to_charset_.c_str(), charset_res.c_str());

	if (!from_charset_.equal("utf-8", false) &&
		!from_charset_.equal("utf8", false))
	{
		// buf is only known to be non-empty here, so make sure all
		// three BOM bytes exist before indexing them
		if (buf.length() >= sizeof(UTF8_HEADER)
			&& buf[0] == UTF8_HEADER[0]
			&& buf[1] == UTF8_HEADER[1]
			&& buf[2] == UTF8_HEADER[2])
		{
			logger_warn("skip %s, utf8 header in no utf8 file, %s",
				from_file, from_charset_.c_str());
			return save_to(buf, to_file);
		}
	}

	acl::charset_conv conv;
	acl::string res;
	if (conv.convert(from_charset_, to_charset_, buf.c_str(),
		buf.size(), &res) == false)
	{
		// conversion failed: keep the original bytes in the output
		logger_error("charset convert error: %s, file: %s",
			conv.serror(), from_file);
		return save_to(buf, to_file);
	}

	acl::ofstream fp;
	if (fp.open_write(to_file) == false)
	{
		logger_error("open_write %s error %s", to_file,
			acl::last_serror());
		return false;
	}

	// optional BOM, only for a UTF-8 target
	if ((to_charset_.equal("utf-8", false)
		|| to_charset_.equal("utf8", false)) && utf8_bom_)
	{
		if (fp.write(UTF8_HEADER, 3) == -1)
		{
			logger_error("write UTF8_HEADER error %s, file: %s",
				acl::last_serror(), to_file);
			return false;
		}
	}

	if (fp.write(res) == -1)
	{
		logger_error("write to %s error %s",
			to_file, acl::last_serror());
		return false;
	}
	return true;
}
/**
 * Write a buffer to a file as is.
 * @param buf content to write
 * @param to_file path of the file to write
 * @return true on success; false when the file cannot be opened for writing
 *  or the write fails (both are logged)
 */
bool charset_transfer::save_to(const acl::string& buf, const char* to_file)
{
	acl::ofstream out;

	if (!out.open_write(to_file)) {
		logger_error("open_write %s error %s", to_file,
			acl::last_serror());
		return false;
	}

	if (out.write(buf) == -1) {
		logger_error("write to %s error %s",
			to_file, acl::last_serror());
		return false;
	}

	return true;
}

View File

@ -0,0 +1,35 @@
#pragma once
/**
 * Converts the files of a directory tree from one charset to another, and
 * offers static helpers that only check which charset files appear to use.
 * The set_* methods return the object itself so they can be chained.
 */
class charset_transfer
{
public:
charset_transfer(void) : utf8_bom_(false) {}
~charset_transfer(void) {}
// charset the source files are converted from
charset_transfer& set_from_charset(const char* charset);
// charset the files are converted to
charset_transfer& set_to_charset(const char* charset);
// directory scanned for source files
charset_transfer& set_from_path(const char* path);
// directory under which converted files are written
charset_transfer& set_to_path(const char* path);
// whether a UTF-8 BOM is written when the target charset is utf-8 / utf8
charset_transfer& set_utf8bom(bool yes);
// Convert the files under the source path; returns the number of files
// written, 0 when both charsets are equal, or -1 on a missing setting or
// when the source directory cannot be opened.
int transfer(bool recursive = true);
// Guess the charset of buf into res and report whether it matches charset.
static bool check_buff(const acl::string& buf, const char* charset,
acl::string& res);
// Load filepath and report whether its guessed charset matches charset.
static bool check_file(const char* filepath, const char* charset);
// Check every file under path; returns the number of matching files, or
// -1 when path cannot be opened.
static int check_path(const char* path, const char* charset);
private:
acl::string from_charset_;
acl::string to_charset_;
acl::string from_path_;
acl::string to_path_;
bool utf8_bom_;
// true when both charsets and both paths have been set
bool check_params(void);
// Build source path, destination path and destination directory for the
// scanner's current file; false when the file is to be skipped.
bool get_filepath(acl::scan_dir& scan, const char* filename,
acl::string& from_filepath, acl::string& to_filepath,
acl::string& to_path);
// convert a single file
bool transfer(const char* from_file, const char* to_file);
// write buf to to_file unchanged
bool save_to(const acl::string& buf, const char* to_file);
};

6
app/iconv/chinese_utf8.h Normal file

File diff suppressed because one or more lines are too long

96
app/iconv/main.cpp Normal file
View File

@ -0,0 +1,96 @@
#include "stdafx.h"
#include "charset_transfer.h"
/**
 * Print the command line help to stdout.
 * @param procname program name shown in the first line
 */
static void usage(const char* procname)
{
	static const char options[] =
		" -f from_charset\r\n"
		" -t to_charset\r\n"
		" -b [when to_charset is utf-8 if BOM header be added]\r\n"
		" -s source_dir\r\n"
		" -d destination_dir\r\n"
		" -c [just check charset only]\r\n";

	printf("usage: %s -h [help]\r\n%s", procname, options);
}
/**
 * Entry point: parse the options, then either check the charset of the files
 * under the source directory (-c) or convert them into the destination
 * directory.
 */
int main(int argc, char* argv[])
{
	static const char* const opts = "hf:t:bs:d:c";

	acl::string from_charset, to_charset, from_dir, to_dir;
	bool use_bom = false;
	bool check_only = false;

	int ch;
	while ((ch = getopt(argc, argv, opts)) > 0) {
		if (ch == 'h') {
			usage(argv[0]);
			return 0;
		} else if (ch == 'f') {
			from_charset = optarg;
		} else if (ch == 't') {
			to_charset = optarg;
		} else if (ch == 'b') {
			use_bom = true;
		} else if (ch == 's') {
			from_dir = optarg;
		} else if (ch == 'd') {
			to_dir = optarg;
		} else if (ch == 'c') {
			check_only = true;
		}
		/* any other option is ignored */
	}

	acl::log::stdout_open(true);

	/* check-only mode needs just the source directory and its charset */
	if (check_only) {
		if (from_dir.empty() || from_charset.empty()) {
			printf("from_charset or from_dir not set\r\n");
			return 1;
		}

		int n = charset_transfer::check_path(from_dir, from_charset);
		printf("check over: %d, charset: %s\r\n",
			n, from_charset.c_str());
		return 0;
	}

	if (from_charset.empty() || to_charset.empty()
		|| from_dir.empty() || to_dir.empty())
	{
		usage(argv[0]);
		return 0;
	}

	/* the scan runs on "." after changing into the source directory */
	if (chdir(from_dir.c_str()) == -1) {
		logger_error("chdir to %s error %s", from_dir.c_str(),
			acl::last_serror());
		return -1;
	}

	charset_transfer transfer;
	transfer.set_from_charset(from_charset);
	transfer.set_to_charset(to_charset);
	transfer.set_from_path(".");
	transfer.set_to_path(to_dir);
	transfer.set_utf8bom(use_bom);

	int n = transfer.transfer();
	printf("transfer over: %d, from_charset: %s, to_charset: %s,"
		" from_path: %s, to_path: %s\r\n", n, from_charset.c_str(),
		to_charset.c_str(), from_dir.c_str(), to_dir.c_str());
	return 0;
}

8
app/iconv/stdafx.cpp Normal file
View File

@ -0,0 +1,8 @@
// stdafx.cpp : 只包括标准包含文件的源文件
// wizard.pch 将成为预编译头
// stdafx.obj 将包含预编译类型信息
#include "stdafx.h"
// TODO: 在 STDAFX.H 中
//引用任何所需的附加头文件,而不是在此文件中引用

14
app/iconv/stdafx.h Normal file
View File

@ -0,0 +1,14 @@
// stdafx.h : 标准系统包含文件的包含文件,
// 或是常用但不常更改的项目特定的包含文件
//
#pragma once
//#include <iostream>
//#include <tchar.h>
// TODO: 在此处引用程序要求的附加头文件
#include "lib_acl.h"
#include "acl_cpp/lib_acl.hpp"

View File

@ -268,7 +268,6 @@ ACL_CFG_PARSER *acl_cfg_parser_load(const char *pathname, const char *delimiter)
/* first, skip all ' ' and '\t' */
SKIP(ptr, (*ptr == ' ' || *ptr == '\t'));
/* <20><><EFBFBD><EFBFBD><EFBFBD>洢ע<E6B4A2><D7A2><EFBFBD><EFBFBD> */
if (*ptr == '#') { /* the comment line */
SKIP(ptr, *ptr != '\n'); /* find the line's end */
if (*ptr) { /* this must be '\n' */
@ -288,7 +287,6 @@ ACL_CFG_PARSER *acl_cfg_parser_load(const char *pathname, const char *delimiter)
cfg_line->line_number = parser->total_line;
continue;
} else if (*ptr == '\r' || *ptr == '\n') {
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
/* SKIP(ptr, (*ptr == '\r' || *ptr == '\n')); */
if (*ptr == '\r' && *(ptr + 1) == '\n') {
*ptr = 0; /* set '\0' first and go on */
@ -312,7 +310,6 @@ ACL_CFG_PARSER *acl_cfg_parser_load(const char *pathname, const char *delimiter)
continue;
}
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ч<EFBFBD><D0A7> */
pline_begin = ptr; /* reset the line header */
/* find the line's end */
@ -355,7 +352,6 @@ ACL_CFG_PARSER *acl_cfg_parser_load(const char *pathname, const char *delimiter)
#endif
}
/* <20>ͷ<EFBFBD> ACL_CFG_LINE <20><><EFBFBD><EFBFBD><EFBFBD>õĻص<C4BB><D8B5><EFBFBD><EFBFBD><EFBFBD> */
static void _cfg_line_free(void *arg)
{
ACL_CFG_LINE *cfg_line;

View File

@ -123,6 +123,14 @@ static bool cmp_copy(acl::scan_dir& scan, const char* name,
if (strstr(from_filepath.c_str(), ".svn") != NULL
|| strstr(from_filepath.c_str(), ".git") != NULL
|| strstr(from_filepath.c_str(), ".cvs") != NULL
|| strstr(from_filepath.c_str(), ".inc") != NULL
|| strstr(from_filepath.c_str(), ".exe") != NULL
|| strstr(from_filepath.c_str(), ".zip") != NULL
|| strstr(from_filepath.c_str(), ".rar") != NULL
|| strstr(from_filepath.c_str(), ".tar") != NULL
|| strstr(from_filepath.c_str(), ".tar.gz") != NULL
|| strstr(from_filepath.c_str(), ".tgz") != NULL
|| strstr(from_filepath.c_str(), ".bzip2") != NULL
|| strstr(from_filepath.c_str(), ".o") != NULL)
{
return true;

View File

@ -236,7 +236,7 @@ redis_result& redis_result::put(const redis_result* rr, size_t idx)
else if (idx == 0)
children_idx_ = 0;
// +1 是为了确保最后一个数组元素可以被设为 NULL
// +1 是为了确保最后一个数组元素可以被设为 NULL
if (children_idx_ + 1 < children_size_)
{
children_[children_idx_++] = rr;