引入friso_config_t, 适合多线程多配置使用

This commit is contained in:
狮子的魂 2013-11-23 19:13:34 +08:00
parent 88a65368f2
commit 8fee218b79
4 changed files with 1067 additions and 1037 deletions

View File

@ -21,17 +21,6 @@ FRISO_API friso_t friso_new( void )
___ALLOCATION_ERROR___
}
//initialize the entry
e->max_len = DEFAULT_SEGMENT_LENGTH;
e->r_name = 1;
e->mix_len = DEFAULT_MIX_LENGTH;
e->lna_len = DEFAULT_LNA_LENGTH;
e->add_syn = 1;
e->clr_stw = 0;
e->keep_urec = 0;
e->spx_out = 0;
e->nthreshold = DEFAULT_NTHRESHOLD;
e->mode = ( friso_mode_t ) DEFAULT_SEGMENT_MODE;
e->dic = NULL;
return e;
@ -39,13 +28,15 @@ FRISO_API friso_t friso_new( void )
/* }}} */
/* {{{ initialize a friso entry and its configuration from a configuration file.
*
* @return 1 on success and 0 on failure.
*/
FRISO_API friso_t friso_new_from_ifile( fstring __ifile )
FRISO_API int friso_init_from_ifile(
friso_t friso, friso_config_t config, fstring __ifile )
{
FILE *__stream;
char __chars__[256], __key__[128], *__line__, __lexi__[128];
uint_t i, t, __hit__ = 0, __length__;
friso_t e = friso_new();
if ( ( __stream = fopen( __ifile, "rb" ) ) != NULL ) {
@ -97,25 +88,25 @@ FRISO_API friso_t friso_new_from_ifile( fstring __ifile )
__lexi__[t] = '\0';
}
} else if ( strcmp( __key__, "friso.max_len" ) == 0 ) {
e->max_len = ( ushort_t ) atoi( __line__ );
config->max_len = ( ushort_t ) atoi( __line__ );
} else if ( strcmp( __key__, "friso.r_name" ) == 0 ) {
e->r_name = ( ushort_t ) atoi( __line__ );
config->r_name = ( ushort_t ) atoi( __line__ );
} else if ( strcmp( __key__, "friso.mix_len" ) == 0 ) {
e->mix_len = ( ushort_t ) atoi( __line__ );
config->mix_len = ( ushort_t ) atoi( __line__ );
} else if ( strcmp( __key__, "friso.lna_len" ) == 0 ) {
e->lna_len = ( ushort_t ) atoi( __line__ );
config->lna_len = ( ushort_t ) atoi( __line__ );
} else if ( strcmp( __key__, "friso.add_syn" ) == 0 ) {
e->add_syn = ( ushort_t ) atoi( __line__ );
config->add_syn = ( ushort_t ) atoi( __line__ );
} else if ( strcmp( __key__, "friso.clr_stw" ) == 0 ) {
e->clr_stw = ( ushort_t ) atoi( __line__ );
config->clr_stw = ( ushort_t ) atoi( __line__ );
} else if ( strcmp( __key__, "friso.keep_urec" ) == 0 ) {
e->keep_urec = ( uint_t ) atoi( __line__ );
config->keep_urec = ( uint_t ) atoi( __line__ );
} else if ( strcmp( __key__, "friso.spx_out" ) == 0 ) {
e->spx_out = ( ushort_t ) atoi( __line__ );
config->spx_out = ( ushort_t ) atoi( __line__ );
} else if ( strcmp( __key__, "friso.nthreshold" ) == 0 ) {
e->nthreshold = atoi( __line__ );
config->nthreshold = atoi( __line__ );
} else if ( strcmp( __key__, "friso.mode" ) == 0 ) {
e->mode = ( friso_mode_t ) atoi( __line__ );
config->mode = ( friso_mode_t ) atoi( __line__ );
}
}
@ -126,14 +117,15 @@ FRISO_API friso_t friso_new_from_ifile( fstring __ifile )
*/
if ( __hit__ != 0 )
{
e->dic = friso_dic_new();
friso_dic_load_from_ifile( e, __lexi__, e->max_len * 3 );
friso->dic = friso_dic_new();
friso_dic_load_from_ifile( friso, config, __lexi__, config->max_len * 3 );
}
fclose( __stream );
return 1;
}
return e;
return 0;
}
/* }}} */
@ -150,19 +142,40 @@ FRISO_API void friso_free( friso_t friso )
}
/* }}} */
//A macro define has replace this.
//FRISO_API void friso_set_dic( friso_t friso, friso_dic_t dic )
//{
// friso->dic = dic;
//}
//
//A macro define has replace this.
//FRISO_API void friso_set_mode( friso_t friso, friso_mode_t mode )
//{
// friso->mode = mode;
//}
/* {{{ create a new friso configuration entry and initialize
* it with default value.*/
FRISO_API friso_config_t friso_new_config( void )
{
friso_config_t cfg = (friso_config_t)
FRISO_MALLOC(sizeof(friso_config_entry));
if ( cfg == NULL ) {
___ALLOCATION_ERROR___;
}
/* {{{ create a new segment item.
//initialize the configuration entry.
friso_init_config(cfg);
return cfg;
}
/* }}} */
/* {{{ reset the specified friso config entry to the default settings.
 *
 * friso_new_config() calls this right after allocating a new entry.
 *
 * @param cfg the configuration entry to fill in; it is dereferenced
 *            without a NULL check.
 */
FRISO_API void friso_init_config( friso_config_t cfg )
{
    //segmentation mode and the length/threshold limits.
    cfg->mode       = ( friso_mode_t ) DEFAULT_SEGMENT_MODE;
    cfg->max_len    = DEFAULT_SEGMENT_LENGTH;
    cfg->mix_len    = DEFAULT_MIX_LENGTH;
    cfg->lna_len    = DEFAULT_LNA_LENGTH;
    cfg->nthreshold = DEFAULT_NTHRESHOLD;

    //switches that are on by default.
    cfg->r_name  = 1;
    cfg->add_syn = 1;

    //switches that are off by default.
    cfg->clr_stw   = 0;
    cfg->keep_urec = 0;
    cfg->spx_out   = 0;
}
/* }}} */
/* {{{ create a new segment item entry.
*/
FRISO_API friso_task_t friso_new_task()
{
@ -218,13 +231,6 @@ FRISO_API friso_hits_t friso_new_hits( void )
}
/* }}} */
//free the allocations of the given friso hits
//A macro define has replace this.
//FRISO_API void friso_free_hits( friso_hits_t hits )
//{
// FRISO_FREE( hits );
//}
/* {{{ set the text of the current segmentation.
* that means we could re-use the segment.
* also we have to reset the idx and the length of the segmentation.
@ -274,6 +280,7 @@ __STATIC_API__ uint_t read_next_word(
*/
__STATIC_API__ lex_entry_t next_simple_cjk(
friso_t friso,
friso_config_t config,
friso_task_t task )
{
uint_t t, idx = task->idx, __length__;
@ -289,7 +296,7 @@ __STATIC_API__ lex_entry_t next_simple_cjk(
*/
__length__ = e->length;
for ( t = 1; t < friso->max_len
for ( t = 1; t < config->max_len
&& ( task->bytes = read_next_word(
task, &idx, task->buffer ) ) != 0; t++ )
{
@ -317,7 +324,7 @@ __STATIC_API__ lex_entry_t next_simple_cjk(
* make sure the current token is not a stopword.
* @warning: friso.clr_stw must be enabled in the friso.ini configuration file.
*/
if ( friso->clr_stw
if ( config->clr_stw
&& friso_dic_match( friso->dic,
__LEX_STOPWORDS__, e->word ) ) {
return NULL;
@ -349,6 +356,7 @@ if ( __convert == 1 ) { \
//get the next latin word from the current position.
__STATIC_API__ lex_entry_t next_basic_latin(
friso_t friso,
friso_config_t config,
friso_task_t task )
{
int __convert = 0, t = 0;
@ -462,7 +470,7 @@ __STATIC_API__ lex_entry_t next_basic_latin(
//Try to find a english chinese mixed word.
tmp = new_string_buffer_with_string( sb->buffer );
for ( t = 0; t < friso->mix_len
for ( t = 0; t < config->mix_len
&& ( task->bytes = read_next_word(
task, &task->idx, task->buffer ) ) != 0; t++ )
{
@ -539,6 +547,7 @@ __STATIC_API__ lex_entry_t next_basic_latin(
*/
__STATIC_API__ friso_array_t get_next_match(
friso_t friso,
friso_config_t config,
friso_task_t task,
uint_t idx )
{
@ -548,12 +557,12 @@ __STATIC_API__ friso_array_t get_next_match(
//create a match dynamic array.
friso_array_t match =
new_array_list_with_opacity( friso->max_len );
new_array_list_with_opacity( config->max_len );
array_list_add( match,
friso_dic_get( friso->dic,
__LEX_CJK_WORDS__, task->buffer ) );
for ( t = 1; t < friso->max_len && ( task->bytes =
for ( t = 1; t < config->max_len && ( task->bytes =
read_next_word( task, &idx, task->buffer ) ) != 0;
t++ ) {
task->unicode = get_utf8_unicode( task->buffer );
@ -863,6 +872,7 @@ __STATIC_API__ friso_chunk_t mmseg_core_invoke( friso_array_t chunks )
*/
__STATIC_API__ lex_entry_t next_complex_cjk(
friso_t friso,
friso_config_t config,
friso_task_t task )
{
register uint_t x, y, z;
@ -872,7 +882,7 @@ __STATIC_API__ lex_entry_t next_complex_cjk(
friso_chunk_t e;
friso_array_t words, chunks;
friso_array_t smatch, tmatch,
fmatch = get_next_match( friso, task, task->idx );
fmatch = get_next_match( friso, config, task, task->idx );
/*
* here:
@ -888,7 +898,7 @@ __STATIC_API__ lex_entry_t next_complex_cjk(
* check and clear the stop words .
* @date 2013-06-13
*/
if ( friso->clr_stw &&
if ( config->clr_stw &&
friso_dic_match( friso->dic,
__LEX_STOPWORDS__, fe->word ) ) {
return NULL;
@ -914,7 +924,7 @@ __STATIC_API__ lex_entry_t next_complex_cjk(
__LEX_CJK_WORDS__, task->buffer ) )
{
//get the next matchs
smatch = get_next_match( friso, task, __idx__ );
smatch = get_next_match( friso, config, task, __idx__ );
for ( y = 0; y < smatch->length; y++ )
{
/*get the word and try the third layer match*/
@ -928,7 +938,7 @@ __STATIC_API__ lex_entry_t next_complex_cjk(
__LEX_CJK_WORDS__, task->buffer ) )
{
//get the matchs.
tmatch = get_next_match( friso, task, __idx__ );
tmatch = get_next_match( friso, config, task, __idx__ );
for ( z = 0; z < tmatch->length; z++ )
{
te = ( lex_entry_t ) array_list_get( tmatch, z );
@ -981,7 +991,7 @@ __STATIC_API__ lex_entry_t next_complex_cjk(
free_chunk( e );
//clear the stop words
if ( friso->clr_stw &&
if ( config->clr_stw &&
friso_dic_match( friso->dic,
__LEX_STOPWORDS__, fe->word ) ) {
return NULL;
@ -1015,7 +1025,7 @@ __STATIC_API__ lex_entry_t next_complex_cjk(
*/
FRISO_API friso_hits_t friso_next(
friso_t friso,
friso_mode_t _mode,
friso_config_t config,
friso_task_t task )
{
uint_t i, j, len;
@ -1091,10 +1101,10 @@ FRISO_API friso_hits_t friso_next(
}
//complex mode.
if ( _mode == __FRISO_COMPLEX_MODE__ )
lex = next_complex_cjk( friso, task );
if ( config->mode == __FRISO_COMPLEX_MODE__ )
lex = next_complex_cjk( friso, config, task );
//simple mode.
else lex = next_simple_cjk( friso, task );
else lex = next_simple_cjk( friso, config, task );
if ( lex == NULL ) continue; //find a stopwrod.
@ -1125,7 +1135,7 @@ FRISO_API friso_hits_t friso_next(
//find the next basic latin.
task->buffer[0] = task->text[task->idx++];
task->buffer[1] = '\0';
tmp = next_basic_latin(friso, task);
tmp = next_basic_latin(friso, config, task);
string_buffer_append( sb, tmp->word );
//check the CE dictionary.
@ -1178,8 +1188,8 @@ FRISO_API friso_hits_t friso_next(
memcpy(task->hits->word, lex->word, lex->length);
//check and append the synonyms words
if ( friso->add_syn && lex->syn != NULL ) {
if ( friso->spx_out == 1 )
if ( config->add_syn && lex->syn != NULL ) {
if ( config->spx_out == 1 )
hits_sphinx_output(lex);
else hits_normal_output(lex);
}
@ -1223,7 +1233,7 @@ FRISO_API friso_hits_t friso_next(
*/
if ( utf8_en_punctuation( task->unicode ) )
{
if ( friso->clr_stw
if ( config->clr_stw
&& friso_dic_match(friso->dic,
__LEX_STOPWORDS__, task->buffer) )
continue;
@ -1236,10 +1246,10 @@ FRISO_API friso_hits_t friso_next(
}
//get the next basic latin word.
lex = next_basic_latin( friso, task );
lex = next_basic_latin( friso, config, task );
//check if it is a stopword.
if ( friso->clr_stw
if ( config->clr_stw
&& friso_dic_match( friso->dic,
__LEX_STOPWORDS__, lex->word ) ) {
//free the newly created lexicon entry.
@ -1255,10 +1265,10 @@ FRISO_API friso_hits_t friso_next(
//check and add the synonyms words.
//@date 2013-10-15
if ( friso->add_syn && (
if ( config->add_syn && (
tmp = friso_dic_get( friso->dic,
__LEX_EN_WORDS__, lex->word) ) != NULL ) {
if ( friso->spx_out == 1 )
if ( config->spx_out == 1 )
hits_sphinx_output(tmp);
else hits_normal_output(tmp);
}
@ -1275,7 +1285,7 @@ FRISO_API friso_hits_t friso_next(
* @added 2013-08-31) */
else if ( utf8_cn_punctuation( task->unicode ) )
{
if ( friso->clr_stw
if ( config->clr_stw
&& friso_dic_match(friso->dic,
__LEX_STOPWORDS__, task->buffer) )
continue;
@ -1294,7 +1304,7 @@ FRISO_API friso_hits_t friso_next(
/* {{{ keep the unrecognized words?
//@date 2013-10-14 */
else if ( friso->keep_urec)
else if ( config->keep_urec)
{
memcpy(task->hits->word, task->buffer, task->bytes);
task->hits->word[task->bytes] = '\0';

View File

@ -2,8 +2,7 @@
* main interface file for friso - free soul.
* you could modify it and re-release it but never for commercial use.
*
* @author chenxin
* @email chenxin619315@gmail.com
* @author chenxin <chenxin619315@gmail.com>
*/
#ifndef _friso_h
#define _friso_h
@ -75,11 +74,13 @@ typedef enum {
__FRISO_COMPLEX_MODE__ = 2
} friso_mode_t;
/*
* Type: friso_entry
* -----------------
* This type is used to set the configuration of friso.
*/
/* friso entry: holds the dictionary only; the task settings
 * are kept in a separate configuration entry.*/
typedef struct {
    friso_dic_t dic;    //friso dictionary
} friso_entry, * friso_t;
/* task configuration entry.*/
typedef struct {
ushort_t max_len; //the max match length (4 - 7).
ushort_t r_name; //1 for open chinese name recognition 0 for close it.
@ -91,25 +92,22 @@ typedef struct {
ushort_t spx_out; //use sphinx output customize.
uint_t nthreshold; //the threshold value for a char to make up a chinese name.
friso_mode_t mode; //Complex mode or simple mode
friso_dic_t dic; //friso dictionary
} friso_entry;
typedef friso_entry * friso_t;
} friso_config_entry;
typedef friso_config_entry * friso_config_t;
/*the segmentation term entry.*/
#define __HITS_WORD_LENGTH__ 128

typedef struct {
    int offset;                         //NOTE(review): presumably the term's position in the text - confirm
    char word[__HITS_WORD_LENGTH__];    //buffer holding the term text
} friso_hits_entry, * friso_hits_t;
/*
* Type: friso_segment
* Type: friso_task_entry
* This type used to represent the current segmentation content.
* like the text to split, and the current index.
* like the text to split, the current index, the hits buffer, etc.
*/
typedef struct {
fstring text; //text to tokenize
@ -117,12 +115,10 @@ typedef struct {
uint_t length; //length of the text.
uint_t bytes; //latest word bytes in C.
uint_t unicode; //latest word unicode number.
//uint_t ce_check; //check the CN and EN mixed word if it is 1.
friso_link_t pool; //task pool.
friso_hits_t hits; //token result hits.
char buffer[7]; //word buffer. (1-6 bytes for an utf-8 word in C).
} friso_task_entry;
typedef friso_task_entry * friso_task_t;
@ -136,7 +132,8 @@ typedef friso_task_entry * friso_task_t;
FRISO_API friso_t friso_new( void );
//create a friso entry with default values from a configuration file.
FRISO_API friso_t friso_new_from_ifile( fstring );
//@return 1 on success and 0 on failure.
FRISO_API int friso_init_from_ifile( friso_t, friso_config_t, fstring );
/*
* Function: friso_free_vars;
@ -155,7 +152,9 @@ FRISO_API void friso_free( friso_t );
*/
//FRISO_API void friso_set_dic( friso_t, friso_dic_t );
#define friso_set_dic(friso, dic)\
friso->dic = dic
do {\
friso->dic = dic;\
} while (0)
/*
* Function: friso_set_mode
@ -165,7 +164,20 @@ FRISO_API void friso_free( friso_t );
*/
//FRISO_API void friso_set_mode( friso_t, friso_mode_t );
#define friso_set_mode( friso, mode )\
friso->mode = mode
do {\
friso->mode = mode;\
} while (0)
/*create a new friso configuration entry and initialize
it with the default value.*/
FRISO_API friso_config_t friso_new_config( void );
//initialize the specified friso config entry with default value.
FRISO_API void friso_init_config( friso_config_t );
/*
 * free the specified friso configuration entry.
 * implemented as a macro that expands to FRISO_FREE(cfg), in place of
 * the commented-out function prototype below.
 */
//FRISO_API void friso_free_config( friso_config_t );
#define friso_free_config(cfg) FRISO_FREE(cfg)
/*
* Function: friso_new_task;
@ -204,9 +216,10 @@ FRISO_API void friso_set_text( friso_task_t, fstring );
* --------------------------------------
* This function is used to get next word that friso segmented.
*/
FRISO_API friso_hits_t friso_next( friso_t, friso_mode_t, friso_task_t );
FRISO_API friso_hits_t friso_next( friso_t, friso_config_t, friso_task_t );
/* }}} friso main interface define :: end*/
/* {{{ lexicon interface define :: start*/
/*
@ -242,13 +255,14 @@ FRISO_API void free_lex_entry( lex_entry_t );
* This function is used to load dictionary from a given path.
* no length limit when length less than 0.
*/
FRISO_API void friso_dic_load( friso_t, friso_lex_t, fstring, uint_t );
FRISO_API void friso_dic_load( friso_t, friso_config_t,
friso_lex_t, fstring, uint_t );
/*
* load the lexicon configuration file.
* and load all the valid lexicon from the conf file.
*/
FRISO_API void friso_dic_load_from_ifile( friso_t, fstring, uint_t );
FRISO_API void friso_dic_load_from_ifile( friso_t, friso_config_t, fstring, uint_t );
/*
* Function: friso_dic_match

View File

@ -12,10 +12,14 @@
#include <stdio.h>
#include <stdlib.h>
#ifdef _WIN32
# define FRISO_API extern __declspec(dllexport)
# define __STATIC_API__ static
#else
/*platform shared library statement :: unix*/
# define FRISO_API extern
//#define FRISO_API extern __declspec(dllexport)
# define __STATIC_API__ static inline
#endif
#define ___ALLOCATION_ERROR___ \

View File

@ -236,6 +236,7 @@ __STATIC_API__ fstring indexOf( fstring __str, char delimiter )
*/
FRISO_API void friso_dic_load(
friso_t friso,
friso_config_t config,
friso_lex_t lex,
fstring lex_file,
uint_t length )
@ -311,7 +312,7 @@ FRISO_API void friso_dic_load(
* and put them in a array list if the synonyms is not NULL
*/
sywords = NULL;
if ( friso->add_syn && _syn != NULL )
if ( config->add_syn && _syn != NULL )
{
string_split_reset( &sse, ",", _sbuffer );
sywords = new_array_list_with_opacity(5);
@ -396,6 +397,7 @@ __STATIC_API__ friso_lex_t get_lexicon_type_with_constant( fstring _key )
*/
FRISO_API void friso_dic_load_from_ifile(
friso_t friso,
friso_config_t config,
fstring _path,
uint_t _limits )
{
@ -465,7 +467,7 @@ FRISO_API void friso_dic_load_from_ifile(
string_buffer_append( sb, _path );
string_buffer_append( sb, __key__ );
//printf("key=%s, type=%d\n", __key__, lex_t);
friso_dic_load( friso, lex_t, sb->buffer, _limits );
friso_dic_load( friso, config, lex_t, sb->buffer, _limits );
}
}