mirror of
https://gitee.com/fastnlp/fastNLP.git
synced 2024-12-02 20:27:35 +08:00
1.增加sequence labeling任务的数据说明; 2.增加对CWSPipe的引用
This commit is contained in:
parent
be77533c38
commit
584a92c64c
@ -38,6 +38,7 @@ __all__ = [
|
||||
'JsonLoader',
|
||||
|
||||
'CWSLoader',
|
||||
"CWSPipe",
|
||||
|
||||
'MNLILoader',
|
||||
"QuoraLoader",
|
||||
|
@ -10,6 +10,8 @@ Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``proce
|
||||
__all__ = [
|
||||
"Pipe",
|
||||
|
||||
"CWSPipe",
|
||||
|
||||
"YelpFullPipe",
|
||||
"YelpPolarityPipe",
|
||||
"SSTPipe",
|
||||
@ -43,3 +45,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe
|
||||
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe
|
||||
from .pipe import Pipe
|
||||
from .conll import Conll2003Pipe
|
||||
from .cws import CWSPipe
|
||||
|
30
reproduction/seqence_labelling/chinese_ner/readme.md
Normal file
30
reproduction/seqence_labelling/chinese_ner/readme.md
Normal file
@ -0,0 +1,30 @@
|
||||
使用以下中文NERPipe自动下载的统计数据
|
||||
|
||||
| MsraNERPipe | # of sents | # of tokens |
|
||||
| ----------- | ---------- | ----------- |
|
||||
| train | 41747 | 1954374 |
|
||||
| dev | 4617 | 215505 |
|
||||
| test | 4365 | 172601 |
|
||||
| total | 50729 | 2342480 |
|
||||
这里报道的统计数据,与[https://arxiv.org/pdf/1805.02023.pdf](https://arxiv.org/pdf/1805.02023.pdf)报道的一致
|
||||
|
||||
|
||||
|
||||
| WeiboNERPipe | # of sents | # of tokens |
|
||||
| ------------ | ---------- | ----------- |
|
||||
| train | 1350 | 73778 |
|
||||
| dev | 270 | 14509 |
|
||||
| test | 270 | 14842 |
|
||||
| total | 1890 | 103129 |
|
||||
这里报道的统计数据与[https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf](https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf)一致
|
||||
|
||||
|
||||
|
||||
|
||||
| PeopleDailyPipe | # of sents | # of tokens |
|
||||
| --------------- | ---------- | ----------- |
|
||||
| train | 50658 | 2169879 |
|
||||
| dev | 4631 | 172601 |
|
||||
| test | 68 | 2270 |
|
||||
| total | 55357 | 2344750 |
|
||||
这里使用的数据与[https://arxiv.org/pdf/1906.08101.pdf](https://arxiv.org/pdf/1906.08101.pdf)的数据是一致的
|
32
reproduction/seqence_labelling/cws/readme.md
Normal file
32
reproduction/seqence_labelling/cws/readme.md
Normal file
@ -0,0 +1,32 @@
|
||||
四个数据集的统计信息,最原始的数据可以从[http://sighan.cs.uchicago.edu/bakeoff2005/](http://sighan.cs.uchicago.edu/bakeoff2005/)下载。
|
||||
|
||||
| pku | # of sents | # of tokens |
|
||||
| ----- | ---------- | ----------- |
|
||||
| train | 17173 | 1650222 |
|
||||
| dev | 1881 | 176226 |
|
||||
| test | 1944 | 172733 |
|
||||
| total | 20998 | 1999181 |
|
||||
|
||||
|
||||
| cityu | # of sents | # of tokens |
|
||||
| ----- | ---------- | ----------- |
|
||||
| train | 47696 | 2164907 |
|
||||
| dev | 5323 | 238447 |
|
||||
| test | 1492 | 67690 |
|
||||
| total | 54511 | 2471044 |
|
||||
|
||||
|
||||
| msra | # of sents | # of tokens |
|
||||
| ----- | ---------- | ----------- |
|
||||
| train | 78242 | 3644550 |
|
||||
| dev | 8676 | 405919 |
|
||||
| test | 3985 | 184355 |
|
||||
| total | 90903 | 4234824 |
|
||||
|
||||
|
||||
| as | # of sents | # of tokens |
|
||||
| ----- | ---------- | ----------- |
|
||||
| train | 638273 | 7536586 |
|
||||
| dev | 70680 | 831464 |
|
||||
| test | 14429 | 197681 |
|
||||
| total | 723382 | 8565731 |
|
@ -1,17 +0,0 @@
|
||||
|
||||
|
||||
import unittest
|
||||
from ..data.CWSDataLoader import SigHanLoader
|
||||
from fastNLP.core.vocabulary import VocabularyOption
|
||||
|
||||
|
||||
class TestCWSDataLoader(unittest.TestCase):
|
||||
def test_case1(self):
|
||||
cws_loader = SigHanLoader(target_type='bmes')
|
||||
data = cws_loader.process('pku_demo.txt')
|
||||
print(data.datasets)
|
||||
|
||||
def test_calse2(self):
|
||||
cws_loader = SigHanLoader(target_type='bmes')
|
||||
data = cws_loader.process('pku_demo.txt', bigram_vocab_opt=VocabularyOption())
|
||||
print(data.datasets)
|
Loading…
Reference in New Issue
Block a user