1.增加sequence labeling任务的数据说明; 2.增加对CWSPipe的引用

This commit is contained in:
yh_cc 2019-08-26 01:33:17 +08:00
parent be77533c38
commit 584a92c64c
6 changed files with 66 additions and 17 deletions

View File

@ -38,6 +38,7 @@ __all__ = [
'JsonLoader',
'CWSLoader',
"CWSPipe",
'MNLILoader',
"QuoraLoader",

View File

@ -10,6 +10,8 @@ Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``proce
__all__ = [
"Pipe",
"CWSPipe",
"YelpFullPipe",
"YelpPolarityPipe",
"SSTPipe",
@ -43,3 +45,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe
from .pipe import Pipe
from .conll import Conll2003Pipe
from .cws import CWSPipe

View File

@ -0,0 +1,30 @@
使用以下中文NERPipe自动下载的统计数据
| MsraNERPipe | # of sents | # of tokens |
| ----------- | ---------- | ----------- |
| train | 41747 | 1954374 |
| dev | 4617 | 215505 |
| test | 4365 | 172601 |
| total | 50729 | 2342480 |
这里报道的统计数据,与 [https://arxiv.org/pdf/1805.02023.pdf](https://arxiv.org/pdf/1805.02023.pdf) 报道的一致。
| WeiboNERPipe | # of sents | # of tokens |
| ------------ | ---------- | ----------- |
| train | 1350 | 73778 |
| dev | 270 | 14509 |
| test | 270 | 14842 |
| total | 1890 | 103129 |
这里报道的统计数据与 [https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf](https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf) 一致。
| PeopleDailyPipe | # of sents | # of tokens |
| --------------- | ---------- | ----------- |
| train | 50658 | 2169879 |
| dev | 4631 | 172601 |
| test | 68 | 2270 |
| total | 55357 | 2344750 |
这里使用的数据与 [https://arxiv.org/pdf/1906.08101.pdf](https://arxiv.org/pdf/1906.08101.pdf) 的数据是一致的。

View File

@ -0,0 +1,32 @@
四个数据集的统计信息,最原始的数据可以从 [http://sighan.cs.uchicago.edu/bakeoff2005/](http://sighan.cs.uchicago.edu/bakeoff2005/) 下载。
| pku | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 17173 | 1650222 |
| dev | 1881 | 176226 |
| test | 1944 | 172733 |
| total | 20998 | 1999181 |
| cityu | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 47696 | 2164907 |
| dev | 5323 | 238447 |
| test | 1492 | 67690 |
| total | 54511 | 2471044 |
| msra | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 78242 | 3644550 |
| dev | 8676 | 405919 |
| test | 3985 | 184355 |
| total | 90903 | 4234824 |
| as | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 638273 | 7536586 |
| dev | 70680 | 831464 |
| test | 14429 | 197681 |
| total | 723382 | 8565731 |

View File

@ -1,17 +0,0 @@
import unittest
from ..data.CWSDataLoader import SigHanLoader
from fastNLP.core.vocabulary import VocabularyOption
class TestCWSDataLoader(unittest.TestCase):
    """Smoke tests that call ``SigHanLoader.process`` on ``'pku_demo.txt'``.

    Neither test makes an assertion: each builds a loader with
    ``target_type='bmes'``, processes the demo file, and prints the
    ``datasets`` attribute of the returned object. A test therefore fails
    only if loading or processing raises.
    """

    def test_case1(self):
        # Process the demo file with only the path argument supplied.
        loader = SigHanLoader(target_type='bmes')
        bundle = loader.process('pku_demo.txt')
        print(bundle.datasets)

    def test_calse2(self):
        # Same as test_case1, but additionally passes a default-constructed
        # VocabularyOption as ``bigram_vocab_opt``.
        # NOTE(review): the method name looks like a typo of ``test_case2``;
        # it is kept unchanged so the test ID stays the same.
        loader = SigHanLoader(target_type='bmes')
        bundle = loader.process(
            'pku_demo.txt',
            bigram_vocab_opt=VocabularyOption(),
        )
        print(bundle.datasets)