1.增加sequence labeling任务的数据说明; 2.增加对CWSPipe的引用

2024-12-02 20:27:35 +08:00 · 2019-08-26 01:33:17 +08:00 · 2019-08-26 01:33:17 +08:00 · 584a92c64c
commit 584a92c64c
parent be77533c38
6 changed files with 66 additions and 17 deletions
--- a/fastNLP/io/__init__.py
+++ b/fastNLP/io/__init__.py
@@ -38,6 +38,7 @@ __all__ = [
    'JsonLoader',

    'CWSLoader',
+    "CWSPipe",

    'MNLILoader',
    "QuoraLoader",
--- a/fastNLP/io/pipe/__init__.py
+++ b/fastNLP/io/pipe/__init__.py
@@ -10,6 +10,8 @@ Pipe用于处理通过 Loader 读取的数据，所有的 Pipe 都包含 ``proce
 __all__ = [
    "Pipe",

+    "CWSPipe",
+
    "YelpFullPipe",
    "YelpPolarityPipe",
    "SSTPipe",
@@ -43,3 +45,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe
    MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe
 from .pipe import Pipe
 from .conll import Conll2003Pipe
+from .cws import CWSPipe
--- a/reproduction/seqence_labelling/chinese_ner/readme.md
+++ b/reproduction/seqence_labelling/chinese_ner/readme.md
@@ -0,0 +1,30 @@
+以下是各中文NER Pipe自动下载的数据的统计信息
+
+| MsraNERPipe | # of sents | # of tokens |
+| ----------- | ---------- | ----------- |
+| train       | 41747      | 1954374     |
+| dev         | 4617       | 215505      |
+| test        | 4365       | 172601      |
+| total       | 50729      | 2342480     |
+这里报道的统计数据，与[https://arxiv.org/pdf/1805.02023.pdf](https://arxiv.org/pdf/1805.02023.pdf)报道的一致
+
+
+
+| WeiboNERPipe | # of sents | # of tokens |
+| ------------ | ---------- | ----------- |
+| train        | 1350       | 73778       |
+| dev          | 270        | 14509       |
+| test         | 270        | 14842       |
+| total        | 1890       | 103129      |
+这里报道的统计数据与[https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf](https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf)一致
+
+
+
+
+| PeopleDailyPipe | # of sents | # of tokens |
+| --------------- | ---------- | ----------- |
+| train           | 50658      | 2169879     |
+| dev             | 4631       | 172601      |
+| test            | 68         | 2270        |
+| total           | 55357      | 2344750     |
+这里使用的数据与[https://arxiv.org/pdf/1906.08101.pdf](https://arxiv.org/pdf/1906.08101.pdf)的数据是一致的
--- a/reproduction/seqence_labelling/cws/readme.md
+++ b/reproduction/seqence_labelling/cws/readme.md
@@ -0,0 +1,32 @@
+四个数据集的统计信息，最原始的数据可以从[http://sighan.cs.uchicago.edu/bakeoff2005/](http://sighan.cs.uchicago.edu/bakeoff2005/)下载。
+
+| pku   | # of sents | # of tokens |
+| ----- | ---------- | ----------- |
+| train | 17173      | 1650222     |
+| dev   | 1881       | 176226      |
+| test  | 1944       | 172733      |
+| total | 20998      | 1999181     |
+
+
+| cityu | # of sents | # of tokens |
+| ----- | ---------- | ----------- |
+| train | 47696      | 2164907     |
+| dev   | 5323       | 238447      |
+| test  | 1492       | 67690       |
+| total | 54511      | 2471044     |
+
+
+| msra  | # of sents | # of tokens |
+| ----- | ---------- | ----------- |
+| train | 78242      | 3644550     |
+| dev   | 8676       | 405919      |
+| test  | 3985       | 184355      |
+| total | 90903      | 4234824     |
+
+
+| as    | # of sents | # of tokens |
+| ----- | ---------- | ----------- |
+| train | 638273     | 7536586     |
+| dev   | 70680      | 831464      |
+| test  | 14429      | 197681      |
+| total | 723382     | 8565731     |
--- a/reproduction/seqence_labelling/cws/test/__init__.py
+++ b/reproduction/seqence_labelling/cws/test/__init__.py
--- a/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py
+++ b/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py
@@ -1,17 +0,0 @@
-
-
-import unittest
-from ..data.CWSDataLoader import SigHanLoader
-from fastNLP.core.vocabulary import VocabularyOption
-
-
-class TestCWSDataLoader(unittest.TestCase):
-    def test_case1(self):
-        cws_loader = SigHanLoader(target_type='bmes')
-        data = cws_loader.process('pku_demo.txt')
-        print(data.datasets)
-
-    def test_calse2(self):
-        cws_loader = SigHanLoader(target_type='bmes')
-        data = cws_loader.process('pku_demo.txt', bigram_vocab_opt=VocabularyOption())
-        print(data.datasets)