[verify] sst2loader use spacy tokenizer

2024-12-01 11:48:09 +08:00 · 2019-07-08 13:00:53 +08:00 · 2019-07-08 13:00:53 +08:00 · d8bd40daf0
commit d8bd40daf0
parent 4687b378bb
1 changed files with 3 additions and 1 deletions
--- a/reproduction/text_classification/data/sstLoader.py
+++ b/reproduction/text_classification/data/sstLoader.py
@ -7,6 +7,7 @@ from fastNLP import Instance
 from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
 import csv
 from typing import Union, Dict
+from reproduction.utils import check_dataloader_paths, get_tokenizer

 class SSTLoader(DataSetLoader):
    URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
@ -104,6 +105,7 @@ class sst2Loader(DataSetLoader):
    '''
    def __init__(self):
        super(sst2Loader, self).__init__()
+        self.tokenizer = get_tokenizer()

    def _load(self, path: str) -> DataSet:
        ds = DataSet()
@ -114,7 +116,7 @@ class sst2Loader(DataSetLoader):
            if idx<=skip_row:
                continue
            target = row[1]
-            words = row[0].split()
+            words=self.tokenizer(words)
            ds.append(Instance(words=words,target=target))
            all_count+=1
        print("all count:", all_count)