mirror of
https://gitee.com/fastnlp/fastNLP.git
synced 2024-12-03 04:37:37 +08:00
新增CTBLoader
This commit is contained in:
parent
e933df4227
commit
338deec103
@ -212,7 +212,7 @@ class OntoNotesNERLoader(ConllLoader):
|
||||
|
||||
返回的DataSet的内容为
|
||||
|
||||
.. csv-table:: 下面是使用OntoNoteNERLoader读取的DataSet所具备的结构, target列是BIO编码
|
||||
.. csv-table::
|
||||
:header: "raw_words", "target"
|
||||
|
||||
"[Nadim, Ladki]", "[B-PER, I-PER]"
|
||||
@ -276,11 +276,45 @@ class OntoNotesNERLoader(ConllLoader):
|
||||
|
||||
|
||||
class CTBLoader(Loader):
    """
    Loader for dependency-parsing data stored in a column-based format.

    The data to load should have the following format, where the second column
    is the word, the fourth column is the POS tag, the seventh column is the
    head in the dependency tree and the eighth column is the dependency label.

    Example::

        1       印度    _       NR      NR      _       3       nn      _       _
        2       海军    _       NN      NN      _       3       nn      _       _
        3       参谋长  _       NN      NN      _       5       nsubjpass       _       _
        4       被      _       SB      SB      _       5       pass    _       _
        5       解职    _       VV      VV      _       0       root    _       _

        1       新华社  _       NR      NR      _       7       dep     _       _
        2       新德里  _       NR      NR      _       7       dep     _       _
        3       12月    _       NT      NT      _       7       dep     _       _
        ...

    After loading, the DataSet has the following structure

    .. csv-table::
        :header: "raw_words", "pos", "dep_head", "dep_label"

        "[印度, 海军, ...]", "[NR, NN, ...]", "[3, 3, ...]", "[nn, nn, ...]"
        "[新华社, 新德里, ...]", "[NR, NR, NT, ...]", "[7, 7, 7, ...]", "[dep, dep, dep, ...]"
        "[...]", "[...]", "[...]", "[...]"

    """

    def __init__(self):
        super().__init__()
        # Field names and the zero-based column each one is read from:
        # column 1 -> word, 3 -> POS tag, 6 -> dependency head, 7 -> dependency label.
        headers = [
            'raw_words', 'pos', 'dep_head', 'dep_label',
        ]
        indexes = [
            1, 3, 6, 7,
        ]
        # The actual file parsing is delegated to a ConllLoader configured with
        # the column selection above.
        self.loader = ConllLoader(headers=headers, indexes=indexes)

    def _load(self, path: str):
        """
        Load the file at ``path`` through the wrapped ConllLoader.

        :param str path: path of the data file to read.
        :return: whatever the wrapped ``ConllLoader._load`` returns for this
            path (per the class docstring, a DataSet with the fields
            ``raw_words``, ``pos``, ``dep_head`` and ``dep_label``).
        """
        dataset = self.loader._load(path)
        return dataset
|
||||
|
||||
|
||||
class CNNERLoader(Loader):
|
||||
@ -339,7 +373,7 @@ class MsraNERLoader(CNNERLoader):
|
||||
|
||||
读取后的DataSet包含以下的field
|
||||
|
||||
.. csv-table:: target列是基于BIO的编码方式
|
||||
.. csv-table::
|
||||
:header: "raw_chars", "target"
|
||||
|
||||
"[我, 们, 变...]", "[O, O, ...]"
|
||||
|
Loading…
Reference in New Issue
Block a user