mirror of
https://gitee.com/dify_ai/dify.git
synced 2024-12-02 03:07:59 +08:00
ascii filter Unicode U+FFFE (#2038)
Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
parent
bdb0d77227
commit
cb7a608d75
@ -531,7 +531,9 @@ class IndexingRunner:
|
|||||||
def filter_string(self, text):
|
def filter_string(self, text):
|
||||||
text = re.sub(r'<\|', '<', text)
|
text = re.sub(r'<\|', '<', text)
|
||||||
text = re.sub(r'\|>', '>', text)
|
text = re.sub(r'\|>', '>', text)
|
||||||
text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]', '', text)
|
text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\xEF\xBF\xBE]', '', text)
|
||||||
|
# Strip U+FFFE, a Unicode noncharacter (the byte-swapped form of the BOM U+FEFF)
|
||||||
|
text = re.sub(u'\uFFFE', '', text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _get_splitter(self, processing_rule: DatasetProcessRule,
|
def _get_splitter(self, processing_rule: DatasetProcessRule,
|
||||||
|
Loading…
Reference in New Issue
Block a user