From cb7a608d75ac630ed2756ba95ef2b5f5ef95c03a Mon Sep 17 00:00:00 2001
From: Jyong <76649700+JohnJyong@users.noreply.github.com>
Date: Mon, 15 Jan 2024 16:52:18 +0800
Subject: [PATCH] ascii filter Unicode U+FFFE (#2038)

Co-authored-by: jyong
---
 api/core/indexing_runner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py
index 28a99d2f7..22a60ae56 100644
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -531,7 +531,9 @@ class IndexingRunner:
     def filter_string(self, text):
         text = re.sub(r'<\|', '<', text)
         text = re.sub(r'\|>', '>', text)
-        text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]', '', text)
+        text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
+        # Unicode U+FFFE (noncharacter): match the code point itself, not its UTF-8 bytes EF BF BE
+        text = re.sub(u'\uFFFE', '', text)
         return text
 
     def _get_splitter(self, processing_rule: DatasetProcessRule,