From 82033af097d772167d666a0272d8d604821c5776 Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:09:22 +0800 Subject: [PATCH] clean un-allowed special characters when doing indexing estimate (#10153) --- api/core/indexing_runner.py | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 8df26172b..fb9fe8f21 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -17,6 +17,7 @@ from core.errors.error import ProviderTokenNotInitError from core.llm_generator.llm_generator import LLMGenerator from core.model_manager import ModelInstance, ModelManager from core.model_runtime.entities.model_entities import ModelType +from core.rag.cleaner.clean_processor import CleanProcessor from core.rag.datasource.keyword.keyword_factory import Keyword from core.rag.docstore.dataset_docstore import DatasetDocumentStore from core.rag.extractor.entity.extract_setting import ExtractSetting @@ -597,26 +598,9 @@ class IndexingRunner: rules = DatasetProcessRule.AUTOMATIC_RULES else: rules = json.loads(processing_rule.rules) if processing_rule.rules else {} + document_text = CleanProcessor.clean(text, rules) - if "pre_processing_rules" in rules: - pre_processing_rules = rules["pre_processing_rules"] - for pre_processing_rule in pre_processing_rules: - if pre_processing_rule["id"] == "remove_extra_spaces" and pre_processing_rule["enabled"] is True: - # Remove extra spaces - pattern = r"\n{3,}" - text = re.sub(pattern, "\n\n", text) - pattern = r"[\t\f\r\x20\u00a0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]{2,}" - text = re.sub(pattern, " ", text) - elif pre_processing_rule["id"] == "remove_urls_emails" and pre_processing_rule["enabled"] is True: - # Remove email - pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)" - text = re.sub(pattern, "", text) - - # Remove URL - pattern = r"https?://[^\s]+" - text = 
re.sub(pattern, "", text) - - return text + return document_text @staticmethod def format_split_text(text):