diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py
index 2a77c1afc..c3f0b75cf 100644
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -228,7 +228,7 @@ class WordExtractor(BaseExtractor):
         def parse_paragraph(paragraph):
             paragraph_content = []
             for run in paragraph.runs:
-                if run.element.tag.endswith('r'):
+                if hasattr(run.element, 'tag') and isinstance(run.element.tag, str) and run.element.tag.endswith('r'):
                     drawing_elements = run.element.findall(
                         './/{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
                     for drawing in drawing_elements:
@@ -248,13 +248,14 @@ class WordExtractor(BaseExtractor):
         paragraphs = doc.paragraphs.copy()
         tables = doc.tables.copy()
         for element in doc.element.body:
-            if element.tag.endswith('p'):  # paragraph
-                para = paragraphs.pop(0)
-                parsed_paragraph = parse_paragraph(para)
-                if parsed_paragraph:
-                    content.append(parsed_paragraph)
-            elif element.tag.endswith('tbl'):  # table
-                table = tables.pop(0)
-                content.append(self._table_to_markdown(table,image_map))
+            if hasattr(element, 'tag'):
+                if isinstance(element.tag, str) and element.tag.endswith('p'):  # paragraph
+                    para = paragraphs.pop(0)
+                    parsed_paragraph = parse_paragraph(para)
+                    if parsed_paragraph:
+                        content.append(parsed_paragraph)
+                elif isinstance(element.tag, str) and element.tag.endswith('tbl'):  # table
+                    table = tables.pop(0)
+                    content.append(self._table_to_markdown(table,image_map))
 
         return '\n'.join(content)