extract docx filter comment element (#7092)

2024-12-03 19:57:37 +08:00 · 2024-08-08 16:53:29 +08:00 · 2024-08-08 16:53:29 +08:00 · 12095f8cd6
commit 12095f8cd6
parent 925f0d2e09
1 changed file with 10 additions and 9 deletions
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -228,7 +228,7 @@ class WordExtractor(BaseExtractor):
        def parse_paragraph(paragraph):
            paragraph_content = []
            for run in paragraph.runs:
-                if run.element.tag.endswith('r'):
+                if hasattr(run.element, 'tag') and isinstance(element.tag, str) and run.element.tag.endswith('r'):
                    drawing_elements = run.element.findall(
                        './/{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
                    for drawing in drawing_elements:
@@ -248,13 +248,14 @@ class WordExtractor(BaseExtractor):
        paragraphs = doc.paragraphs.copy()
        tables = doc.tables.copy()
        for element in doc.element.body:
-            if element.tag.endswith('p'):  # paragraph
-                para = paragraphs.pop(0)
-                parsed_paragraph = parse_paragraph(para)
-                if parsed_paragraph:
-                    content.append(parsed_paragraph)
-            elif element.tag.endswith('tbl'):  # table
-                table = tables.pop(0)
-                content.append(self._table_to_markdown(table,image_map))
+            if hasattr(element, 'tag'):
+                if isinstance(element.tag, str) and element.tag.endswith('p'):  # paragraph
+                    para = paragraphs.pop(0)
+                    parsed_paragraph = parse_paragraph(para)
+                    if parsed_paragraph:
+                        content.append(parsed_paragraph)
+                elif isinstance(element.tag, str) and element.tag.endswith('tbl'):  # table
+                    table = tables.pop(0)
+                    content.append(self._table_to_markdown(table,image_map))
        return '\n'.join(content)