"""Markdown parser.

Contains parser for md files.

"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from llama_index.readers.file.base_parser import BaseParser


class MarkdownParser(BaseParser):
    """Markdown parser.

    Extract text from markdown files.
    Returns dictionary with keys as headers and values as the text between headers.
    """

    def __init__(
        self,
        *args: Any,
        remove_hyperlinks: bool = True,
        remove_images: bool = True,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Args:
            remove_hyperlinks: if True, replace ``[text](url)`` links with ``text``.
            remove_images: if True, strip ``![[image]]`` embeds entirely.
        """
        super().__init__(*args, **kwargs)
        self._remove_hyperlinks = remove_hyperlinks
        self._remove_images = remove_images

    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
        """Convert markdown text to a list of ``(header, text)`` tuples.

        Text that appears before the first header is collected under a
        ``None`` key; each subsequent header starts a new tuple holding
        the lines up to the next header.
        """
        markdown_tups: List[Tuple[Optional[str], str]] = []
        lines = markdown_text.split("\n")

        current_header = None
        current_text = ""

        for line in lines:
            # ATX-style header: one or more '#' followed by whitespace.
            header_match = re.match(r"^#+\s", line)
            if header_match:
                if current_header is not None:
                    markdown_tups.append((current_header, current_text))

                current_header = line
                current_text = ""
            else:
                current_text += line + "\n"
        # Flush the final (header, text) pair after the loop.
        markdown_tups.append((current_header, current_text))

        if current_header is not None:
            # pass linting, assert keys are defined
            markdown_tups = [
                # Fix: strip only the *leading* '#' markers so '#'
                # characters inside the header text (e.g. "C#") survive;
                # also drop simple <...> tags from the section body.
                (
                    re.sub(r"^#+", "", cast(str, key)).strip(),
                    re.sub(r"<.*?>", "", value),
                )
                for key, value in markdown_tups
            ]
        else:
            # No headers at all: collapse the single body onto one line.
            markdown_tups = [
                (key, re.sub("\n", "", value)) for key, value in markdown_tups
            ]

        return markdown_tups

    def remove_images(self, content: str) -> str:
        """Remove Obsidian-style image embeds (``![[...]]``) from *content*.

        The quantifier is non-greedy so multiple embeds on one line are
        removed individually instead of deleting everything between the
        first ``![[`` and the last ``]]``.

        NOTE(review): standard markdown images ``![alt](url)`` are not
        handled here; ``remove_hyperlinks`` (which runs first in
        ``parse_tups``) reduces them to ``!alt``.
        """
        pattern = r"!\[\[(.*?)\]\]"
        content = re.sub(pattern, "", content)
        return content

    def remove_hyperlinks(self, content: str) -> str:
        """Replace markdown hyperlinks (``[text](url)``) with their text."""
        pattern = r"\[(.*?)\]\((.*?)\)"
        content = re.sub(pattern, r"\1", content)
        return content

    def _init_parser(self) -> Dict:
        """Initialize the parser with the config."""
        return {}

    def parse_tups(
        self, filepath: Path, errors: str = "ignore"
    ) -> List[Tuple[Optional[str], str]]:
        """Parse file into ``(header, text)`` tuples.

        Args:
            filepath: path of the markdown file to read.
            errors: decode-error handling, forwarded to ``open``.
        """
        # Fix: forward ``errors`` to open() — it was previously accepted
        # but silently ignored.
        with open(filepath, "r", encoding="utf-8", errors=errors) as f:
            content = f.read()
        if self._remove_hyperlinks:
            content = self.remove_hyperlinks(content)
        if self._remove_images:
            content = self.remove_images(content)
        markdown_tups = self.markdown_to_tups(content)
        return markdown_tups

    def parse_file(
        self, filepath: Path, errors: str = "ignore"
    ) -> Union[str, List[str]]:
        """Parse file into a list of strings, one per section."""
        tups = self.parse_tups(filepath, errors=errors)
        results = []
        # TODO: don't include headers right now
        for header, value in tups:
            if header is None:
                results.append(value)
            else:
                results.append(f"\n\n{header}\n{value}")
        return results