Parse base64 eml file (#1796)

Co-authored-by: luowei <redacted>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
Charlie.Wei 2023-12-21 13:18:58 +08:00 committed by GitHub
parent 7083a05a25
commit 64642fabc4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 10 deletions

View File

@ -1,9 +1,8 @@
import logging
import re
from typing import Optional, List, Tuple, cast
import base64
from typing import List
from bs4 import BeautifulSoup
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document
logger = logging.getLogger(__name__)
@ -11,8 +10,6 @@ logger = logging.getLogger(__name__)
class UnstructuredEmailLoader(BaseLoader):
"""Load msg files.
Args:
file_path: Path to the file to load.
"""
@ -26,16 +23,28 @@ class UnstructuredEmailLoader(BaseLoader):
self._file_path = file_path
self._api_url = api_url
def load(self) -> List[Document]:
    """Partition the email file and return its content as chunked documents.

    The file at ``self._file_path`` is partitioned with unstructured's
    ``partition_email``. Every element whose text is a base64 payload that
    decodes to UTF-8 has its text replaced by the plain text extracted from
    the decoded HTML; any other element is left untouched. The elements are
    then grouped with ``chunk_by_title`` (``max_characters=2000``).

    Returns:
        One ``Document`` per chunk, with the stripped chunk text as
        ``page_content``.
    """
    from unstructured.partition.email import partition_email
    elements = partition_email(filename=self._file_path, api_url=self._api_url)

    for element in elements:
        # Base64 bodies may be wrapped across lines; drop all whitespace so
        # the length used for padding counts payload characters only.
        payload = ''.join((element.text or '').split())
        if not payload:
            continue
        # Pad to a multiple of four; ``-len % 4`` is 0 when already aligned.
        payload += '=' * (-len(payload) % 4)
        # Decoding is deliberately best-effort and handled per element, so
        # one undecodable element does not stop the ones after it.
        # ``validate=True`` rejects text containing characters outside the
        # base64 alphabet instead of silently discarding them, which keeps
        # ordinary prose from being "decoded" into garbage.
        # noinspection PyBroadException
        try:
            html = base64.b64decode(payload, validate=True).decode('utf-8')
            element.text = BeautifulSoup(html, 'html.parser').get_text()
        except Exception:
            logger.debug('Email element is not base64-encoded text; keeping it as is',
                         exc_info=True)

    from unstructured.chunking.title import chunk_by_title
    chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
    return [Document(page_content=chunk.text.strip()) for chunk in chunks]

View File

@ -55,4 +55,6 @@ pymilvus==2.3.0
qdrant-client==1.6.4
cohere~=4.32
unstructured~=0.10.27
unstructured[docx,pptx]~=0.10.27
unstructured[docx,pptx]~=0.10.27
bs4~=0.0.1
markdown~=3.5.1