mirror of
https://gitee.com/dify_ai/dify.git
synced 2024-11-30 10:18:13 +08:00
Parse base64 eml file (#1796)
Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM> Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
parent
7083a05a25
commit
64642fabc4
@ -1,9 +1,8 @@
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional, List, Tuple, cast
|
||||
|
||||
import base64
|
||||
from typing import List
|
||||
from bs4 import BeautifulSoup
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.helpers import detect_file_encodings
|
||||
from langchain.schema import Document
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -11,8 +10,6 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class UnstructuredEmailLoader(BaseLoader):
|
||||
"""Load msg files.
|
||||
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to load.
|
||||
"""
|
||||
@ -26,16 +23,28 @@ class UnstructuredEmailLoader(BaseLoader):
|
||||
self._file_path = file_path
|
||||
self._api_url = api_url
|
||||
|
||||
|
||||
def load(self) -> List[Document]:
    """Partition the email file and return its text as chunked documents.

    Every element produced by ``partition_email`` is tentatively treated as
    base64-encoded UTF-8 HTML: when it decodes cleanly, the element text is
    replaced by the plain text extracted from that HTML. Elements that do
    not decode are left untouched.

    Returns:
        One ``Document`` per chunk produced by ``chunk_by_title``, with
        surrounding whitespace stripped from the chunk text.
    """
    from unstructured.chunking.title import chunk_by_title
    from unstructured.partition.email import partition_email

    elements = partition_email(filename=self._file_path, api_url=self._api_url)

    for element in elements:
        # Remove all whitespace first so that line-wrapped base64 does not
        # skew the padding computation below.
        encoded = ''.join(element.text.split())
        if not encoded:
            continue

        # Pad up to a multiple of four; adds nothing when already aligned.
        encoded += '=' * (-len(encoded) % 4)

        # Decode each element independently so one plain-text element does
        # not prevent later base64 elements from being decoded.
        try:
            html = base64.b64decode(encoded).decode('utf-8')
        except ValueError:
            # binascii.Error and UnicodeDecodeError are both ValueError
            # subclasses: the element is not base64-encoded UTF-8 text.
            logger.debug('Email element is not base64-encoded UTF-8; keeping original text')
            continue

        element.text = BeautifulSoup(html, 'html.parser').get_text()

    chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
    return [Document(page_content=chunk.text.strip()) for chunk in chunks]
|
||||
|
@ -55,4 +55,6 @@ pymilvus==2.3.0
|
||||
qdrant-client==1.6.4
|
||||
cohere~=4.32
|
||||
unstructured~=0.10.27
|
||||
unstructured[docx,pptx]~=0.10.27
|
||||
unstructured[docx,pptx]~=0.10.27
|
||||
bs4~=0.0.1
|
||||
markdown~=3.5.1
|
Loading…
Reference in New Issue
Block a user