mirror of
https://gitee.com/dify_ai/dify.git
synced 2024-12-03 03:38:08 +08:00
improve: unify parsing of Excel files in either xls or xlsx format using Pandas (#4965)
This commit is contained in:
parent
0d20df9a51
commit
39c14ec7c1
@ -2,7 +2,6 @@
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
import xlrd
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
@ -28,61 +27,19 @@ class ExcelExtractor(BaseExtractor):
|
||||
self._autodetect_encoding = autodetect_encoding
|
||||
|
||||
def extract(self) -> list[Document]:
|
||||
""" parse excel file"""
|
||||
if self._file_path.endswith('.xls'):
|
||||
return self._extract4xls()
|
||||
elif self._file_path.endswith('.xlsx'):
|
||||
return self._extract4xlsx()
|
||||
|
||||
def _extract4xls(self) -> list[Document]:
|
||||
wb = xlrd.open_workbook(filename=self._file_path)
|
||||
""" Load from Excel file in xls or xlsx format using Pandas."""
|
||||
documents = []
|
||||
# loop over all sheets
|
||||
for sheet in wb.sheets():
|
||||
row_header = None
|
||||
for row_index, row in enumerate(sheet.get_rows(), start=1):
|
||||
if self.is_blank_row(row):
|
||||
continue
|
||||
if row_header is None:
|
||||
row_header = row
|
||||
continue
|
||||
item_arr = []
|
||||
for index, cell in enumerate(row):
|
||||
txt_value = str(cell.value)
|
||||
item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
|
||||
item_str = ",".join(item_arr)
|
||||
document = Document(page_content=item_str, metadata={'source': self._file_path})
|
||||
documents.append(document)
|
||||
return documents
|
||||
|
||||
def _extract4xlsx(self) -> list[Document]:
|
||||
"""Load from file path using Pandas."""
|
||||
data = []
|
||||
# Read each worksheet of an Excel file using Pandas
|
||||
xls = pd.ExcelFile(self._file_path)
|
||||
for sheet_name in xls.sheet_names:
|
||||
df = pd.read_excel(xls, sheet_name=sheet_name)
|
||||
excel_file = pd.ExcelFile(self._file_path)
|
||||
for sheet_name in excel_file.sheet_names:
|
||||
df: pd.DataFrame = excel_file.parse(sheet_name=sheet_name)
|
||||
|
||||
# filter out rows with all NaN values
|
||||
df.dropna(how='all', inplace=True)
|
||||
|
||||
# transform each row into a Document
|
||||
for _, row in df.iterrows():
|
||||
item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
|
||||
document = Document(page_content=item, metadata={'source': self._file_path})
|
||||
data.append(document)
|
||||
return data
|
||||
documents += [Document(page_content=';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)),
|
||||
metadata={'source': self._file_path},
|
||||
) for _, row in df.iterrows()]
|
||||
|
||||
@staticmethod
|
||||
def is_blank_row(row):
|
||||
"""
|
||||
|
||||
Determine whether the specified row is blank.
|
||||
:param row: row object.
|
||||
:return: Returns True if the row is blank, False otherwise.
|
||||
"""
|
||||
# Iterates through the cells and returns False if a non-empty cell is found
|
||||
for cell in row:
|
||||
if cell.value is not None and cell.value != '':
|
||||
return False
|
||||
return True
|
||||
return documents
|
||||
|
Loading…
Reference in New Issue
Block a user