Skip to content

Commit 35b662a

Browse files
authored
perf: Optimize document extraction for complex table files (#3116)
1 parent a4faf52 commit 35b662a

File tree

1 file changed

+20
-31
lines changed

1 file changed

+20
-31
lines changed

apps/common/handle/impl/table/xlsx_parse_table_handle.py

Lines changed: 20 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,36 +19,24 @@ def support(self, file, get_buffer):
1919

2020
def fill_merged_cells(self, sheet, image_dict):
    """Return the sheet as a list of rows with merged cells filled in.

    Every row yielded by ``sheet.iter_rows`` (including the first one, which
    callers treat as the header row) becomes a list of cell values.  A value
    that is a key of ``image_dict`` is replaced by a markdown image link
    built from the mapped object's ``id``.  Afterwards each merged range is
    filled with the value of its top-left cell, so every cell inside the
    range carries the same value.

    :param sheet: worksheet-like object exposing ``iter_rows(values_only=...)``
        and ``merged_cells.ranges``; each range exposes 1-based ``min_row``,
        ``max_row``, ``min_col`` and ``max_col``.
    :param image_dict: mapping from a cell value to an object with an ``id``
        attribute.
    :return: list of rows, each row a list of cell values.
    """
    data = []
    # Walk every row, header row included; callers split the header off.
    for row in sheet.iter_rows(values_only=False):
        row_data = []
        for cell in row:
            cell_value = cell.value
            image = image_dict.get(cell_value, None)
            if image is not None:
                cell_value = f'![](/api/image/{image.id})'
            # Rows are positional lists, so column order is iteration order.
            row_data.append(cell_value)
        data.append(row_data)

    # Fill merged ranges in one pass over the ranges instead of searching
    # the ranges again for every empty cell.
    for merged_range in sheet.merged_cells.ranges:
        top = merged_range.min_row - 1
        left = merged_range.min_col - 1
        # Skip a range whose anchor cell lies outside the collected data
        # rather than failing the whole extraction with an IndexError.
        if top >= len(data) or left >= len(data[top]):
            continue
        anchor_value = data[top][left]
        # Slicing clamps the row span to the rows that actually exist.
        for row_values in data[top:merged_range.max_row]:
            last_col = min(merged_range.max_col, len(row_values))
            for col_index in range(left, last_col):
                row_values[col_index] = anchor_value
    return data
5341

5442
def handle(self, file, get_buffer, save_image):
@@ -65,11 +53,13 @@ def handle(self, file, get_buffer, save_image):
6553
paragraphs = []
6654
ws = wb[sheetname]
6755
data = self.fill_merged_cells(ws, image_dict)
68-
69-
for row in data:
70-
row_output = "; ".join([f"{key}: {value}" for key, value in row.items()])
71-
# print(row_output)
72-
paragraphs.append({'title': '', 'content': row_output})
56+
if len(data) >= 2:
57+
head_list = data[0]
58+
for row_index in range(1, len(data)):
59+
row_output = "; ".join(
60+
[f"{head_list[col_index]}: {data[row_index][col_index]}" for col_index in
61+
range(0, len(data[row_index]))])
62+
paragraphs.append({'title': '', 'content': row_output})
7363

7464
result.append({'name': sheetname, 'paragraphs': paragraphs})
7565

@@ -78,7 +68,6 @@ def handle(self, file, get_buffer, save_image):
7868
return [{'name': file.name, 'paragraphs': []}]
7969
return result
8070

81-
8271
def get_content(self, file, save_image):
8372
try:
8473
# 加载 Excel 文件
@@ -94,18 +83,18 @@ def get_content(self, file, save_image):
9483
# 如果未指定 sheet_name,则使用第一个工作表
9584
for sheetname in workbook.sheetnames:
9685
sheet = workbook[sheetname] if sheetname else workbook.active
97-
rows = self.fill_merged_cells(sheet, image_dict)
98-
if len(rows) == 0:
86+
data = self.fill_merged_cells(sheet, image_dict)
87+
if len(data) == 0:
9988
continue
10089
# 提取表头和内容
10190

102-
headers = [f"{key}" for key, value in rows[0].items()]
91+
headers = [f"{value}" for value in data[0]]
10392

10493
# 构建 Markdown 表格
10594
md_table = '| ' + ' | '.join(headers) + ' |\n'
10695
md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
107-
for row in rows:
108-
r = [f'{value}' for key, value in row.items()]
96+
for row_index in range(1, len(data)):
97+
r = [f'{value}' for value in data[row_index]]
10998
md_table += '| ' + ' | '.join(
11099
[str(cell).replace('\n', '<br>') if cell is not None else '' for cell in r]) + ' |\n'
111100

0 commit comments

Comments
 (0)