-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathhtmlParse.py
More file actions
87 lines (79 loc) · 2.89 KB
/
htmlParse.py
File metadata and controls
87 lines (79 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
def table_to_list(html, encoding='utf-8'):
    """
    Transform a HTML table to a list of 2 dimensions.

    Every ``<tr>`` found in ``html`` becomes one row of the result. Cells
    carrying a ``colspan`` and/or ``rowspan`` are split: their text is
    repeated in every grid position they cover. Rows that end up shorter than
    the widest one are padded with empty strings ("repair"), so the returned
    table is rectangular.

    Missing, malformed, zero or negative ``colspan``/``rowspan`` values are
    treated as 1 instead of raising.

    Note : Untested with a lot of encoding

    :param html: html code
    :type html: str
    :param encoding: document encoding, only used when the extracted cell
        text is not already a ``str``
    :type encoding: str
    :return: the table content, one inner list per row
    :rtype: list[list[str]]
    """
    soup = BeautifulSoup(html, 'html.parser')

    # First pass: one list per <tr>, each cell stored as a dict holding its
    # text and its spans. Cells stay dicts until they have been expanded.
    table = []
    nb_col = 0
    for tr in soup.find_all('tr'):
        # An explicit name list matches only <td>/<th>. A compiled pattern
        # is applied by Beautiful Soup with search(), so r'(td|th)' would
        # also match tags such as <math> or <thead>.
        cells = tr.find_all(['td', 'th'])
        table.append([
            {
                'colspan': _table_to_list_span(cell, 'colspan'),
                'rowspan': _table_to_list_span(cell, 'rowspan'),
                'text': _table_to_list_cell_text(cell, encoding),
            }
            for cell in cells
        ])
        nb_col = max(nb_col, len(cells))
    del soup

    # Second pass: walk the grid column by column, top to bottom, replacing
    # each dict by its text and inserting copies for the spanned positions.
    # ``nb_col`` and ``len(table)`` can both grow while spans are expanded,
    # hence the ``while`` loops re-evaluating their bounds.
    i_col = 0
    while i_col < nb_col:
        i_line = 0
        while i_line < len(table):
            row = table[i_line]
            # Repair table: pad short rows so that ``row[i_col]`` exists.
            if len(row) <= i_col:
                row.extend([''] * (i_col + 1 - len(row)))
            cell = row[i_col]
            # Plain strings are already-expanded (or padding) cells.
            if isinstance(cell, dict):
                text = cell['text']
                for i_colspan in range(i_col, i_col + cell['colspan']):
                    for i_rowspan in range(i_line, i_line + cell['rowspan']):
                        # The origin position is handled after the loops.
                        if i_colspan == i_col and i_rowspan == i_line:
                            continue
                        row_len = _table_to_list_insert(
                            table, i_rowspan, i_colspan, text)
                        nb_col = max(nb_col, row_len)
                # Update cell value to string
                row[i_col] = text
            i_line += 1
        i_col += 1
    return table


def _table_to_list_span(cell, attr_name):
    """Return the ``colspan``/``rowspan`` of ``cell`` as an int >= 1 (1 if absent or invalid)."""
    try:
        span = int(cell.attrs.get(attr_name, 1))
    except (TypeError, ValueError):
        # Malformed attribute value (e.g. "", "2;", "100%"): ignore the span.
        return 1
    # A zero or negative span would otherwise cancel the span on the other
    # axis as well, because the corresponding range() would be empty.
    return max(span, 1)


def _table_to_list_cell_text(cell, encoding):
    """Return the text of ``cell`` with a single space between text nodes."""
    text = cell.get_text(separator=' ')  # bs4
    if text is None or text == " ":
        return ""
    if not isinstance(text, str):
        text = text.encode(encoding)
    return text


def _table_to_list_insert(table, i_line, i_col, val):
    """Insert ``val`` at ``table[i_line][i_col]``, creating missing rows and padding; return the new row length."""
    while len(table) <= i_line:
        table.append([])
    row = table[i_line]
    # Repair table: pad so that the insertion lands exactly at ``i_col``.
    if len(row) < i_col:
        row.extend([''] * (i_col - len(row)))
    # insert() (not assignment) shifts the cells already present in this row
    # to the right, making room for the spanned cell coming from another row.
    row.insert(i_col, val)
    return len(row)