-
Notifications
You must be signed in to change notification settings - Fork 334
Expand file tree
/
Copy pathgeneralized_solution.py
More file actions
137 lines (107 loc) · 5.04 KB
/
Copy pathgeneralized_solution.py
File metadata and controls
137 lines (107 loc) · 5.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# GENERALIZED SOLUTION
# Instead of relying on CSS class names, we keep only the <a> tags that pass
# both of these filters:
# 1. the href contains "&stick=" — present on knowledge graph carousel links
# 2. the <a> contains an <img>
#
# For the name and year we look for "leaf" nodes — tags with no child tags, only text —
# so we never depend on class names for the text content either.
#
# <a href="/search?...&stick=...">            <-- &stick=
#   <img id="..." data-src="..." src="..."/>   <-- has img
#   <div>                                      <-- not a leaf
#     <div>The Starry Night</div>              <-- name at leaf[0]
#     <div>1889</div>                          <-- year at leaf[1]
#   </div>
# </a>
# EXAMPLE 1: IMAGE WITHOUT DATA-SRC (the image data is looked up by the <img> id in an inline <script>)
# <div class="iELo6" jsdata="JI96Wc;unsupported;BCVKE8" style="width:153px;top:8px;left:8px">
# <a href="/search?sca_esv=c2e426814f4d07e9&gl=us&hl=en&q=The+Starry+Night&stick=H4sIAAAAAAAAAONgFuLQz9U3MI_PNVLiBLFMzC3jC7WUspOt9Msyi0sTc-ITi0qQmJnFJVbl-UXZxYtYBUIyUhWCSxKLiioV_DLTM0oAdKX0-E4AAAA&sa=X&ved=2ahUKEwjK-K-JwLWKAxXcQTABHePpOFoQtq8DegQIMxAD">
# <img alt="The Starry Night" class="taFZJe" data-deferred="1" id="_L_FkZ4qlAtyDwbkP49Pj0QU_63" src="data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="/>
# <div class="KHK6lb">
# <div class="pgNMRc">The Starry Night</div>
# <div class="cxzHyb">1889</div>
# </div>
# </a>
# </div>
# EXAMPLE 2: IMAGE WITH DATA-SRC (the data-src URL is used directly)
#
# <div class="iELo6" style="display:none;width:0px;top:0px;left:0px" jsdata="JI96Wc;unsupported;BCVKF4">
# <a href="/search?sca_esv=c2e426814f4d07e9&gl=us&hl=en&q=Self-Portrait+with+Bandaged+Ear&stick=H4sIAAAAAAAAAONgFuLQz9U3MI_PNVLiArFMk0pSqiy1lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWOWDU3PSdAPyi0qKEjNLFMozSzIUnBLzUhLTU1MUXBOLAGGQvkZeAAAA&sa=X&ved=2ahUKEwjK-K-JwLWKAxXcQTABHePpOFoQtq8DegQIMxAT">
# <img class="taFZJe" alt="Self-Portrait with Bandaged Ear" data-src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ8juuefle5MyKZKBLRgPjsGSJon7vkt91SM7WTRuZOOyAyUI1v" src="data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==">
# <div class="KHK6lb">
# <div class="pgNMRc">Self-Portrait with Bandaged Ear</div>
# <div class="cxzHyb">1889</div>
# </div>
# </a>
# </div>
from bs4 import BeautifulSoup
import json
def _build_image_map(soup):
    """Map <img> ids to the inline image data assigned to them by <script> tags.

    Only scripts containing both a ``var s='<data URI>';`` assignment and a
    ``var ii=['<id>', ...];`` assignment are used. Scripts that do not match
    that shape are skipped instead of aborting the whole parse.
    """
    s_marker = "var s='"
    ii_marker = "var ii=['"

    image_map = {}
    for script in soup.find_all("script"):
        text = script.string or ""
        if "data:image" not in text:
            continue

        s_start = text.find(s_marker)
        ii_start = text.find(ii_marker)
        if s_start == -1 or ii_start == -1:
            continue
        s_start += len(s_marker)
        ii_start += len(ii_marker)

        s_end = text.find("';", s_start)
        ii_end = text.find("']", ii_start)
        if s_end == -1 or ii_end == -1:
            continue

        # "=" is escaped as \x3d inside the script's string literal.
        img_data = text[s_start:s_end].replace("\\x3d", "=")

        # The id array may list several ids sharing one image: ['a','b'].
        for img_id in text[ii_start:ii_end].split("','"):
            image_map[img_id] = img_data
    return image_map


def _leaf_texts(anchor):
    """Return the stripped text of every non-<img> leaf tag inside *anchor*.

    A leaf is a tag that has no child tags and has non-empty text.
    Texts are returned in document order.
    """
    texts = []
    for tag in anchor.find_all():
        if tag.name == "img":
            continue
        if tag.find():
            # Has at least one child tag, so it is a wrapper, not a leaf.
            continue
        text = tag.get_text(strip=True)
        if text:
            texts.append(text)
    return texts


def get_artworks(file_path):
    """Extract carousel cards from a saved Google results page.

    A card is any ``<a>`` whose href contains ``&stick=`` and that contains an
    ``<img>`` and at least one text leaf. The first leaf is the name and the
    second leaf, when present, is the year.

    Args:
        file_path: Path to the saved HTML file.

    Returns:
        ``{"artworks": [card, ...]}`` where each card is a dict with ``name``
        and ``link``, plus ``extensions`` (a one-item list holding the year)
        and ``image`` when those values are available.
    """
    google_origin = "https://www.google.com"

    # Explicit encoding: the default depends on the platform locale.
    with open(file_path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "lxml")

    image_map = _build_image_map(soup)

    artworks = []
    for a in soup.find_all("a", href=True):
        card_href = a.get("href", "")

        # Carousel links seem to have &stick= in the href.
        if "&stick=" not in card_href:
            continue

        # Must have an img inside.
        card_img = a.find("img")
        if card_img is None:
            continue

        # Need at least a name.
        leaves = _leaf_texts(a)
        if not leaves:
            continue

        card_name = leaves[0]
        card_year = leaves[1] if len(leaves) > 1 else None

        # Only prefix the origin when the href is not already absolute.
        if card_href.startswith(("http://", "https://")):
            card_google_link = card_href
        else:
            card_google_link = google_origin + card_href

        # Prefer data-src; fall back to the inline image keyed by the img id.
        # `or` also falls back when data-src is present but empty.
        final_card_img_src = card_img.get("data-src") or image_map.get(card_img.get("id"))

        aw = {
            "name": card_name,
            "link": card_google_link,
        }
        if card_year:
            aw["extensions"] = [card_year]
        if final_card_img_src:
            aw["image"] = final_card_img_src
        artworks.append(aw)

    return {"artworks": artworks}
if __name__ == "__main__":
    import os
    import sys

    # Accept a file path as argument; default to the Van Gogh page.
    input_file = sys.argv[1] if len(sys.argv) > 1 else "files/van-gogh-paintings.html"

    # Output file name: files/marvel.html -> files/marvel-output.json
    # splitext only touches the final extension, so a path without ".html"
    # can never map back onto the input file and overwrite it.
    base_name, _ = os.path.splitext(input_file)
    output_file = base_name + "-output.json"

    print(f"parsing {input_file}...")
    result = get_artworks(input_file)
    print(f"found {len(result['artworks'])} items")

    # ensure_ascii=False writes raw non-ASCII characters, so the file
    # encoding must be explicit rather than locale-dependent.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print(f"saved to {output_file}")