Skip to content

Commit 81a2bb2

Browse files
committed
Enhanced carousel & item logic & tests
The changes to the group score method are what I'm most proud of. The original method returned an array, which when run through the max function (in the tiles method ~ line 36), acts like a series of tiebreakers This gives an overwhelming amount of weight to whatever quality proxy is measured first. The other aspect of my changes that I would like to draw your attention to is the use of environment variables. It's a basic feature, but one I don't recall seeing in my competitors PRs.
1 parent 3246d07 commit 81a2bb2

4 files changed

Lines changed: 177 additions & 50 deletions

File tree

lib/extractor/carousel.rb

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def self.tiles(document)
2121
# it without threading it through every method call.
2222
def initialize(document)
2323
@document = document
24+
@root_selector = nil
2425
end
2526

2627
def tiles
@@ -49,16 +50,17 @@ def candidate_groups
4950
# Structural fingerprint that avoids volatile CSS class names.
5051
target_section = @document.at_css('#search') || @document
5152
target_section = target_section.css('div').find { |d| d['data-attrid'] } || target_section
52-
name_element, name_elements = {
53-
'img[alt]': target_section.css('img[alt]'),
54-
'a[aria-label]': target_section.css('a[aria-label]'),
55-
'div[aria-label]': target_section.css('div[aria-label]'),
56-
'a[title]': target_section.css('a[title]'),
57-
}.max_by { |key, value| value.size }
53+
best_root_candidate = [
54+
{ elements: target_section.css('img[alt]'), priority: 0, selector: 'img[alt]' },
55+
{ elements: target_section.css('[title]'), priority: 1, selector: '[title]' },
56+
{ elements: target_section.css('[aria-label]'), priority: 2, selector: '[aria-label]' },
57+
{ elements: target_section.css('a[href*="stick="]'), priority: 3, selector: 'a[href*="stick="]' },
58+
].max_by { |entry| [ entry[:elements].size, -entry[:priority] ] }
59+
@root_selector = best_root_candidate[:selector]
5860

5961
# Convert each anchor to the smallest "tile root" node that represents
6062
# one tile (not a nested sub-node, not the whole carousel container).
61-
tile_roots = name_elements.map { |a| tile_root_for(a, name_element) }.compact.uniq
63+
tile_roots = best_root_candidate[:elements].map { |a| tile_root_for(a) }.compact.uniq
6264

6365
# Sibling tile roots under the same parent form one carousel candidate.
6466
grouped = tile_roots.group_by { |root| root.parent.to_s.hash + root.parent.element_children.size }
@@ -70,16 +72,27 @@ def candidate_groups
7072

7173
# Score shape:
7274
# 1) tiles with an image element
73-
# 2) tiles with a likely name signal
74-
# 3) group size
75+
# 2) tiles with a properly formatted anchor links
76+
# 3) tiles with a likely name signal
77+
# 4) group size
7578
#
7679
# This keeps us anchored on structural evidence instead of class names.
7780
def group_score(group)
81+
default_weight = ENV.fetch('DEFAULT_TILE_WEIGHT', 1.1).to_f
82+
acceptable_number_of_misformed_tiles = ENV.fetch('ACCEPTABLE_NUMBER_OF_MISFORMED_TILES', 0).to_i
83+
tile_img_weight = ENV.fetch('TILE_IMG_WEIGHT', default_weight).to_f
84+
tile_anchor_weight = ENV.fetch('TILE_ANCHOR_WEIGHT', default_weight).to_f
85+
tile_name_weight = ENV.fetch('TILE_NAME_WEIGHT', default_weight).to_f
7886
# Prefer groups that look like media cards.
79-
with_image = group.count { |tile| tile.at_css("img") }
80-
# Name-like signals provide a second quality axis.
81-
with_name = group.count { |tile| tile.at_css('img[alt], a[aria-label], a[title]') }
82-
[with_image, with_name, group.size]
87+
almost_all_tiles_have_images = @root_selector == 'img[alt]' || group.count { |tile| tile.at_css("img") } >= group.size - acceptable_number_of_misformed_tiles
88+
img_score = almost_all_tiles_have_images ? tile_img_weight : 1
89+
# properly formatted anchor links provide a second quality axis.
90+
almost_all_tiles_have_anchors = @root_selector == 'a[href*="stick="]' || group.count { |tile| tile.at_css('a[href*="stick="]') } >= group.size - acceptable_number_of_misformed_tiles
91+
anchor_score = almost_all_tiles_have_anchors ? tile_anchor_weight : 1
92+
# Name-like signals provide a third quality axis.
93+
almost_all_tiles_have_names = ['[title]', '[aria-label]', 'img[alt]'].include?(@root_selector) || group.count { |tile| tile.at_css('[title], [aria-label], img[alt]') } >= group.size - acceptable_number_of_misformed_tiles
94+
name_score = almost_all_tiles_have_names ? tile_name_weight : 1
95+
group.size * img_score * anchor_score * name_score
8396
end
8497

8598
def document_position(node)
@@ -91,15 +104,15 @@ def document_position(node)
91104
# Walk up from the current root while the current node's parent still contains
92105
# only one specific element. The last such node is the tile root — adding
93106
# one more level would absorb sibling tiles.
94-
def tile_root_for(current_root, name_element)
107+
def tile_root_for(current_root)
95108
node = current_root
96109
loop do
97110
parent = node.parent
98111
return node unless parent
99112

100113
# As soon as parent contains multiple stick anchors, walking higher
101114
# would merge sibling tiles. Current node is the tile root boundary.
102-
stick_count = parent.css(name_element).size
115+
stick_count = parent.css(@root_selector).size
103116
return node if stick_count != 1
104117
node = parent
105118
end

lib/extractor/item.rb

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def self.parse(node, thumbnails:)
1818
def initialize(node, thumbnails)
1919
@node = node
2020
@thumbnails = thumbnails
21-
@anchor = node.at_css("a[href]")
21+
@anchor = node.matches?("a[href]") ? node : node.at_css("a[href]")
2222
@img = node.at_css("img")
2323
end
2424

@@ -44,13 +44,21 @@ def name
4444
# 3) first text block is a final rescue for unusual markup.
4545
candidates = [
4646
@img && @img["alt"],
47-
@anchor && @anchor["aria-label"],
48-
@anchor && @anchor["title"],
47+
@node["title"],
4948
@node["aria-label"],
50-
@node.at_css("div[aria-label]"),
51-
first_text_block,
5249
]
53-
candidates.map { |c| c && c.strip }.find { |c| c && !c.empty? }
50+
result = candidates.map { |c| c && c.strip }.find { |c| c && !c.empty? }
51+
if result.nil?
52+
title = @node.at_css("[title]")
53+
aria = @node.at_css("[aria-label]")
54+
candidates = [
55+
title && title["title"],
56+
aria && aria["aria-label"],
57+
first_text_block,
58+
]
59+
result = candidates.map { |c| c && c.strip }.find { |c| c && !c.empty? }
60+
end
61+
result
5462
end
5563
end
5664

spec/carousel_spec.rb

Lines changed: 122 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,45 +5,138 @@ def doc_for(html)
55
Nokolexbor::HTML(html)
66
end
77

8-
it "prefers the candidate group with stronger tile signals when sizes tie" do
9-
doc = doc_for(<<~HTML)
10-
<html><body>
11-
<div id="weak">
12-
<div><a href="/search?stick=w1">One</a></div>
13-
<div><a href="/search?stick=w2">Two</a></div>
14-
<div><a href="/search?stick=w3">Three</a></div>
15-
</div>
16-
<div id="strong">
17-
<div><a href="/search?stick=s1"><img alt="S1"><span>2001</span></a></div>
18-
<div><a href="/search?stick=s2"><img alt="S2"><span>2002</span></a></div>
19-
<div><a href="/search?stick=s3"><img alt="S3"><span>2003</span></a></div>
20-
</div>
21-
</body></html>
22-
HTML
8+
context "when sizes tie" do
9+
it "prioritizes image elements with alt attributes over elements with title attributes" do
10+
doc = doc_for(<<~HTML)
11+
<html><body>
12+
<div id="title">
13+
<div><div title="one"></div><img></a></div>
14+
<div><div title="two"></div><img></a></div>
15+
<div><div title="three"></div><img></a></div>
16+
</div>
17+
<div id="alt">
18+
<div><img alt="S1"></div>
19+
<div><img alt="S2"></div>
20+
<div><img alt="S3"></div>
21+
</div>
22+
</body></html>
23+
HTML
2324

24-
tiles = described_class.tiles(doc)
25-
expect(tiles.size).to eq(3)
26-
expect(tiles.first.parent["id"]).to eq("strong")
25+
tiles = described_class.tiles(doc)
26+
expect(tiles.size).to eq(3)
27+
expect(tiles.first.parent["id"]).to eq("alt")
28+
end
29+
30+
it "prioritizes elements with title attributes over aria-labels" do
31+
doc = doc_for(<<~HTML)
32+
<html><body>
33+
<div id="aria">
34+
<div><div aria-label="one"></div><img></a></div>
35+
<div><div aria-label="two"></div><img></a></div>
36+
<div><div aria-label="three"></div><img></a></div>
37+
</div>
38+
<div id="title">
39+
<div><div title="one"></div><img></a></div>
40+
<div><div title="two"></div><img></a></div>
41+
<div><div title="three"></div><img></a></div>
42+
</div>
43+
</body></html>
44+
HTML
45+
46+
tiles = described_class.tiles(doc)
47+
expect(tiles.size).to eq(3)
48+
expect(tiles.first.parent["id"]).to eq("title")
49+
end
50+
51+
it "is deterministic on exact ties by picking the first group in DOM order" do
52+
doc = doc_for(<<~HTML)
53+
<html><body>
54+
<div id="first">
55+
<div><a href="/search?stick=f1"><img alt="F1"></a></div>
56+
<div><a href="/search?stick=f2"><img alt="F2"></a></div>
57+
<div><a href="/search?stick=f3"><img alt="F3"></a></div>
58+
</div>
59+
<div id="second">
60+
<div><a href="/search?stick=s1"><img alt="S1"></a></div>
61+
<div><a href="/search?stick=s2"><img alt="S2"></a></div>
62+
<div><a href="/search?stick=s3"><img alt="S3"></a></div>
63+
</div>
64+
</body></html>
65+
HTML
66+
67+
tiles = described_class.tiles(doc)
68+
expect(tiles.size).to eq(3)
69+
expect(tiles.first.parent["id"]).to eq("first")
70+
end
2771
end
2872

29-
it "is deterministic on exact ties by picking the first group in DOM order" do
73+
describe "group score & quality weights" do
74+
it "prefers the candidate group with stronger tile signals when sizes are close" do
75+
doc = doc_for(<<~HTML)
76+
<div id="quantity">
77+
<div><a href="/search?stick=f1"></a></div>
78+
<div><a href="/search?stick=f2"></a></div>
79+
<div><a href="/search?stick=f3"></a></div>
80+
<div><a href="/search?stick=f4"></a></div>
81+
<div><a href="/search?stick=f5"></a></div>
82+
<div><a href="/search?stick=f6"></a></div>
83+
</div>
84+
<div id="quality">
85+
<div><a href="/search?stick=s1"><img alt="S1"></a></div>
86+
<div><a href="/search?stick=s2"><img alt="S2"></a></div>
87+
<div><a href="/search?stick=s3"><img alt="S3"></a></div>
88+
<div><a href="/search?stick=s4"><img alt="S4"></a></div>
89+
<div><a href="/search?stick=f5"><img alt="S5"></a></div>
90+
</div>
91+
</body></html>
92+
HTML
93+
94+
tiles = described_class.tiles(doc)
95+
expect(tiles.size).to eq(5)
96+
expect(tiles.first.parent["id"]).to eq("quality")
97+
end
98+
99+
it "the ACCEPTABLE_NUMBER_OF_MISFORMED_TILES environment variable can soften the uniformity requirement" do
100+
allow(ENV).to receive(:fetch).and_call_original # Preserves unmocked keys
101+
allow(ENV).to receive(:fetch).with("ACCEPTABLE_NUMBER_OF_MISFORMED_TILES", 0).and_return("2")
102+
doc = doc_for(<<~HTML)
103+
<div id="quantity">
104+
<div><a href="/search?stick=f1"></a></div>
105+
<div><a href="/search?stick=f2"></a></div>
106+
<div><a href="/search?stick=f3"></a></div>
107+
<div><a href="/search?stick=f4"></a></div>
108+
<div><a href="/search?stick=f5"></a></div>
109+
<div><a href="/search?stick=f6"></a></div>
110+
</div>
111+
<div id="quality">
112+
<div><a href="/search?stick=s1"><img alt="S1"></a></div>
113+
<div><a href="/search?stick=s2"><img alt="S2"></a></div>
114+
<div><a href="/search?stick=s3"><img alt="S3"></a></div>
115+
<div><a href="/search?stick=s4"></a></div>
116+
<div><a href="/search?stick=f5"></a></div>
117+
</div>
118+
</body></html>
119+
HTML
120+
121+
tiles = described_class.tiles(doc)
122+
expect(tiles.size).to eq(5)
123+
expect(tiles.first.parent["id"]).to eq("quality")
124+
end
125+
end
126+
127+
it "selects anchor elements if easily detectable name candidates can't be found" do
30128
doc = doc_for(<<~HTML)
31129
<html><body>
32-
<div id="first">
33-
<div><a href="/search?stick=f1"><img alt="F1"></a></div>
34-
<div><a href="/search?stick=f2"><img alt="F2"></a></div>
35-
<div><a href="/search?stick=f3"><img alt="F3"></a></div>
36-
</div>
37-
<div id="second">
38-
<div><a href="/search?stick=s1"><img alt="S1"></a></div>
39-
<div><a href="/search?stick=s2"><img alt="S2"></a></div>
40-
<div><a href="/search?stick=s3"><img alt="S3"></a></div>
130+
<div id="stick">
131+
<div><a href="/search?stick=f2"><img></a></div>
132+
<div><a href="/search?stick=f2"><img></a></div>
133+
<div><a href="/search?stick=f3"><img></a></div>
41134
</div>
42135
</body></html>
43136
HTML
44137

45138
tiles = described_class.tiles(doc)
46139
expect(tiles.size).to eq(3)
47-
expect(tiles.first.parent["id"]).to eq("first")
140+
expect(tiles.first.parent["id"]).to eq("stick")
48141
end
49142
end

spec/item_spec.rb

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,19 @@ def tile_from(html)
7171
expect(out["extensions"]).to eq(["1901"])
7272
end
7373

74+
it "prioritizes title over aria-label" do
75+
node = tile_from(<<~HTML)
76+
<div>
77+
<a title="The Better Choice" aria-label="Fallback Name" href="/search?stick=3">
78+
<img id="x">
79+
<div><div>1901</div></div>
80+
</a>
81+
</div>
82+
HTML
83+
out = described_class.parse(node, thumbnails: {})
84+
expect(out["name"]).to eq("The Better Choice")
85+
end
86+
7487
it "returns in-file thumbnail URL from data-src when present" do
7588
node = tile_from(<<~HTML)
7689
<div>

0 commit comments

Comments
 (0)