Skip to content

Commit fcd228c

Browse files
committed
performance improvements & scrapeMemo index explanation
1 parent 82bb73a commit fcd228c

5 files changed

Lines changed: 56 additions & 119 deletions

File tree

lib/extractor.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
require_relative "extractor/carousel"
55
require_relative "extractor/item"
66

7+
# scrapeMemo psuedocode: enabling the Item class to detect whether a relevant scrapeMemo record has been found by the Carousel class would probably require changing the Extractor from a module into a class
78
module Extractor
89
# Public facade. Returns an Array<Hash> of carousel items.
910
#

lib/extractor/carousel.rb

Lines changed: 27 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,19 @@ def self.tiles(document)
2121
# it without threading it through every method call.
2222
def initialize(document)
2323
@document = document
24-
@root_selector = nil
2524
end
2625

2726
def tiles
28-
groups = candidate_groups
27+
# scrapeMemo psuedocode: create empty scrapeMemo hash, which will serve as an index for future parsing of the same search result structure (data-attrid, tile grid container class, tile root class, tile count, name_attribute, image_script_variable_names)
28+
target_section = @document.at_css('#search') || @document
29+
# scrapeMemo psuedocode: if '#search' can't be found, add that to scrapeMemo hash
30+
target_section = target_section.css('div').find { |d| d['data-attrid'] } || target_section
31+
# scrapeMemo psuedocode: if div['data-attrid'] can't be found, add that to scrapeMemo hash
32+
# scrapeMemo psuedocode: check database for any records containing the same ['data-attrid'] value
33+
# scrapeMemo psuedocode: if one or more record(s) exist, scan for the tile grid container class, prioritizing the record most recently created
34+
# scrapeMemo psuedocode: if the tile grid container exists & has the expected number of children with the expected tile root class, set them as the tile roots & skip the rest of this function
35+
36+
groups = candidate_groups(target_section)
2937
return [] if groups.empty?
3038

3139
# Multiple stick-link groups can exist on one page. We prefer the group
@@ -36,38 +44,23 @@ def tiles
3644
best_score = scored.map(&:first).max
3745
best_groups = scored.select { |score, _| score == best_score }.map(&:last)
3846
best_groups.min_by { |g| document_position(g.first) } || []
47+
# scrapeMemo psuedocode: just like with the div['data-attrid'] value before, we can now check the tile grid container class, tile root class, tile count, as well as the div['data-attrid'] value against recorded indexes to check for search result structure drift
3948
end
4049

4150
private
4251

43-
# Build candidate groups by:
44-
# 1. Finding every `/search?…&stick=…` anchor.
45-
# 2. Walking each anchor up to its *tile root* — the highest ancestor
46-
# that still contains exactly one stick anchor.
47-
# 3. Grouping tile roots by their common parent. A group with
48-
# MIN_TILES+ siblings is a carousel candidate.
49-
def candidate_groups
50-
# Structural fingerprint that avoids volatile CSS class names.
51-
target_section = @document.at_css('#search') || @document
52-
target_section = target_section.css('div').find { |d| d['data-attrid'] } || target_section
53-
best_root_candidate = [
54-
{ elements: target_section.css('img[alt]'), priority: 0, selector: 'img[alt]' },
55-
{ elements: target_section.css('[title]'), priority: 1, selector: '[title]' },
56-
{ elements: target_section.css('[aria-label]'), priority: 2, selector: '[aria-label]' },
57-
{ elements: target_section.css('a[href*="stick="]'), priority: 3, selector: 'a[href*="stick="]' },
58-
].max_by { |entry| [ entry[:elements].size, -entry[:priority] ] }
59-
@root_selector = best_root_candidate[:selector]
60-
61-
# Convert each anchor to the smallest "tile root" node that represents
62-
# one tile (not a nested sub-node, not the whole carousel container).
63-
tile_roots = best_root_candidate[:elements].map { |a| tile_root_for(a) }.compact.uniq
64-
65-
# Sibling tile roots under the same parent form one carousel candidate.
66-
grouped = tile_roots.group_by { |root| root.parent.to_s.hash + root.parent.element_children.size }
67-
grouped.delete(nil)
52+
# before = Time.now
53+
# for i in 1..1000
54+
# candidate_groups(target_section)
55+
# end
56+
# after = Time.now
57+
# puts "new version benchmarked at #{after - before}"
6858

69-
# Drop weak candidates early.
70-
grouped.values.select { |g| g.size >= MIN_TILES }
59+
# Build candidate groups by finding the three biggest sibling groups within the target section
60+
# I tested this versus the previous implementation with the quick & dirty benchmark commented out above
61+
# against the van-gogh-paintings.html results and found this to be about x10 faster.
62+
def candidate_groups(target_section)
63+
target_section.css('div', 'section', 'main').max_by(3) { |element| element.element_children.count }.map { |e| e.element_children }
7164
end
7265

7366
# Score shape:
@@ -84,13 +77,15 @@ def group_score(group)
8477
tile_anchor_weight = ENV.fetch('TILE_ANCHOR_WEIGHT', default_weight).to_f
8578
tile_name_weight = ENV.fetch('TILE_NAME_WEIGHT', default_weight).to_f
8679
# Prefer groups that look like media cards.
87-
almost_all_tiles_have_images = @root_selector == 'img[alt]' || group.count { |tile| tile.at_css("img") } >= group.size - acceptable_number_of_misformed_tiles
80+
almost_all_tiles_have_images = group.count { |tile| tile.at_css("img") } >= group.size - acceptable_number_of_misformed_tiles
8881
img_score = almost_all_tiles_have_images ? tile_img_weight : 1
8982
# properly formatted anchor links provide a second quality axis.
90-
almost_all_tiles_have_anchors = @root_selector == 'a[href*="stick="]' || group.count { |tile| tile.at_css('a[href*="stick="]') } >= group.size - acceptable_number_of_misformed_tiles
83+
almost_all_tiles_have_anchors = group.count { |tile| tile.at_css('a[href*="stick="]') } >= group.size - acceptable_number_of_misformed_tiles
9184
anchor_score = almost_all_tiles_have_anchors ? tile_anchor_weight : 1
9285
# Name-like signals provide a third quality axis.
93-
almost_all_tiles_have_names = ['[title]', '[aria-label]', 'img[alt]'].include?(@root_selector) || group.count { |tile| tile.at_css('[title], [aria-label], img[alt]') } >= group.size - acceptable_number_of_misformed_tiles
86+
# I didn't think that searching for each name candidate individually to enforce uniformity
87+
# was worth the performance cost, but it is a tradeoff worth discussing in a code review
88+
almost_all_tiles_have_names = group.count { |tile| tile.at_css('[title], [aria-label], img[alt]') } >= group.size - acceptable_number_of_misformed_tiles
9489
name_score = almost_all_tiles_have_names ? tile_name_weight : 1
9590
group.size * img_score * anchor_score * name_score
9691
end
@@ -100,22 +95,5 @@ def document_position(node)
10095
# DOM-order fallback is stable and explainable.
10196
@document.css("*").index(node) || Float::INFINITY
10297
end
103-
104-
# Walk up from the current root while the current node's parent still contains
105-
# only one specific element. The last such node is the tile root — adding
106-
# one more level would absorb sibling tiles.
107-
def tile_root_for(current_root)
108-
node = current_root
109-
loop do
110-
parent = node.parent
111-
return node unless parent
112-
113-
# As soon as parent contains multiple stick anchors, walking higher
114-
# would merge sibling tiles. Current node is the tile root boundary.
115-
stick_count = parent.css(@root_selector).size
116-
return node if stick_count != 1
117-
node = parent
118-
end
119-
end
12098
end
12199
end

lib/extractor/item.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def to_h
3737
private
3838

3939
def name
40+
# scrapeMemo psuedocode: if the Carousel class detected a relevant scrapeMemo index, then the recorded name candidate can be checked for first
4041
@name ||= begin
4142
# Ordered fallback chain:
4243
# 1) <img alt> is usually canonical title on Google tiles.

lib/extractor/thumbnail_index.rb

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,6 @@ module Extractor
88
# thumbnail is injected at runtime against the image id(s) listed in `ii`.
99
# We replicate that mapping at parse time so no JS execution is required.
1010
class ThumbnailIndex
11-
# Greedy on the data URI body, anchored on the trailing `_setImagesSrc(ii,s,r)`
12-
# to avoid mis-pairing s/ii from adjacent script blocks.
13-
14-
15-
IMAGE_SETTER_REGEX = /_setImagesSrc\((?<id>[a-z]*),\s*(?<source>[a-z]*)/.freeze
16-
1711
ID_REGEX = /'([^']+)'/.freeze
1812

1913
# Public convenience API.
@@ -26,15 +20,22 @@ def initialize(document)
2620
end
2721

2822
def build
23+
# scrapeMemo psuedocode: if the Carousel class detected a relevant scrapeMemo index, then the recorded function name, variable names, and variable order (maybe?) could be used to construct a single regex instead of the three different regexes I'm using
24+
image_setter_regex = /_setImagesSrc\((?<id>[a-z]*),\s*(?<source>[a-z]*)/.freeze
2925
mapping = {}
3026
@document.css("script").each do |script|
3127
body = script.content
32-
desired_variables = body.match(IMAGE_SETTER_REGEX)
28+
desired_variables = body.match(image_setter_regex)
29+
# instead of skipping if the image_setter_regex fails to match because the image_setter function has been renamed, we could modify the source & ids regexes to look for properly formatted variable assignments & then look for a function taking those variables as parameters near the very end of the script
30+
# Something like...
31+
# source_regex = /var\s+(?<source_variable>\w+)\s*=\s*'(?<data>data:image\/[^']+)'\s*;/.freeze
32+
# ids_regex = /var\s+(?<ids_variable>\w+)\s*=\s*\[(?<ids>[^\]]*)\]\s*;/.freeze
33+
# image_setter_regex = /(?<function_name>\w+)\((#{source_variable}|#{ids_variable}),\s*(#{source_variable}|#{ids_variable}).{,12}\z/.freeze
3334
next if desired_variables.nil?
3435

3536
source_regex = /var\s+#{desired_variables[:source]}\s*=\s*'(?<data>data:image\/[^']+)'\s*;/.freeze
3637
ids_regex = /var\s+#{desired_variables[:id]}\s*=\s*\[(?<ids>[^\]]*)\]\s*;/.freeze
37-
38+
3839
source_result = body.match(source_regex)
3940
ids_result = body.match(ids_regex)
4041

spec/carousel_spec.rb

Lines changed: 18 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -5,69 +5,25 @@ def doc_for(html)
55
Nokolexbor::HTML(html)
66
end
77

8-
context "when sizes tie" do
9-
it "prioritizes image elements with alt attributes over elements with title attributes" do
10-
doc = doc_for(<<~HTML)
11-
<html><body>
12-
<div id="title">
13-
<div><div title="one"></div><img></a></div>
14-
<div><div title="two"></div><img></a></div>
15-
<div><div title="three"></div><img></a></div>
16-
</div>
17-
<div id="alt">
18-
<div><img alt="S1"></div>
19-
<div><img alt="S2"></div>
20-
<div><img alt="S3"></div>
21-
</div>
22-
</body></html>
23-
HTML
24-
25-
tiles = described_class.tiles(doc)
26-
expect(tiles.size).to eq(3)
27-
expect(tiles.first.parent["id"]).to eq("alt")
28-
end
29-
30-
it "prioritizes elements with title attributes over aria-labels" do
31-
doc = doc_for(<<~HTML)
32-
<html><body>
33-
<div id="aria">
34-
<div><div aria-label="one"></div><img></a></div>
35-
<div><div aria-label="two"></div><img></a></div>
36-
<div><div aria-label="three"></div><img></a></div>
37-
</div>
38-
<div id="title">
39-
<div><div title="one"></div><img></a></div>
40-
<div><div title="two"></div><img></a></div>
41-
<div><div title="three"></div><img></a></div>
42-
</div>
43-
</body></html>
44-
HTML
45-
46-
tiles = described_class.tiles(doc)
47-
expect(tiles.size).to eq(3)
48-
expect(tiles.first.parent["id"]).to eq("title")
49-
end
50-
51-
it "is deterministic on exact ties by picking the first group in DOM order" do
52-
doc = doc_for(<<~HTML)
53-
<html><body>
54-
<div id="first">
55-
<div><a href="/search?stick=f1"><img alt="F1"></a></div>
56-
<div><a href="/search?stick=f2"><img alt="F2"></a></div>
57-
<div><a href="/search?stick=f3"><img alt="F3"></a></div>
58-
</div>
59-
<div id="second">
60-
<div><a href="/search?stick=s1"><img alt="S1"></a></div>
61-
<div><a href="/search?stick=s2"><img alt="S2"></a></div>
62-
<div><a href="/search?stick=s3"><img alt="S3"></a></div>
63-
</div>
64-
</body></html>
65-
HTML
8+
it "is deterministic on exact ties by picking the first group in DOM order when sizes tie" do
9+
doc = doc_for(<<~HTML)
10+
<html><body>
11+
<div id="first">
12+
<div><a href="/search?stick=f1"><img alt="F1"></a></div>
13+
<div><a href="/search?stick=f2"><img alt="F2"></a></div>
14+
<div><a href="/search?stick=f3"><img alt="F3"></a></div>
15+
</div>
16+
<div id="second">
17+
<div><a href="/search?stick=s1"><img alt="S1"></a></div>
18+
<div><a href="/search?stick=s2"><img alt="S2"></a></div>
19+
<div><a href="/search?stick=s3"><img alt="S3"></a></div>
20+
</div>
21+
</body></html>
22+
HTML
6623

67-
tiles = described_class.tiles(doc)
68-
expect(tiles.size).to eq(3)
69-
expect(tiles.first.parent["id"]).to eq("first")
70-
end
24+
tiles = described_class.tiles(doc)
25+
expect(tiles.size).to eq(3)
26+
expect(tiles.first.parent["id"]).to eq("first")
7127
end
7228

7329
describe "group score & quality weights" do

0 commit comments

Comments
 (0)