Skip to content

Commit 42197b4

Browse files
committed
final tests & changes
1 parent fcd228c commit 42197b4

4 files changed

Lines changed: 21 additions & 32 deletions

File tree

lib/extractor/carousel.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,10 @@ def initialize(document)
2424
end
2525

2626
def tiles
27-
# scrapeMemo pseudocode: create empty scrapeMemo hash, which will serve as an index for future parsing of the same search result structure (data-attrid, tile grid container class, tile root class, tile count, name_attribute, image_script_variable_names)
27+
# scrapeMemo pseudocode: create empty scrapeMemo hash, which will serve as an index for future parsing of the same search result structure (data-attrid, whether target grid is inside div#search or not, tile grid container class, tile root class, tile count, name_attribute, image_script_variable_names)
2828
target_section = @document.at_css('#search') || @document
2929
# scrapeMemo pseudocode: if '#search' can't be found, add that to scrapeMemo hash
30-
target_section = target_section.css('div').find { |d| d['data-attrid'] } || target_section
30+
target_section = target_section.css('div').find { |d| d['data-attrid'] } || @document.css('div').find { |d| d['data-attrid'] } || target_section
3131
# scrapeMemo pseudocode: if div['data-attrid'] can't be found, add that to scrapeMemo hash
3232
# scrapeMemo pseudocode: check database for any records containing the same ['data-attrid'] value
3333
# scrapeMemo pseudocode: if one or more record(s) exist, scan for the tile grid container class, prioritizing the record most recently created

lib/extractor/item.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def to_h
3131
"extensions" => extensions,
3232
"link" => link,
3333
"image" => image,
34-
}
34+
}.compact
3535
end
3636

3737
private

spec/extractor_spec.rb

Lines changed: 17 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,41 +7,30 @@
77
end
88
let(:result) { Extractor.call(File.join(FIXTURES_DIR, "van-gogh-paintings.html")) }
99

10-
it "returns the same number of items as the SerpApi reference output" do
11-
expect(result.size).to eq(expected.size)
10+
it "matches SerpApi reference output" do
11+
expect(result).to eq(expected)
1212
end
13+
end
1314

14-
it "matches name, extensions and link byte-for-byte across all items" do
15-
mismatches = result.each_with_index.reject do |item, i|
16-
%w[name extensions link].all? { |f| item[f] == expected[i][f] }
17-
end
18-
expect(mismatches).to be_empty, -> {
19-
mismatches.first(3).map { |it, i|
20-
"row #{i}: got=#{it.reject { |k,_| k == 'image' }.inspect} " \
21-
"exp=#{expected[i].reject { |k,_| k == 'image' }.inspect}"
22-
}.join("\n")
23-
}
15+
describe ".call with U.S. Presidents fixture" do
16+
let(:expected) do
17+
JSON.parse(File.read((File.expand_path("fixtures/u.s._presidents.json", __dir__))))["artworks"]
2418
end
19+
let(:result) { Extractor.call(File.expand_path("fixtures/u.s._presidents.html", __dir__)) }
2520

26-
it "extracts every inline thumbnail present in the HTML exactly" do
27-
# The page only ships the first N base64 thumbnails inline. The rest
28-
# are URL thumbnails in in-file attributes (e.g. data-src). Those are
29-
# still part of the page snapshot and should be surfaced as-is.
30-
inline_expected = expected.each_with_index.select { |e, _| e["image"].to_s.start_with?("data:") }
21+
it "matches U.S. Presidents reference output" do
22+
expect(result).to eq(expected)
23+
end
24+
end
3125

32-
inline_expected.each do |e, i|
33-
expect(result[i]["image"]).to eq(e["image"]),
34-
"mismatch on row #{i} (#{e['name']})"
35-
end
26+
describe ".call with Tom Cruise movies fixture" do
27+
let(:expected) do
28+
JSON.parse(File.read((File.expand_path("fixtures/tom_cruise_movies.json", __dir__))))["artworks"]
3629
end
30+
let(:result) { Extractor.call(File.expand_path("fixtures/tom_cruise_movies.html", __dir__)) }
3731

38-
it "matches image output byte-for-byte against expected array" do
39-
mismatches = result.each_with_index.reject { |item, i| item["image"] == expected[i]["image"] }
40-
expect(mismatches).to be_empty, -> {
41-
mismatches.first(3).map { |(item, i)|
42-
"row #{i}: got=#{item['image'].inspect} exp=#{expected[i]['image'].inspect}"
43-
}.join("\n")
44-
}
32+
it "matches Tom Cruise movies reference output" do
33+
expect(result).to eq(expected)
4534
end
4635
end
4736
end

spec/item_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def tile_from(html)
5353

5454
it "returns nil when there is no anchor (malformed tile)" do
5555
node = tile_from("<div><span>just text</span></div>")
56-
expected = {"extensions" => nil, "image" => nil, "link" => nil, "name" => "just text"}
56+
expected = {"name" => "just text"}
5757
expect(described_class.parse(node, thumbnails: {})).to eq(expected)
5858
end
5959

0 commit comments

Comments
 (0)