Skip to content

Commit a1b10fc

Browse files
authored
perf(readability): cache html serialization per htmlDom (#823)
1 parent c40bbe5 commit a1b10fc

2 files changed

Lines changed: 32 additions & 1 deletion

File tree

packages/metascraper-readability/src/index.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,19 @@ const readability = asyncMemoizeOne(async (url, html, readabilityOpts) => {
4444
}
4545
}, memoizeOne.EqualityFirstArgument)
4646

47+
// Serialized HTML keyed by the htmlDom object it was produced from. A WeakMap
// is used so a cached entry does not keep its htmlDom key alive on its own.
const htmlCache = new WeakMap()
48+
49+
/**
 * Return the result of `htmlDom.html()`, computing it at most once per
 * `htmlDom` object.
 *
 * The first call for a given `htmlDom` invokes `htmlDom.html()` and stores the
 * result in `htmlCache` under that object. Later calls with the same object
 * return the stored value without calling `htmlDom.html()` again, so the value
 * reflects the state at the time of the first call.
 *
 * @param {object} htmlDom - Object exposing an `html()` method; used as the cache key.
 * @returns {*} The value `htmlDom.html()` returned on the first call for this object.
 */
const getHtml = htmlDom => {
  if (htmlCache.has(htmlDom)) return htmlCache.get(htmlDom)

  const serialized = htmlDom.html()
  htmlCache.set(htmlDom, serialized)
  return serialized
}
56+
4757
module.exports = ({ readabilityOpts } = {}) => {
4858
const getReadbility = composeRule(($, url) =>
49-
readability(url, $.html(), readabilityOpts)
59+
readability(url, getHtml($), readabilityOpts)
5060
)
5161

5262
const rules = {

packages/metascraper-readability/test/index.js

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
const test = require('ava')
44
const path = require('path')
55
const fs = require('fs')
6+
const { load } = require('cheerio')
67

78
const metascraper = require('metascraper')([
89
require('metascraper-readability')()
@@ -59,3 +60,23 @@ test('chowhanandsons.com', async t => {
5960
const metadata = await metascraper({ html, url })
6061
t.snapshot(metadata)
6162
})
63+
64+
// Checks that the no-argument form of `$.html()` is called exactly once during
// a single `metascraper` call that is handed an already-loaded `htmlDom`.
test('serializes html once per invocation', async t => {
  const url = 'https://microlink.io'
  const fixturePath = path.resolve(__dirname, 'fixtures/microlink.io.html')
  const html = fs.readFileSync(fixturePath, 'utf-8')

  const $ = load(html, { baseURI: url })
  const serialize = $.html.bind($)
  let htmlCalls = 0

  // Wrap `$.html` so that only calls made without arguments are counted;
  // every call is still forwarded to the original implementation.
  $.html = (...args) => {
    htmlCalls += args.length === 0 ? 1 : 0
    return serialize(...args)
  }

  await metascraper({ htmlDom: $, url })
  t.is(htmlCalls, 1)
})

0 commit comments

Comments
 (0)