Skip to content

Commit eb5b8eb

Browse files
authored
feat: defuddle (#829)
* feat: defuddle * test: update snapshot * perf: add benchmark * chore: capture defuddle errors * test: validation * perf: one loop * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * fix: skip JSON-LD structural keys (@-prefixed) in recursive schema search The non-exact recursive search in searchSchemaResults iterated over all object keys including @context, @type, etc. When @context is an object (e.g. containing property namespace definitions), property names inside it could match and return schema definition URLs instead of actual content values. Made-with: Cursor * fix: treat null JSON-LD properties as missing in $jsonld fallback `get()` returns `null` for explicitly null properties, but the loop guard used strict `!== undefined`, preventing the recursive searchSchemaResults fallback from running. Similarly, searchSchemaResults returned early with an empty result when hasOwnProperty matched a null value, blocking the non-exact recursive traversal. Both checks now correctly fall through for null values, allowing nested properties to be discovered. Made-with: Cursor * fix: remove unconditional object-to-name resolution in $jsonld The object-to-name block converted any object with a string `.name` into that name, regardless of what the caller expected. This broke callers like `$jsonld('image')` and `$jsonld('publisher')` that need the full object for downstream validators (toImage, toLogoUrl). The recursive fallback via searchSchemaResults already handles object-to-name resolution where appropriate. Made-with: Cursor * refactor: extract common helpers * fix: join multiple values from exact JSON-LD schema search in $jsonld fallback When searchSchemaResults traverses an explicit property path (e.g., author.name) through an array of objects, multiple resolved values are now joined with " and " instead of discarding all but the first. 
Fuzzy (recursive) search still returns only the first match. This restores co-author extraction for sites like NBC News where author is an array of Person objects. Made-with: Cursor
1 parent a83afd7 commit eb5b8eb

24 files changed

Lines changed: 947 additions & 82 deletions

File tree

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ const metascraper = require('metascraper')([
239239
- [metascraper-audio](https://github.com/microlinkhq/metascraper/tree/master/packages/metascraper-audio) – Get audio property from HTML markup.
240240
- [metascraper-author](https://github.com/microlinkhq/metascraper/tree/master/packages/metascraper-author) – Get author property from HTML markup.
241241
- [metascraper-date](https://github.com/microlinkhq/metascraper/tree/master/packages/metascraper-date) – Get date property from HTML markup.
242+
- [metascraper-defuddle](https://github.com/microlinkhq/metascraper/tree/master/packages/metascraper-defuddle) – A Defuddle connector for metascraper.
242243
- [metascraper-description](https://github.com/microlinkhq/metascraper/tree/master/packages/metascraper-description) – Get description property from HTML markup.
243244
- [metascraper-feed](https://github.com/microlinkhq/metascraper/tree/master/packages/metascraper-feed) – Get RSS/Atom feed URL from HTML markup.
244245
- [metascraper-feeds](https://github.com/microlinkhq/metascraper/tree/master/packages/metascraper-feeds) – Extract feed links (RSS/Atom/JSON) from HTML markup.
@@ -328,7 +329,7 @@ A set of property names to pick for the metadata extraction process. When specif
328329

329330
Type: `Array`
330331

331-
You can pass additional rules to add on execution time.
332+
You can pass additional rules to add at execution time.
332333

333334
These rules will be merged with your loaded [rules](#rules) at the beginning.
334335

Lines changed: 339 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,339 @@
1+
'use strict'
2+
3+
const fs = require('fs')
4+
const path = require('path')
5+
6+
const metascraper = require('../packages/metascraper/src')
7+
8+
// Absolute location of the integration suites, resolved relative to this
// script's own directory so the benchmark works from any working directory.
const INTEGRATION_DIR = path.resolve(__dirname, '../packages/metascraper/test/integration')
12+
13+
// Metadata properties compared between the two bundles, in iteration order.
const BASE_FIELDS = [
  'author', 'date', 'description',
  'audio', 'video', 'image',
  'lang', 'logo', 'manifest',
  'publisher', 'title', 'url'
]

// Independent copy of BASE_FIELDS; the per-field loops in this script iterate
// over this list rather than over BASE_FIELDS directly.
const ALL_FIELDS = BASE_FIELDS.slice()
29+
30+
/**
 * Tells whether a metadata value counts as present.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} `false` for `null`, `undefined` and the empty string;
 *   `true` for anything else (including `0`, `false` and empty arrays).
 */
const isFilled = value => ![null, undefined, ''].includes(value)
31+
32+
/**
 * Brings a metadata value into a comparable form.
 *
 * - `undefined` becomes `null`.
 * - Arrays are returned untouched (same reference).
 * - Any other non-null object is round-tripped through JSON, which yields a
 *   detached copy holding only JSON-representable data.
 * - Everything else (primitives, `null`) is returned as is.
 *
 * @param {*} value - Raw value taken from a metascraper result.
 * @returns {*} The normalized value.
 */
const normalizeValue = value => {
  if (value === undefined) return null

  const isNonArrayObject =
    typeof value === 'object' && value !== null && !Array.isArray(value)

  return isNonArrayObject ? JSON.parse(JSON.stringify(value)) : value
}
40+
41+
/**
 * JSON.stringify replacer that re-emits plain objects with their keys in
 * sorted order, so serialization no longer depends on insertion order.
 * Arrays, `null` and primitives pass through unchanged.
 */
const sortObjectKeys = (_key, value) => {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return value
  }
  // A null-prototype target keeps a literal `__proto__` key as plain data.
  const sorted = Object.create(null)
  for (const key of Object.keys(value).sort()) sorted[key] = value[key]
  return sorted
}

/**
 * Structural equality of two metadata values, based on their JSON form.
 *
 * Object keys are sorted before comparing, so two objects holding the same
 * data in a different key order are considered equal instead of being
 * reported as a difference. Array element order still matters.
 *
 * @param {*} a - First value.
 * @param {*} b - Second value.
 * @returns {boolean} `true` when both values serialize to the same JSON.
 */
const valueEquals = (a, b) =>
  JSON.stringify(a, sortObjectKeys) === JSON.stringify(b, sortObjectKeys)
42+
43+
/**
 * Renders an elapsed time for the console.
 *
 * @param {number} milliseconds - Elapsed time in milliseconds.
 * @returns {string} `<n>ms` below one second, otherwise seconds with two
 *   decimals followed by `s`.
 */
const formatDuration = milliseconds =>
  milliseconds < 1000
    ? `${milliseconds}ms`
    : `${(milliseconds / 1000).toFixed(2)}s`
47+
48+
/**
 * Produces a short, printable representation of a metadata value.
 *
 * The value is normalized first. Strings are cut to at most 180 characters
 * (177 plus `...`) and then JSON-quoted; any other value is serialized to
 * JSON and the serialized text is cut the same way.
 *
 * @param {*} value - Value to display.
 * @returns {string} Text suitable for a single log line.
 */
const formatValue = value => {
  const MAX_LENGTH = 180
  const shorten = text =>
    text.length <= MAX_LENGTH ? text : `${text.slice(0, 177)}...`

  const normalized = normalizeValue(value)

  return typeof normalized === 'string'
    ? JSON.stringify(shorten(normalized))
    : shorten(JSON.stringify(normalized))
}
60+
61+
/**
 * Builds a metascraper instance from the shared rule bundles plus one extra
 * bundle, which is loaded last.
 *
 * @param {string} bundleName - Module name of the bundle under comparison.
 * @returns {*} Whatever `metascraper(rules)` returns for the assembled rules.
 */
const getMetascraper = bundleName => {
  // Loaded in this exact order; the bundle under comparison goes at the end.
  const sharedBundles = [
    'metascraper-author',
    'metascraper-date',
    'metascraper-description',
    'metascraper-audio',
    'metascraper-video',
    'metascraper-image',
    'metascraper-lang',
    'metascraper-logo',
    'metascraper-logo-favicon',
    'metascraper-manifest',
    'metascraper-publisher',
    'metascraper-title',
    'metascraper-url'
  ]

  const rules = sharedBundles.map(name => require(name)())
  rules.push(require(bundleName)())

  return metascraper(rules)
}
78+
79+
/**
 * Finds the first `const url = '<value>'` declaration in a source text.
 *
 * @param {string} source - JavaScript source to scan.
 * @returns {string|null} The quoted value of the first match, or `null` when
 *   the source has no such declaration.
 */
const extractTopLevelUrl = source => {
  const found = source.match(/const\s+url\s*=\s*(['"])(.*?)\1/)
  if (!found) return null

  // Group 1 is the quote character, group 2 the text between the quotes.
  const [, , url] = found
  return url
}
83+
84+
/**
 * Derives benchmark cases from the source text of an integration test file.
 *
 * Every `readFile(resolve(__dirname, '<file>'))` call becomes one case. Its
 * URL is the closest preceding `const url = '...'` declaration, falling back
 * to the first such declaration anywhere in the source; calls for which no
 * URL can be found are skipped. Its name is the title of the closest
 * preceding `test('...',` call, or `integrationName` when there is none.
 *
 * @param {string} source - Contents of the integration test file.
 * @param {string} integrationName - Suite name, also the fallback test name.
 * @returns {Array<{suite: string, testName: string, inputPath: string, url: string}>}
 *   Cases in the order their `readFile` calls appear.
 */
const extractCases = (source, integrationName) => {
  // Runs a global regex over `source`, recording each match's offset and its
  // second capture group (the text between the quotes).
  const collectMatches = regex => {
    const matches = []
    let match
    while ((match = regex.exec(source)) !== null) {
      matches.push({ index: match.index, value: match[2] })
    }
    return matches
  }

  // Last entry located strictly before `index`. Entries are in ascending
  // offset order, so a backwards scan finds it without copying or reversing.
  const lastBefore = (entries, index) => {
    for (let i = entries.length - 1; i >= 0; i--) {
      if (entries[i].index < index) return entries[i]
    }
    return undefined
  }

  const topLevelUrl = extractTopLevelUrl(source)
  const urls = collectMatches(/const\s+url\s*=\s*(['"])(.*?)\1/g)
  const tests = collectMatches(/test\((['"])(.*?)\1\s*,/g)
  const reads = collectMatches(
    /readFile\(resolve\(__dirname,\s*(['"])(.*?)\1\)\)/g
  )

  const cases = []

  for (const { index: readIndex, value: inputPath } of reads) {
    const nearestUrl = lastBefore(urls, readIndex)
    const nearestTest = lastBefore(tests, readIndex)

    const url = nearestUrl ? nearestUrl.value : topLevelUrl
    if (!url) continue

    cases.push({
      suite: integrationName,
      testName: nearestTest ? nearestTest.value : integrationName,
      inputPath,
      url
    })
  }

  return cases
}
128+
129+
/**
 * Scans the integration directory and collects every runnable case.
 *
 * Sub-directories are visited in sorted name order. A directory without an
 * `index.js` is ignored, and so is any extracted case whose HTML fixture does
 * not exist on disk.
 *
 * @returns {Array<object>} Extracted cases, each extended with the absolute
 *   `htmlPath` of its fixture.
 */
const discoverCases = () => {
  const suiteNames = []
  for (const entry of fs.readdirSync(INTEGRATION_DIR, { withFileTypes: true })) {
    if (entry.isDirectory()) suiteNames.push(entry.name)
  }
  suiteNames.sort()

  const discovered = []

  for (const suiteName of suiteNames) {
    const indexPath = path.resolve(INTEGRATION_DIR, suiteName, 'index.js')
    if (!fs.existsSync(indexPath)) continue

    const source = fs.readFileSync(indexPath, 'utf8')

    for (const testCase of extractCases(source, suiteName)) {
      const htmlPath = path.resolve(
        INTEGRATION_DIR,
        suiteName,
        testCase.inputPath
      )
      if (fs.existsSync(htmlPath)) discovered.push({ ...testCase, htmlPath })
    }
  }

  return discovered
}
153+
154+
/**
 * Compares the metadata produced by both bundles, field by field.
 *
 * @param {object} results
 * @param {object} results.readability - Metadata from the readability setup.
 * @param {object} results.defuddle - Metadata from the defuddle setup.
 * @returns {{differentFields: string[], fields: object}} `differentFields`
 *   lists the fields whose normalized values are not equal; `fields` maps
 *   every field in ALL_FIELDS to its normalized values, equality flag and
 *   filled flags.
 */
const getFieldDiff = ({ readability, defuddle }) => {
  const differentFields = []
  const fields = {}

  for (const field of ALL_FIELDS) {
    const readabilityValue = normalizeValue(readability[field])
    const defuddleValue = normalizeValue(defuddle[field])
    const equal = valueEquals(readabilityValue, defuddleValue)

    fields[field] = {
      equal,
      readability: readabilityValue,
      defuddle: defuddleValue,
      readabilityFilled: isFilled(readabilityValue),
      defuddleFilled: isFilled(defuddleValue)
    }

    if (!equal) differentFields.push(field)
  }

  return { differentFields, fields }
}
182+
183+
/**
 * Aggregates per-case comparison rows into overall statistics.
 *
 * @param {Array<{differentFields: string[], fields: object}>} rows - One
 *   entry per benchmark case.
 * @returns {object} Totals of cases with and without differences, per-field
 *   counters (different/equal and which side was filled), and the number of
 *   filled values seen on each side across all fields.
 */
const computeSummary = rows => {
  const perField = {}
  for (const field of ALL_FIELDS) {
    perField[field] = {
      different: 0,
      readabilityOnlyFilled: 0,
      defuddleOnlyFilled: 0,
      bothFilled: 0,
      bothEmpty: 0,
      equal: 0
    }
  }

  const summary = {
    totalCases: rows.length,
    casesWithDifferences: 0,
    casesEqual: 0,
    perField,
    completeness: {
      readability: { totalFilled: 0 },
      defuddle: { totalFilled: 0 }
    }
  }

  for (const row of rows) {
    const caseCounter =
      row.differentFields.length > 0 ? 'casesWithDifferences' : 'casesEqual'
    summary[caseCounter]++

    for (const field of ALL_FIELDS) {
      const stats = summary.perField[field]
      const diff = row.fields[field]

      stats[diff.equal ? 'equal' : 'different']++

      const { readabilityFilled, defuddleFilled } = diff

      // Exactly one of the four fill buckets applies to each field.
      let fillBucket
      if (readabilityFilled && defuddleFilled) fillBucket = 'bothFilled'
      else if (!readabilityFilled && !defuddleFilled) fillBucket = 'bothEmpty'
      else if (readabilityFilled) fillBucket = 'readabilityOnlyFilled'
      else fillBucket = 'defuddleOnlyFilled'
      stats[fillBucket]++

      if (readabilityFilled) summary.completeness.readability.totalFilled++
      if (defuddleFilled) summary.completeness.defuddle.totalFilled++
    }
  }

  return summary
}
233+
234+
/**
 * Writes the final benchmark report to the console: overall counters followed
 * by the cases with the most differing fields.
 *
 * Only the ten rows with the highest number of differing fields are
 * considered, and rows without differences are not listed.
 *
 * @param {object} report
 * @param {object} report.summary - Aggregated statistics for all rows.
 * @param {Array<object>} report.rows - Per-case comparison rows.
 */
const printSummary = ({ summary, rows }) => {
  const byMostDifferences = (left, right) =>
    right.differentFields.length - left.differentFields.length
  const worstRows = Array.from(rows).sort(byMostDifferences).slice(0, 10)

  console.log(
    'Accuracy benchmark: metascraper-readability vs metascraper-defuddle'
  )
  console.log(`Cases: ${summary.totalCases}`)
  console.log(
    `Cases with differences: ${summary.casesWithDifferences} | Equal cases: ${summary.casesEqual}`
  )
  console.log(
    `Filled values (all fields): readability=${summary.completeness.readability.totalFilled} | defuddle=${summary.completeness.defuddle.totalFilled}`
  )
  console.log('')

  console.log('Top differing cases:')
  for (const row of worstRows) {
    const differing = row.differentFields
    if (differing.length === 0) continue

    const location = `${row.suite}/${row.testName}`
    console.log(
      `- ${location}: ${differing.length} fields differ (${differing.join(', ')})`
    )
  }
}
261+
262+
/**
 * Logs the differing fields of a single case, showing for each one the value
 * obtained from either side.
 *
 * @param {object} diff
 * @param {string[]} diff.differentFields - Names of the fields that differ.
 * @param {object} diff.fields - Per-field comparison data keyed by field name.
 */
const printCaseDifferences = ({ differentFields, fields }) => {
  const total = differentFields.length

  if (total === 0) {
    console.log(' -> no field differences')
    return
  }

  console.log(` -> differences in ${total} fields`)

  for (const field of differentFields) {
    const { readability, defuddle } = fields[field]
    console.log(` ${field}`)
    console.log(` readability: ${formatValue(readability)}`)
    console.log(` defuddle: ${formatValue(defuddle)}`)
  }
}
277+
278+
/**
 * Entry point of the benchmark.
 *
 * Builds one metascraper instance per bundle, discovers the integration
 * cases, runs both instances over every case's HTML one case at a time, logs
 * the per-case differences and finally prints the aggregated summary together
 * with the total elapsed time. Returns early when no case is discovered.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  const start = Date.now()
  const readability = getMetascraper('metascraper-readability')
  const defuddle = getMetascraper('metascraper-defuddle')
  const cases = discoverCases()
  const totalCases = cases.length

  console.log(
    `Running accuracy benchmark over ${totalCases} integration cases...`
  )

  if (totalCases === 0) {
    console.log('No integration cases were discovered. Exiting early.')
    return
  }

  const rows = []
  let position = 0

  for (const testCase of cases) {
    position += 1
    const { suite, testName, inputPath, url, htmlPath } = testCase
    const caseStart = Date.now()

    console.log(`[${position}/${totalCases}] ${suite}/${testName}`)

    const html = fs.readFileSync(htmlPath)

    // Both extractions run one after the other on the same input.
    const readabilityMetadata = await readability({ html, url })
    const defuddleMetadata = await defuddle({ html, url })

    const comparison = getFieldDiff({
      readability: readabilityMetadata,
      defuddle: defuddleMetadata
    })
    const { differentFields, fields } = comparison

    rows.push({ suite, testName, inputPath, url, differentFields, fields })

    printCaseDifferences({ differentFields, fields })

    const elapsed = formatDuration(Date.now() - caseStart)
    console.log(
      ` -> done in ${elapsed} (${differentFields.length} differing fields)`
    )
  }

  const summary = computeSummary(rows)
  printSummary({ summary, rows })
  console.log(`Elapsed: ${formatDuration(Date.now() - start)}`)
}
335+
336+
// Run the benchmark; on failure print the error and flag a non-zero exit code
// without forcing the process to terminate immediately.
main().catch(failure => {
  console.error(failure)
  process.exitCode = 1
})

0 commit comments

Comments
 (0)