Skip to content

Commit 742d302

Browse files
committed
WIP: Check "external" resources in the main loop as well via asset.load({ metadataOnly: true })
1 parent 6ac9d9d commit 742d302

3 files changed

Lines changed: 39 additions & 207 deletions

File tree

lib/index.js

Lines changed: 29 additions & 196 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
const AssetGraph = require('assetgraph');
22
const async = require('async');
3-
const request = require('request');
43
const version = require('../package.json').version;
54
const relationDebugDescription = require('./relationDebugDescription');
65
const prettyBytes = require('pretty-bytes');
@@ -140,171 +139,6 @@ async function hyperlink({
140139
};
141140
}
142141

143-
function httpStatus(asset, attempt = 1) {
144-
const url = asset.url;
145-
const relations = asset._incoming;
146-
147-
const loadReport = {
148-
operator: 'external-check',
149-
name: `external-check ${url}`,
150-
at: [...new Set(relations.map(r => r.debugDescription))].join('\n '),
151-
expected: `200 ${url}`
152-
};
153-
154-
return callback => {
155-
if (shouldSkip(loadReport)) {
156-
return setTimeout(callback);
157-
}
158-
159-
request({
160-
method: attempt === 1 ? 'head' : 'get',
161-
url: asset.url,
162-
strictSSL: true,
163-
gzip: true,
164-
headers: {
165-
'User-Agent': hyperlinkUserAgent,
166-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
167-
'Accept-Encoding': 'gzip, deflate, sdch, br'
168-
}
169-
}, (error, res) => {
170-
if (error) {
171-
const code = error.code;
172-
let actual = code || 'Unknown error';
173-
174-
switch (code) {
175-
case 'ENOTFOUND':
176-
actual = `DNS missing: ${asset.hostname}`;
177-
break;
178-
case 'HPE_INVALID_CONSTANT':
179-
if (attempt === 1) {
180-
return httpStatus(asset, attempt + 1)(callback);
181-
}
182-
break;
183-
}
184-
185-
reportTest({
186-
...loadReport,
187-
ok: false,
188-
actual
189-
});
190-
191-
return callback();
192-
}
193-
194-
const status = res.statusCode;
195-
196-
if (status >= 200 && status < 300) {
197-
const contentType = res.headers['content-type'];
198-
if (contentType && asset.type) {
199-
const matchContentType = contentType.match(
200-
/^\s*([\w\-+.]+\/[\w-+.]+)(?:\s|;|$)/i
201-
);
202-
if (matchContentType && asset.expectedTypes) {
203-
asset.contentType = matchContentType[1].toLowerCase();
204-
asset._tryUpgrade();
205-
}
206-
} else if (!contentType) {
207-
const contentTypeMisingReport = {
208-
ok: false,
209-
name: `content-type-missing ${asset.urlOrDescription}`,
210-
operator: 'content-type-missing',
211-
expected: asset.contentType || `A Content-Type compatible with ${asset.type}`,
212-
actual: contentType,
213-
at: [...new Set(relations.map(r => r.debugDescription))].join('\n '),
214-
};
215-
216-
if (!shouldSkip(contentTypeMisingReport)) {
217-
reportTest(contentTypeMisingReport);
218-
};
219-
}
220-
}
221-
222-
// Some servers respond weirdly to HEAD requests. Make a second attempt with GET
223-
if (attempt === 1 && status >= 400 && status < 600) {
224-
return httpStatus(asset, attempt + 1)(callback);
225-
}
226-
227-
// Some servers (jspm.io) respond with 502 if a HEAD request is followed too closely by a GET. Give the server a second to cool down
228-
if (attempt === 2 && status === 502) {
229-
setTimeout(
230-
() => httpStatus(asset, attempt + 1)(callback),
231-
1000
232-
);
233-
return;
234-
}
235-
236-
const redirects = res.request._redirect.redirects;
237-
if (redirects.length > 0) {
238-
const log = [{ redirectUri: url }, ...redirects].map((item, idx, arr) => {
239-
if (arr[idx + 1]) {
240-
item.statusCode = arr[idx + 1].statusCode;
241-
} else {
242-
item.statusCode = 200;
243-
}
244-
245-
return item;
246-
});
247-
248-
const redirectReport = {
249-
operator: 'external-redirect',
250-
name: `external-redirect ${url}`,
251-
at: [...new Set(relations.map(r => r.debugDescription))].join('\n '),
252-
expected: `302 ${url} --> 200 ${log[log.length - 1].redirectUri}`
253-
};
254-
255-
const actual = log.map(
256-
redirect => `${redirect.statusCode} ${redirect.redirectUri}`
257-
).join(' --> ');
258-
259-
if (!shouldSkip(redirectReport)) {
260-
// A single temporary redirect is allowed
261-
if ([302, 307].includes(log[0].statusCode)) {
262-
if (log.length < 3) {
263-
reportTest({
264-
...redirectReport,
265-
expected: actual,
266-
actual,
267-
ok: true
268-
});
269-
} else {
270-
reportTest({
271-
...redirectReport,
272-
expected: `${log[0].statusCode} ${url} --> 200 ${log[log.length - 1].redirectUri}`,
273-
actual,
274-
ok: false
275-
});
276-
}
277-
} else {
278-
reportTest({
279-
...redirectReport,
280-
actual,
281-
ok: false
282-
});
283-
}
284-
}
285-
}
286-
287-
if (status === 200) {
288-
reportTest({
289-
...loadReport,
290-
ok: true,
291-
actual: loadReport.expected
292-
});
293-
294-
return callback();
295-
}
296-
297-
reportTest({
298-
...loadReport,
299-
actual: `${status} ${url}`,
300-
ok: false
301-
});
302-
303-
return callback();
304-
});
305-
};
306-
}
307-
308142
if (verbose) {
309143
ag.on('addRelation', relation => {
310144
console.error('addRelation', relation.toString());
@@ -394,9 +228,10 @@ async function hyperlink({
394228
async function processAsset(asset) {
395229
if (!processedAssets.has(asset)) {
396230
processedAssets.add(asset);
231+
const operator = asset._metadataOnly ? 'external-check' : 'load';
397232
const loadReport = {
398-
operator: 'load',
399-
name: `load ${asset.urlOrDescription}`,
233+
operator,
234+
name: `${operator} ${asset.urlOrDescription}`,
400235
expected: `200 ${asset.urlOrDescription}`
401236
};
402237

@@ -411,7 +246,8 @@ async function hyperlink({
411246
}
412247

413248
try {
414-
await asset.load();
249+
// FIXME: Make sure we do a full load if an asset is added to the queue again in non-metadataOnly mode
250+
await asset.load({metadataOnly: asset._metadataOnly});
415251

416252
reportTest({
417253
...loadReport,
@@ -432,6 +268,18 @@ async function hyperlink({
432268
return;
433269
}
434270

271+
if (asset.statusCode >= 300 && asset.statusCode < 400) {
272+
// TODO: Warn about chains of temporary redirects
273+
const redirectRelation = asset.outgoingRelations.find(r => r.type === 'HttpRedirect');
274+
reportTest({
275+
ok: asset.statusCode !== 301,
276+
operator: 'external-redirect',
277+
name: `external-redirect ${asset.url}`,
278+
at: loadReport.at,
279+
expected: `302 ${asset.url} --> 200 ${redirectRelation.to.url}}`
280+
});
281+
}
282+
435283
for (const relation of asset.externalRelations) {
436284
// Only do work for supported protocols
437285
if (!['http:', 'https:', 'file:'].includes(relation.to.protocol)) {
@@ -463,7 +311,6 @@ async function hyperlink({
463311
});
464312
}
465313
}
466-
467314
} else if (relation.to.type === 'Html') {
468315
(relation.to.incomingFragments = relation.to.incomingFragments || []).push({
469316
fragment,
@@ -509,8 +356,10 @@ async function hyperlink({
509356
}
510357

511358
let follow;
512-
513-
if (['HtmlPreconnectLink', 'HtmlDnsPrefetchLink'].includes(relation.type)) {
359+
let metadataOnly = asset._metadataOnly;
360+
if (['HttpRedirect', 'FileRedirect'].includes(relation.type)) {
361+
follow = true;
362+
} else if (['HtmlPreconnectLink', 'HtmlDnsPrefetchLink'].includes(relation.type)) {
514363
follow = false;
515364
relation.to['check' + relation.type] = true;
516365
} else if (['HtmlAnchor', 'SvgAnchor', 'HtmlIFrame'].includes(relation.type)) {
@@ -522,39 +371,41 @@ async function hyperlink({
522371
follow = true;
523372
relation.to.stopProcessing = true;
524373
} else {
525-
relation.to.check = true;
374+
metadataOnly = true;
526375
}
527376
}
528377
} else if (/^(?:JavaScript|Css)Source(?:Mapping)Url$/.test(relation.type)) {
529378
if (followSourceMaps) {
530379
follow = true;
531380
} else {
532-
relation.to.check = true;
381+
metadataOnly = true;
533382
}
534383
} else if (['SourceMapFile', 'SourceMapSource'].includes(relation.type)) {
535384
if (followSourceMaps) {
536-
relation.to.check = true;
385+
metadataOnly = true;
537386
}
538387
} else {
539388
follow = true;
540389
}
541390

542-
if (follow) {
391+
if (follow || metadataOnly) {
543392
if (assetTypesWithoutRelations.includes(relation.to.type)) {
544393
// If we are handling local file-urls, follow but mark as end-of-line in processing
545394
if (relation.from.protocol === 'file:' && relation.to.protocol === 'file:') {
546395
relation.to.stopProcessing = !recursive;
547396
assetQueue.push(relation.to);
548397
} else {
549-
relation.to.check = true;
398+
metadataOnly = true;
550399
}
551400
} else {
552401
assetQueue.push(relation.to);
553402
}
403+
relation.to._metadataOnly = metadataOnly;
404+
assetQueue.push(relation.to);
554405
}
555406
}
556407

557-
if (asset.type === 'Html') {
408+
if (asset.type === 'Html' && !asset._metadataOnly) {
558409
// Remember the set of ids in the document before unloading so incoming fragments can be checked:
559410
asset.ids = new Set();
560411
for (const element of Array.from(asset.parseTree.querySelectorAll('[id]'))) {
@@ -622,24 +473,6 @@ async function hyperlink({
622473
}
623474
}
624475

625-
// Check urls
626-
const assetsToCheck = ag.findAssets({check: true}).filter(asset => !processedAssets.has(asset));
627-
t.push({
628-
name: `Crawling ${assetsToCheck.length} outgoing urls`
629-
});
630-
631-
await new Promise((resolve, reject) => async.parallelLimit(
632-
assetsToCheck.map(asset => httpStatus(asset)),
633-
20,
634-
err => {
635-
if (err) {
636-
reject(err);
637-
} else {
638-
resolve();
639-
}
640-
}
641-
));
642-
643476
// Check Content-Type vs. incoming relation targetTypes:
644477

645478
for (const asset of ag.findAssets({expectedTypes: {$exists: true}})) {

package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
"async": "^2.6.0",
4242
"optimist": "^0.6.1",
4343
"pretty-bytes": "^4.0.2",
44-
"request": "^2.83.0",
4544
"tap-render": "Munter/tap-render#0.1.7-patch3",
4645
"urltools": "^0.3.1"
4746
},

test/index.js

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ describe('hyperlink', function () {
109109
name: 'load https://example.com/',
110110
ok: true
111111
});
112-
t.push( { name: 'Crawling 2 outgoing urls' } );
112+
// t.push( { name: 'Crawling 2 outgoing urls' } );
113113
t.push(null, {
114114
ok: true,
115115
name: 'external-check https://google.com'
@@ -217,7 +217,7 @@ describe('hyperlink', function () {
217217
ok: false,
218218
operator: 'content-type-mismatch',
219219
name: 'content-type-mismatch https://example.com/hey.png',
220-
actual: 'Asset is used as both Image and Text',
220+
actual: 'Asset is used as both Png and Text',
221221
at: 'https://example.com/ (6:39) <img src="hey.png">'
222222
});
223223
});
@@ -260,8 +260,8 @@ describe('hyperlink', function () {
260260
expect(t.push, 'to have a call satisfying', () => {
261261
t.push(null, {
262262
ok: false,
263-
operator: 'content-type-missing',
264-
name: 'content-type-missing https://example.com/hey.png',
263+
operator: 'error',
264+
actual: 'https://example.com/hey.png: No Content-Type response header received',
265265
at: 'https://example.com/ (6:39) <img src="hey.png">'
266266
});
267267
});
@@ -332,9 +332,9 @@ describe('hyperlink', function () {
332332
actual: expect.it('to begin with', 'ENOENT: no such file or directory')
333333
});
334334

335-
t.push({
336-
name: 'Crawling 0 outgoing urls'
337-
});
335+
// t.push({
336+
// name: 'Crawling 0 outgoing urls'
337+
// });
338338

339339
t.push({
340340
name: 'Connecting to 0 hosts (checking <link rel="preconnect" href="...">'
@@ -673,12 +673,12 @@ describe('hyperlink', function () {
673673
{
674674
request: 'HEAD https://mycdn.com/404.eot',
675675
response: 404
676-
},
676+
}/*,
677677
// retry
678678
{
679679
request: 'GET https://mycdn.com/404.eot',
680680
response: 404
681-
}
681+
}*/
682682
]);
683683

684684
const t = new TapRender();
@@ -694,7 +694,7 @@ describe('hyperlink', function () {
694694
operator: 'external-check',
695695
name: 'external-check https://mycdn.com/404.eot',
696696
expected: '200 https://mycdn.com/404.eot',
697-
actual: '404 https://mycdn.com/404.eot'
697+
actual: 'HTTP 404 Not Found'
698698
});
699699
});
700700
});

0 commit comments

Comments
 (0)