Skip to content

Commit a94ed21

Browse files
committed
WIP: Check external resources in the main loop as well via asset.load({ metadataOnly: true })
1 parent ada6936 commit a94ed21

2 files changed

Lines changed: 46 additions & 226 deletions

File tree

lib/index.js

Lines changed: 31 additions & 212 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
const AssetGraph = require('assetgraph');
22
const async = require('async');
3-
const request = require('request');
43
const version = require('../package.json').version;
54
const relationDebugDescription = require('./relationDebugDescription');
65
const prettyBytes = require('pretty-bytes');
@@ -147,184 +146,6 @@ async function hyperlink(
147146
};
148147
}
149148

150-
function httpStatus(asset, attempt = 1) {
151-
const url = asset.url;
152-
const relations = asset._incoming;
153-
154-
const loadReport = {
155-
operator: 'external-check',
156-
name: `external-check ${url}`,
157-
at: [...new Set(relations.map(r => r.debugDescription))].join(
158-
'\n '
159-
),
160-
expected: `200 ${url}`
161-
};
162-
163-
return callback => {
164-
if (shouldSkip(loadReport)) {
165-
return setTimeout(callback);
166-
}
167-
168-
request(
169-
{
170-
method: attempt === 1 ? 'head' : 'get',
171-
url: asset.url,
172-
strictSSL: true,
173-
gzip: true,
174-
headers: {
175-
'User-Agent': hyperlinkUserAgent,
176-
Accept:
177-
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
178-
'Accept-Encoding': 'gzip, deflate, sdch, br'
179-
}
180-
},
181-
(error, res) => {
182-
if (error) {
183-
const code = error.code;
184-
let actual = code || 'Unknown error';
185-
186-
switch (code) {
187-
case 'ENOTFOUND':
188-
actual = `DNS missing: ${asset.hostname}`;
189-
break;
190-
case 'HPE_INVALID_CONSTANT':
191-
if (attempt === 1) {
192-
return httpStatus(asset, attempt + 1)(callback);
193-
}
194-
break;
195-
}
196-
197-
reportTest({
198-
...loadReport,
199-
ok: false,
200-
actual
201-
});
202-
203-
return callback();
204-
}
205-
206-
const status = res.statusCode;
207-
208-
if (status >= 200 && status < 300) {
209-
const contentType = res.headers['content-type'];
210-
if (contentType && asset.type) {
211-
const matchContentType = contentType.match(
212-
/^\s*([\w\-+.]+\/[\w-+.]+)(?:\s|;|$)/i
213-
);
214-
if (matchContentType && asset.expectedTypes) {
215-
asset.contentType = matchContentType[1].toLowerCase();
216-
asset._tryUpgrade();
217-
}
218-
} else if (!contentType) {
219-
const contentTypeMisingReport = {
220-
ok: false,
221-
name: `content-type-missing ${asset.urlOrDescription}`,
222-
operator: 'content-type-missing',
223-
expected:
224-
asset.contentType ||
225-
`A Content-Type compatible with ${asset.type}`,
226-
actual: contentType,
227-
at: [...new Set(relations.map(r => r.debugDescription))].join(
228-
'\n '
229-
)
230-
};
231-
232-
if (!shouldSkip(contentTypeMisingReport)) {
233-
reportTest(contentTypeMisingReport);
234-
}
235-
}
236-
}
237-
238-
// Some servers respond weirdly to HEAD requests. Make a second attempt with GET
239-
if (attempt === 1 && status >= 400 && status < 600) {
240-
return httpStatus(asset, attempt + 1)(callback);
241-
}
242-
243-
// Some servers (jspm.io) respond with 502 if requesting HEAD, then GET too close in succession. Give the server a second to cool down
244-
if (attempt === 2 && status === 502) {
245-
setTimeout(() => httpStatus(asset, attempt + 1)(callback), 1000);
246-
return;
247-
}
248-
249-
const redirects = res.request._redirect.redirects;
250-
if (redirects.length > 0) {
251-
const log = [{ redirectUri: url }, ...redirects].map(
252-
(item, idx, arr) => {
253-
if (arr[idx + 1]) {
254-
item.statusCode = arr[idx + 1].statusCode;
255-
} else {
256-
item.statusCode = 200;
257-
}
258-
259-
return item;
260-
}
261-
);
262-
263-
const redirectReport = {
264-
operator: 'external-redirect',
265-
name: `external-redirect ${url}`,
266-
at: [...new Set(relations.map(r => r.debugDescription))].join(
267-
'\n '
268-
),
269-
expected: `302 ${url} --> 200 ${log[log.length - 1].redirectUri}`
270-
};
271-
272-
const actual = log
273-
.map(redirect => `${redirect.statusCode} ${redirect.redirectUri}`)
274-
.join(' --> ');
275-
276-
if (!shouldSkip(redirectReport)) {
277-
// A single temporary redirect is allowed
278-
if ([302, 307].includes(log[0].statusCode)) {
279-
if (log.length < 3) {
280-
reportTest({
281-
...redirectReport,
282-
expected: actual,
283-
actual,
284-
ok: true
285-
});
286-
} else {
287-
reportTest({
288-
...redirectReport,
289-
expected: `${log[0].statusCode} ${url} --> 200 ${
290-
log[log.length - 1].redirectUri
291-
}`,
292-
actual,
293-
ok: false
294-
});
295-
}
296-
} else {
297-
reportTest({
298-
...redirectReport,
299-
actual,
300-
ok: false
301-
});
302-
}
303-
}
304-
}
305-
306-
if (status === 200) {
307-
reportTest({
308-
...loadReport,
309-
ok: true,
310-
actual: loadReport.expected
311-
});
312-
313-
return callback();
314-
}
315-
316-
reportTest({
317-
...loadReport,
318-
actual: `${status} ${url}`,
319-
ok: false
320-
});
321-
322-
return callback();
323-
}
324-
);
325-
};
326-
}
327-
328149
if (verbose) {
329150
ag.on('addRelation', relation => {
330151
console.error('addRelation', relation.toString());
@@ -424,9 +245,10 @@ async function hyperlink(
424245
async function processAsset(asset) {
425246
if (!processedAssets.has(asset)) {
426247
processedAssets.add(asset);
248+
const operator = asset._metadataOnly ? 'external-check' : 'load';
427249
const loadReport = {
428-
operator: 'load',
429-
name: `load ${asset.urlOrDescription}`,
250+
operator,
251+
name: `${operator} ${asset.urlOrDescription}`,
430252
expected: `200 ${asset.urlOrDescription}`
431253
};
432254

@@ -441,7 +263,8 @@ async function hyperlink(
441263
}
442264

443265
try {
444-
await asset.load();
266+
// FIXME: Make sure we do a full load if an asset is added to the queue again in non-metadataOnly mode
267+
await asset.load({ metadataOnly: asset._metadataOnly });
445268

446269
reportTest({
447270
...loadReport,
@@ -462,6 +285,20 @@ async function hyperlink(
462285
return;
463286
}
464287

288+
if (asset.statusCode >= 300 && asset.statusCode < 400) {
289+
// TODO: Warn about chains of temporary redirects
290+
const redirectRelation = asset.outgoingRelations.find(
291+
r => r.type === 'HttpRedirect'
292+
);
293+
reportTest({
294+
ok: asset.statusCode !== 301,
295+
operator: 'external-redirect',
296+
name: `external-redirect ${asset.url}`,
297+
at: loadReport.at,
298+
expected: `302 ${asset.url} --> 200 ${redirectRelation.to.url}`
299+
});
300+
}
301+
465302
for (const relation of asset.externalRelations) {
466303
// Only do work for supported protocols
467304
if (!['http:', 'https:', 'file:'].includes(relation.to.protocol)) {
@@ -547,8 +384,10 @@ async function hyperlink(
547384
}
548385

549386
let follow;
550-
551-
if (
387+
let metadataOnly = asset._metadataOnly;
388+
if (['HttpRedirect', 'FileRedirect'].includes(relation.type)) {
389+
follow = true;
390+
} else if (
552391
['HtmlPreconnectLink', 'HtmlDnsPrefetchLink'].includes(relation.type)
553392
) {
554393
follow = false;
@@ -568,7 +407,7 @@ async function hyperlink(
568407
follow = true;
569408
relation.to.stopProcessing = true;
570409
} else {
571-
relation.to.check = true;
410+
metadataOnly = true;
572411
}
573412
}
574413
} else if (
@@ -577,19 +416,19 @@ async function hyperlink(
577416
if (followSourceMaps) {
578417
follow = true;
579418
} else {
580-
relation.to.check = true;
419+
metadataOnly = true;
581420
}
582421
} else if (
583422
['SourceMapFile', 'SourceMapSource'].includes(relation.type)
584423
) {
585424
if (followSourceMaps) {
586-
relation.to.check = true;
425+
metadataOnly = true;
587426
}
588427
} else {
589428
follow = true;
590429
}
591430

592-
if (follow) {
431+
if (follow || metadataOnly) {
593432
if (assetTypesWithoutRelations.includes(relation.to.type)) {
594433
// If we are handling local file-urls, follow but mark as end-of-line in processing
595434
if (
@@ -599,15 +438,17 @@ async function hyperlink(
599438
relation.to.stopProcessing = !recursive;
600439
assetQueue.push(relation.to);
601440
} else {
602-
relation.to.check = true;
441+
metadataOnly = true;
603442
}
604443
} else {
605444
assetQueue.push(relation.to);
606445
}
446+
relation.to._metadataOnly = metadataOnly;
447+
assetQueue.push(relation.to);
607448
}
608449
}
609450

610-
if (asset.type === 'Html') {
451+
if (asset.type === 'Html' && !asset._metadataOnly) {
611452
// Remember the set of ids in the document before unloading so incoming fragments can be checked:
612453
asset.ids = new Set();
613454
for (const element of Array.from(
@@ -680,28 +521,6 @@ async function hyperlink(
680521
}
681522
}
682523

683-
// Check urls
684-
const assetsToCheck = ag
685-
.findAssets({ check: true })
686-
.filter(asset => !processedAssets.has(asset));
687-
t.push({
688-
name: `Crawling ${assetsToCheck.length} outgoing urls`
689-
});
690-
691-
await new Promise((resolve, reject) =>
692-
async.parallelLimit(
693-
assetsToCheck.map(asset => httpStatus(asset)),
694-
20,
695-
err => {
696-
if (err) {
697-
reject(err);
698-
} else {
699-
resolve();
700-
}
701-
}
702-
)
703-
);
704-
705524
// Check Content-Type vs. incoming relation targetTypes:
706525

707526
for (const asset of ag.findAssets({ expectedTypes: { $exists: true } })) {

0 commit comments

Comments
 (0)