11const AssetGraph = require ( 'assetgraph' ) ;
22const async = require ( 'async' ) ;
3- const request = require ( 'request' ) ;
43const version = require ( '../package.json' ) . version ;
54const relationDebugDescription = require ( './relationDebugDescription' ) ;
65const prettyBytes = require ( 'pretty-bytes' ) ;
@@ -147,184 +146,6 @@ async function hyperlink(
147146 } ;
148147 }
149148
/**
 * Create a callback-style task that checks an external asset over HTTP and
 * reports the outcome through `reportTest`.
 *
 * The first attempt uses HEAD; later attempts use GET. A retry happens when:
 *  - the request fails with `HPE_INVALID_CONSTANT` on the first attempt,
 *  - the first attempt answers with a 4xx/5xx status,
 *  - the second attempt answers with 502 (retried once more after 1 second).
 *
 * Reports that may be emitted:
 *  - `external-check`: ok only when the final status is exactly 200,
 *  - `content-type-missing`: a 2xx response without a Content-Type header,
 *  - `external-redirect`: the response was reached through redirects; ok only
 *    for a single 302/307 hop.
 *
 * @param {object} asset Asset to check. Reads `url`, `hostname`, `type`,
 *   `expectedTypes`, `urlOrDescription`, `contentType` and `_incoming`; may
 *   set `contentType` and call `_tryUpgrade()`.
 * @param {number} [attempt=1] 1-based attempt counter, used for retries.
 * @returns {function(function): void} Task that invokes its callback (without
 *   arguments) once the check has been reported or skipped.
 */
function httpStatus(asset, attempt = 1) {
  const url = asset.url;
  const relations = asset._incoming;

  // De-duplicated descriptions of the relations pointing at this asset.
  // Evaluated on demand so each report sees the relations as they are when
  // the report is built.
  const describeIncoming = () =>
    [...new Set(relations.map(r => r.debugDescription))].join('\n ');

  const loadReport = {
    operator: 'external-check',
    name: `external-check ${url}`,
    at: describeIncoming(),
    expected: `200 ${url}`
  };

  return callback => {
    if (shouldSkip(loadReport)) {
      // Stay asynchronous even when nothing is requested
      return setTimeout(callback);
    }

    request(
      {
        method: attempt === 1 ? 'head' : 'get',
        url: asset.url,
        strictSSL: true,
        gzip: true,
        headers: {
          'User-Agent': hyperlinkUserAgent,
          Accept:
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Encoding': 'gzip, deflate, sdch, br'
        }
      },
      (error, res) => {
        if (error) {
          const code = error.code;
          let actual = code || 'Unknown error';

          switch (code) {
            case 'ENOTFOUND':
              actual = `DNS missing: ${asset.hostname}`;
              break;
            case 'HPE_INVALID_CONSTANT':
              // Parse error on the HEAD response: try once more with GET
              if (attempt === 1) {
                return httpStatus(asset, attempt + 1)(callback);
              }
              break;
          }

          reportTest({
            ...loadReport,
            ok: false,
            actual
          });

          return callback();
        }

        const status = res.statusCode;

        if (status >= 200 && status < 300) {
          const contentType = res.headers['content-type'];
          if (contentType && asset.type) {
            // Extract the bare media type, dropping parameters such as charset
            const matchContentType = contentType.match(
              /^\s*([\w\-+.]+\/[\w-+.]+)(?:\s|;|$)/i
            );
            if (matchContentType && asset.expectedTypes) {
              asset.contentType = matchContentType[1].toLowerCase();
              asset._tryUpgrade();
            }
          } else if (!contentType) {
            const contentTypeMissingReport = {
              ok: false,
              name: `content-type-missing ${asset.urlOrDescription}`,
              operator: 'content-type-missing',
              expected:
                asset.contentType ||
                `A Content-Type compatible with ${asset.type}`,
              actual: contentType,
              at: describeIncoming()
            };

            if (!shouldSkip(contentTypeMissingReport)) {
              reportTest(contentTypeMissingReport);
            }
          }
        }

        // Some servers respond weirdly to HEAD requests. Make a second attempt with GET
        if (attempt === 1 && status >= 400 && status < 600) {
          return httpStatus(asset, attempt + 1)(callback);
        }

        // Some servers (jspm.io) respond with 502 if requesting HEAD, then GET in close succession. Give the server a second to cool down
        if (attempt === 2 && status === 502) {
          setTimeout(() => httpStatus(asset, attempt + 1)(callback), 1000);
          return;
        }

        // `_redirect` is a private field of the request object; treat its
        // absence as "no redirects" instead of throwing.
        const redirects =
          (res.request &&
            res.request._redirect &&
            res.request._redirect.redirects) ||
          [];

        if (redirects.length > 0) {
          // Each redirect entry holds the status that led to its redirectUri.
          // Shift the statuses one step back so every hop is paired with the
          // status its own url answered with. Copies are made so the request
          // library's objects are left untouched, and the last hop gets the
          // real final status rather than an assumed 200.
          const hops = [{ redirectUri: url }, ...redirects];
          const log = hops.map((hop, idx) => ({
            ...hop,
            statusCode: hops[idx + 1] ? hops[idx + 1].statusCode : status
          }));
          const finalUri = log[log.length - 1].redirectUri;

          const redirectReport = {
            operator: 'external-redirect',
            name: `external-redirect ${url}`,
            at: describeIncoming(),
            expected: `302 ${url} --> 200 ${finalUri}`
          };

          const actual = log
            .map(redirect => `${redirect.statusCode} ${redirect.redirectUri}`)
            .join(' --> ');

          if (!shouldSkip(redirectReport)) {
            // A single temporary redirect is allowed
            if ([302, 307].includes(log[0].statusCode)) {
              if (log.length < 3) {
                reportTest({
                  ...redirectReport,
                  expected: actual,
                  actual,
                  ok: true
                });
              } else {
                reportTest({
                  ...redirectReport,
                  expected: `${log[0].statusCode} ${url} --> 200 ${finalUri}`,
                  actual,
                  ok: false
                });
              }
            } else {
              reportTest({
                ...redirectReport,
                actual,
                ok: false
              });
            }
          }
        }

        if (status === 200) {
          reportTest({
            ...loadReport,
            ok: true,
            actual: loadReport.expected
          });

          return callback();
        }

        reportTest({
          ...loadReport,
          actual: `${status} ${url}`,
          ok: false
        });

        return callback();
      }
    );
  };
}
327-
328149 if ( verbose ) {
329150 ag . on ( 'addRelation' , relation => {
330151 console . error ( 'addRelation' , relation . toString ( ) ) ;
@@ -424,9 +245,10 @@ async function hyperlink(
424245 async function processAsset ( asset ) {
425246 if ( ! processedAssets . has ( asset ) ) {
426247 processedAssets . add ( asset ) ;
248+ const operator = asset . _metadataOnly ? 'external-check' : 'load' ;
427249 const loadReport = {
428- operator : 'load' ,
429- name : `load ${ asset . urlOrDescription } ` ,
250+ operator,
251+ name : `${ operator } ${ asset . urlOrDescription } ` ,
430252 expected : `200 ${ asset . urlOrDescription } `
431253 } ;
432254
@@ -441,7 +263,8 @@ async function hyperlink(
441263 }
442264
443265 try {
444- await asset . load ( ) ;
266+ // FIXME: Make sure we do a full load if an asset is added to the queue again in non-metadataOnly mode
267+ await asset . load ( { metadataOnly : asset . _metadataOnly } ) ;
445268
446269 reportTest ( {
447270 ...loadReport ,
@@ -462,6 +285,20 @@ async function hyperlink(
462285 return ;
463286 }
464287
// Report redirects discovered while loading the asset. A 301 is flagged as
// a failure; any other 3xx status is reported as ok.
if (asset.statusCode >= 300 && asset.statusCode < 400) {
  // TODO: Warn about chains of temporary redirects
  const redirectRelation = asset.outgoingRelations.find(
    r => r.type === 'HttpRedirect'
  );
  // find() yields undefined when the asset has no HttpRedirect relation
  // (presumably a 3xx response without a usable redirect target - TODO
  // confirm); there is nothing to describe then, so skip the report instead
  // of throwing on `redirectRelation.to`.
  if (redirectRelation) {
    reportTest({
      ok: asset.statusCode !== 301,
      operator: 'external-redirect',
      name: `external-redirect ${asset.url}`,
      at: loadReport.at,
      // No stray closing brace after the target url
      expected: `302 ${asset.url} --> 200 ${redirectRelation.to.url}`
    });
  }
}
301+
465302 for ( const relation of asset . externalRelations ) {
466303 // Only do work for supported protocols
467304 if ( ! [ 'http:' , 'https:' , 'file:' ] . includes ( relation . to . protocol ) ) {
@@ -547,8 +384,10 @@ async function hyperlink(
547384 }
548385
549386 let follow ;
550-
551- if (
387+ let metadataOnly = asset . _metadataOnly ;
388+ if ( [ 'HttpRedirect' , 'FileRedirect' ] . includes ( relation . type ) ) {
389+ follow = true ;
390+ } else if (
552391 [ 'HtmlPreconnectLink' , 'HtmlDnsPrefetchLink' ] . includes ( relation . type )
553392 ) {
554393 follow = false ;
@@ -568,7 +407,7 @@ async function hyperlink(
568407 follow = true ;
569408 relation . to . stopProcessing = true ;
570409 } else {
571- relation . to . check = true ;
410+ metadataOnly = true ;
572411 }
573412 }
574413 } else if (
@@ -577,19 +416,19 @@ async function hyperlink(
577416 if ( followSourceMaps ) {
578417 follow = true ;
579418 } else {
580- relation . to . check = true ;
419+ metadataOnly = true ;
581420 }
582421 } else if (
583422 [ 'SourceMapFile' , 'SourceMapSource' ] . includes ( relation . type )
584423 ) {
585424 if ( followSourceMaps ) {
586- relation . to . check = true ;
425+ metadataOnly = true ;
587426 }
588427 } else {
589428 follow = true ;
590429 }
591430
592- if ( follow ) {
431+ if ( follow || metadataOnly ) {
593432 if ( assetTypesWithoutRelations . includes ( relation . to . type ) ) {
594433 // If we are handling local file-urls, follow but mark as end-of-line in processing
595434 if (
@@ -599,15 +438,17 @@ async function hyperlink(
599438 relation . to . stopProcessing = ! recursive ;
600439 assetQueue . push ( relation . to ) ;
601440 } else {
602- relation . to . check = true ;
441+ metadataOnly = true ;
603442 }
604443 } else {
605444 assetQueue . push ( relation . to ) ;
606445 }
446+ relation . to . _metadataOnly = metadataOnly ;
447+ assetQueue . push ( relation . to ) ;
607448 }
608449 }
609450
610- if ( asset . type === 'Html' ) {
451+ if ( asset . type === 'Html' && ! asset . _metadataOnly ) {
611452 // Remember the set of ids in the document before unloading so incoming fragments can be checked:
612453 asset . ids = new Set ( ) ;
613454 for ( const element of Array . from (
@@ -680,28 +521,6 @@ async function hyperlink(
680521 }
681522 }
682523
683- // Check urls
684- const assetsToCheck = ag
685- . findAssets ( { check : true } )
686- . filter ( asset => ! processedAssets . has ( asset ) ) ;
687- t . push ( {
688- name : `Crawling ${ assetsToCheck . length } outgoing urls`
689- } ) ;
690-
691- await new Promise ( ( resolve , reject ) =>
692- async . parallelLimit (
693- assetsToCheck . map ( asset => httpStatus ( asset ) ) ,
694- 20 ,
695- err => {
696- if ( err ) {
697- reject ( err ) ;
698- } else {
699- resolve ( ) ;
700- }
701- }
702- )
703- ) ;
704-
705524 // Check Content-Type vs. incoming relation targetTypes:
706525
707526 for ( const asset of ag . findAssets ( { expectedTypes : { $exists : true } } ) ) {
0 commit comments