11const AssetGraph = require ( 'assetgraph' ) ;
22const async = require ( 'async' ) ;
3- const request = require ( 'request' ) ;
43const version = require ( '../package.json' ) . version ;
54const relationDebugDescription = require ( './relationDebugDescription' ) ;
65const prettyBytes = require ( 'pretty-bytes' ) ;
@@ -140,171 +139,6 @@ async function hyperlink({
140139 } ;
141140 }
142141
// httpStatus(asset, attempt = 1)
// Builds a callback-style task that probes asset.url over HTTP and publishes the outcome
// through reportTest().
//   asset   - object read here for url, _incoming, hostname, type, expectedTypes, contentType
//             and urlOrDescription; contentType may be written and _tryUpgrade() may be called.
//   attempt - 1-based try counter: attempt 1 sends HEAD, every later attempt sends GET.
// Returns a function taking one `callback`. The callback is invoked without arguments once the
// check has been reported or skipped; failures are reported via reportTest(), never passed on.
// NOTE(review): every line of this function carries a `-` diff marker, i.e. this hunk removes it.
143- function httpStatus ( asset , attempt = 1 ) {
144- const url = asset . url ;
// The incoming relations are only used to fill the `at` field of the reports below.
145- const relations = asset . _incoming ;
146-
// Base report shared by every external-check result emitted for this url.
147- const loadReport = {
148- operator : 'external-check' ,
149- name : `external-check ${ url } ` ,
// De-duplicate the debug descriptions with a Set, then list them one per line.
150- at : [ ...new Set ( relations . map ( r => r . debugDescription ) ) ] . join ( '\n ' ) ,
151- expected : `200 ${ url } `
152- } ;
153-
154- return callback => {
// shouldSkip is defined outside this block; presumably it filters reports the user opted out of - TODO confirm.
155- if ( shouldSkip ( loadReport ) ) {
// Hand the callback to a timer so it fires later rather than synchronously.
156- return setTimeout ( callback ) ;
157- }
158-
159- request ( {
// First attempt uses HEAD; any retry switches to GET.
160- method : attempt === 1 ? 'head' : 'get' ,
161- url : asset . url ,
162- strictSSL : true ,
163- gzip : true ,
164- headers : {
165- 'User-Agent' : hyperlinkUserAgent ,
166- 'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' ,
167- 'Accept-Encoding' : 'gzip, deflate, sdch, br'
168- }
169- } , ( error , res ) => {
// Transport-level failure: report error.code (or a generic text) as the actual result.
170- if ( error ) {
171- const code = error . code ;
172- let actual = code || 'Unknown error' ;
173-
174- switch ( code ) {
175- case 'ENOTFOUND' :
176- actual = `DNS missing: ${ asset . hostname } ` ;
177- break ;
// Presumably an HTTP parser error triggered by the HEAD response - TODO confirm.
// On the first attempt the check is retried (with GET); afterwards the code is reported as is.
178- case 'HPE_INVALID_CONSTANT' :
179- if ( attempt === 1 ) {
180- return httpStatus ( asset , attempt + 1 ) ( callback ) ;
181- }
182- break ;
183- }
184-
185- reportTest ( {
186- ...loadReport ,
187- ok : false ,
188- actual
189- } ) ;
190-
191- return callback ( ) ;
192- }
193-
194- const status = res . statusCode ;
195-
// For 2xx responses, inspect the Content-Type header before the status itself is judged.
196- if ( status >= 200 && status < 300 ) {
197- const contentType = res . headers [ 'content-type' ] ;
// Note: a response with a header but a falsy asset.type takes neither branch below.
198- if ( contentType && asset . type ) {
// Capture the leading type/subtype token of the header, up to whitespace, ';' or end of string.
199- const matchContentType = contentType . match (
200- / ^ \s * ( [ \w \- + . ] + \/ [ \w - + . ] + ) (?: \s | ; | $ ) / i
201- ) ;
// Only assets that carry expectedTypes get the lowercased media type stored on them.
// _tryUpgrade is an asset method defined elsewhere; its effect is not visible in this block.
202- if ( matchContentType && asset . expectedTypes ) {
203- asset . contentType = matchContentType [ 1 ] . toLowerCase ( ) ;
204- asset . _tryUpgrade ( ) ;
205- }
206- } else if ( ! contentType ) {
// Missing header: emit a separate content-type-missing failure unless it is skipped.
207- const contentTypeMisingReport = {
208- ok : false ,
209- name : `content-type-missing ${ asset . urlOrDescription } ` ,
210- operator : 'content-type-missing' ,
211- expected : asset . contentType || `A Content-Type compatible with ${ asset . type } ` ,
// contentType is always falsy in this branch, so `actual` carries the absent header value.
212- actual : contentType ,
213- at : [ ...new Set ( relations . map ( r => r . debugDescription ) ) ] . join ( '\n ' ) ,
214- } ;
215-
216- if ( ! shouldSkip ( contentTypeMisingReport ) ) {
217- reportTest ( contentTypeMisingReport ) ;
218- } ;
219- }
220- }
221-
222- // Some servers respond weirdly to HEAD requests. Make a second attempt with GET
// Applies to any 4xx or 5xx status seen on the first attempt.
223- if ( attempt === 1 && status >= 400 && status < 600 ) {
224- return httpStatus ( asset , attempt + 1 ) ( callback ) ;
225- }
226-
227- // Some servers (jspm.io) respond with 502 if requesting HEAD, then GET too close in succession. Give the server a second to cool down
228- if ( attempt === 2 && status === 502 ) {
229- setTimeout (
230- ( ) => httpStatus ( asset , attempt + 1 ) ( callback ) ,
231- 1000
232- ) ;
// Return without calling callback: the delayed third attempt now owns it.
233- return ;
234- }
235-
// NOTE(review): reads an underscore-prefixed field of the request object; presumably the list
// of redirect hops that were followed - TODO confirm against the request library.
236- const redirects = res . request . _redirect . redirects ;
237- if ( redirects . length > 0 ) {
// Build the hop log: the original url first, then each redirect entry. Every entry's
// statusCode is overwritten with the next entry's statusCode and the last one is set to 200.
// The redirect entry objects are mutated in place by this map.
238- const log = [ { redirectUri : url } , ...redirects ] . map ( ( item , idx , arr ) => {
239- if ( arr [ idx + 1 ] ) {
240- item . statusCode = arr [ idx + 1 ] . statusCode ;
241- } else {
242- item . statusCode = 200 ;
243- }
244-
245- return item ;
246- } ) ;
247-
248- const redirectReport = {
249- operator : 'external-redirect' ,
250- name : `external-redirect ${ url } ` ,
251- at : [ ...new Set ( relations . map ( r => r . debugDescription ) ) ] . join ( '\n ' ) ,
252- expected : `302 ${ url } --> 200 ${ log [ log . length - 1 ] . redirectUri } `
253- } ;
254-
// Human-readable chain of "status uri" entries joined by arrows.
255- const actual = log . map (
256- redirect => `${ redirect . statusCode } ${ redirect . redirectUri } `
257- ) . join ( ' --> ' ) ;
258-
259- if ( ! shouldSkip ( redirectReport ) ) {
260- // A single temporary redirect is allowed
261- if ( [ 302 , 307 ] . includes ( log [ 0 ] . statusCode ) ) {
// Fewer than 3 log entries means the original url plus at most one redirect target.
262- if ( log . length < 3 ) {
263- reportTest ( {
264- ...redirectReport ,
265- expected : actual ,
266- actual,
267- ok : true
268- } ) ;
269- } else {
// Longer chain that starts with a temporary redirect: reported as a failure.
270- reportTest ( {
271- ...redirectReport ,
272- expected : `${ log [ 0 ] . statusCode } ${ url } --> 200 ${ log [ log . length - 1 ] . redirectUri } ` ,
273- actual,
274- ok : false
275- } ) ;
276- }
277- } else {
// First hop is neither 302 nor 307: reported as a failure.
278- reportTest ( {
279- ...redirectReport ,
280- actual,
281- ok : false
282- } ) ;
283- }
284- }
285- }
286-
// Only an exact 200 passes the external-check; every other status falls through to the failure report.
287- if ( status === 200 ) {
288- reportTest ( {
289- ...loadReport ,
290- ok : true ,
291- actual : loadReport . expected
292- } ) ;
293-
294- return callback ( ) ;
295- }
296-
297- reportTest ( {
298- ...loadReport ,
299- actual : `${ status } ${ url } ` ,
300- ok : false
301- } ) ;
302-
303- return callback ( ) ;
304- } ) ;
305- } ;
306- }
307-
308142 if ( verbose ) {
309143 ag . on ( 'addRelation' , relation => {
310144 console . error ( 'addRelation' , relation . toString ( ) ) ;
@@ -394,9 +228,10 @@ async function hyperlink({
394228 async function processAsset ( asset ) {
395229 if ( ! processedAssets . has ( asset ) ) {
396230 processedAssets . add ( asset ) ;
231+ const operator = asset . _metadataOnly ? 'external-check' : 'load' ;
397232 const loadReport = {
398- operator : 'load' ,
399- name : `load ${ asset . urlOrDescription } ` ,
233+ operator,
234+ name : `${ operator } ${ asset . urlOrDescription } ` ,
400235 expected : `200 ${ asset . urlOrDescription } `
401236 } ;
402237
@@ -411,7 +246,8 @@ async function hyperlink({
411246 }
412247
413248 try {
414- await asset . load ( ) ;
249+ // FIXME: Make sure we do a full load if an asset is added to the queue again in non-metadataOnly mode
250+ await asset . load ( { metadataOnly : asset . _metadataOnly } ) ;
415251
416252 reportTest ( {
417253 ...loadReport ,
@@ -432,6 +268,18 @@ async function hyperlink({
432268 return ;
433269 }
434270
271+ if ( asset . statusCode >= 300 && asset . statusCode < 400 ) {
272+ // TODO: Warn about chains of temporary redirects
273+ const redirectRelation = asset . outgoingRelations . find ( r => r . type === 'HttpRedirect' ) ;
274+ reportTest ( {
275+ ok : asset . statusCode !== 301 ,
276+ operator : 'external-redirect' ,
277+ name : `external-redirect ${ asset . url } ` ,
278+ at : loadReport . at ,
279+ expected : `302 ${ asset . url } --> 200 ${ redirectRelation . to . url } }`
280+ } ) ;
281+ }
282+
435283 for ( const relation of asset . externalRelations ) {
436284 // Only do work for supported protocols
437285 if ( ! [ 'http:' , 'https:' , 'file:' ] . includes ( relation . to . protocol ) ) {
@@ -463,7 +311,6 @@ async function hyperlink({
463311 } ) ;
464312 }
465313 }
466-
467314 } else if ( relation . to . type === 'Html' ) {
468315 ( relation . to . incomingFragments = relation . to . incomingFragments || [ ] ) . push ( {
469316 fragment,
@@ -509,8 +356,10 @@ async function hyperlink({
509356 }
510357
511358 let follow ;
512-
513- if ( [ 'HtmlPreconnectLink' , 'HtmlDnsPrefetchLink' ] . includes ( relation . type ) ) {
359+ let metadataOnly = asset . _metadataOnly ;
360+ if ( [ 'HttpRedirect' , 'FileRedirect' ] . includes ( relation . type ) ) {
361+ follow = true ;
362+ } else if ( [ 'HtmlPreconnectLink' , 'HtmlDnsPrefetchLink' ] . includes ( relation . type ) ) {
514363 follow = false ;
515364 relation . to [ 'check' + relation . type ] = true ;
516365 } else if ( [ 'HtmlAnchor' , 'SvgAnchor' , 'HtmlIFrame' ] . includes ( relation . type ) ) {
@@ -522,39 +371,41 @@ async function hyperlink({
522371 follow = true ;
523372 relation . to . stopProcessing = true ;
524373 } else {
525- relation . to . check = true ;
374+ metadataOnly = true ;
526375 }
527376 }
528377 } else if ( / ^ (?: J a v a S c r i p t | C s s ) S o u r c e (?: M a p p i n g ) U r l $ / . test ( relation . type ) ) {
529378 if ( followSourceMaps ) {
530379 follow = true ;
531380 } else {
532- relation . to . check = true ;
381+ metadataOnly = true ;
533382 }
534383 } else if ( [ 'SourceMapFile' , 'SourceMapSource' ] . includes ( relation . type ) ) {
535384 if ( followSourceMaps ) {
536- relation . to . check = true ;
385+ metadataOnly = true ;
537386 }
538387 } else {
539388 follow = true ;
540389 }
541390
542- if ( follow ) {
391+ if ( follow || metadataOnly ) {
543392 if ( assetTypesWithoutRelations . includes ( relation . to . type ) ) {
544393 // If we are handling local file-urls, follow but mark as end-of-line in processing
545394 if ( relation . from . protocol === 'file:' && relation . to . protocol === 'file:' ) {
546395 relation . to . stopProcessing = ! recursive ;
547396 assetQueue . push ( relation . to ) ;
548397 } else {
549- relation . to . check = true ;
398+ metadataOnly = true ;
550399 }
551400 } else {
552401 assetQueue . push ( relation . to ) ;
553402 }
403+ relation . to . _metadataOnly = metadataOnly ;
404+ assetQueue . push ( relation . to ) ;
554405 }
555406 }
556407
557- if ( asset . type === 'Html' ) {
408+ if ( asset . type === 'Html' && ! asset . _metadataOnly ) {
558409 // Remember the set of ids in the document before unloading so incoming fragments can be checked:
559410 asset . ids = new Set ( ) ;
560411 for ( const element of Array . from ( asset . parseTree . querySelectorAll ( '[id]' ) ) ) {
@@ -622,24 +473,6 @@ async function hyperlink({
622473 }
623474 }
624475
625- // Check urls
626- const assetsToCheck = ag . findAssets ( { check : true } ) . filter ( asset => ! processedAssets . has ( asset ) ) ;
627- t . push ( {
628- name : `Crawling ${ assetsToCheck . length } outgoing urls`
629- } ) ;
630-
631- await new Promise ( ( resolve , reject ) => async . parallelLimit (
632- assetsToCheck . map ( asset => httpStatus ( asset ) ) ,
633- 20 ,
634- err => {
635- if ( err ) {
636- reject ( err ) ;
637- } else {
638- resolve ( ) ;
639- }
640- }
641- ) ) ;
642-
643476 // Check Content-Type vs. incoming relation targetTypes:
644477
645478 for ( const asset of ag . findAssets ( { expectedTypes : { $exists : true } } ) ) {
0 commit comments