@@ -76,6 +76,35 @@ def do_warning(self, msg, ds, c):
7676 logger .warning (W_FAILDL + '{} during download of `{}` [{}]' .format (msg , ds , c .ref ))
7777 return None
7878
def validate_downloaded(self, file):
    """Check that a freshly downloaded file is usable.

    The file is rejected when it is missing, when it is empty, or when its
    name ends in ``.pdf`` (case-insensitive) but its content does not start
    with the ``%PDF`` signature. Empty files and fake PDFs are deleted from
    disk; a file that merely cannot be read is left in place.

    Args:
        file: Path of the downloaded file.

    Returns:
        True when the file exists, is not empty and, for ``.pdf`` names,
        starts with the PDF signature. False otherwise.
    """
    if not os.path.isfile(file):
        # Not there
        logger.debug("- Download command succeed, but no downloaded file")
        return False
    if os.path.getsize(file) == 0:
        # Empty
        os.remove(file)
        logger.debug("- Empty file downloaded, removing it")
        return False
    if os.path.splitext(file)[1].lower() != ".pdf":
        # There, not empty and not a PDF
        return True
    # Test for PDF and not a web page.
    # The file must be opened in binary mode: the signature is compared
    # against bytes, and text mode would yield a str (never equal to bytes)
    # or fail while decoding the binary payload.
    try:
        with open(file, 'rb') as f:
            header = f.read(4)
    except OSError:
        # Failed to read it
        logger.debug("- Downloaded file can't be read")
        return False
    if header != b'%PDF':
        # Not a PDF, remove it
        os.remove(file)
        logger.debug("- Downloaded file isn't a PDF")
        return False
    return True
107+
79108 def download (self , c , ds , dir , name , known ):
80109 if self .classify :
81110 subdir = self .classify_extra .get (c .ref_prefix , SUBDIRS .get (c .ref_prefix , 'Miscellaneous' ))
@@ -100,7 +129,7 @@ def download(self, c, ds, dir, name, known):
100129 cmd = [self ._curl_command , '-o' , dest , ds ]
101130 try :
102131 run_command (cmd , just_raise = True )
103- downloaded = True
132+ downloaded = self . validate_downloaded ( dest )
104133 except CalledProcessError as e :
105134 logger .warning (W_FAILDL + f'Failed to download { ds } using { self .get_command } ({ e } )' )
106135 if not downloaded :
0 commit comments