Skip to content

Commit b774137

Browse files
committed
[Download Datasheet][Added] Check for curl output
Curl is robust, but some sites detect it, so we must remove HTML or empty files and retry with requests.
1 parent 319051e commit b774137

1 file changed

Lines changed: 30 additions & 1 deletion

File tree

kibot/out_download_datasheets.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,35 @@ def do_warning(self, msg, ds, c):
7676
logger.warning(W_FAILDL+'{} during download of `{}` [{}]'.format(msg, ds, c.ref))
7777
return None
7878

79+
def validate_downloaded(self, file):
    """Check that a download command really produced a usable file.

    A download tool can exit successfully and still leave nothing, an
    empty file, or a web page saved under a ``.pdf`` name. Unusable
    files are removed so the caller can fall back to another method.

    :param file: Path where the downloaded file is expected.
    :return: ``True`` when the file exists, is not empty and, if its
        extension is ``.pdf``, starts with the ``%PDF`` signature;
        ``False`` otherwise.
    """
    if not os.path.isfile(file):
        # Not there
        logger.debug("- Download command succeed, but no downloaded file")
        return False
    if os.path.getsize(file) == 0:
        # Empty
        os.remove(file)
        logger.debug("- Empty file downloaded, removing it")
        return False
    if os.path.splitext(file)[1].lower() != ".pdf":
        # There, not empty and not a PDF: nothing else we can check
        return True
    # Test for PDF and not a web page.
    # The file must be read in binary mode: a PDF is not valid text, and
    # the signature below is a bytes literal, which never compares equal
    # to a str read in text mode.
    try:
        with open(file, 'rb') as f:
            header = f.read(4)
    except OSError:
        # Failed to read it
        logger.debug("- Downloaded file can't be read")
        return False
    if header != b'%PDF':
        # Not a PDF, remove it
        os.remove(file)
        logger.debug("- Downloaded file isn't a PDF")
        return False
    return True
107+
79108
def download(self, c, ds, dir, name, known):
80109
if self.classify:
81110
subdir = self.classify_extra.get(c.ref_prefix, SUBDIRS.get(c.ref_prefix, 'Miscellaneous'))
@@ -100,7 +129,7 @@ def download(self, c, ds, dir, name, known):
100129
cmd = [self._curl_command, '-o', dest, ds]
101130
try:
102131
run_command(cmd, just_raise=True)
103-
downloaded = True
132+
downloaded = self.validate_downloaded(dest)
104133
except CalledProcessError as e:
105134
logger.warning(W_FAILDL+f'Failed to download {ds} using {self.get_command} ({e})')
106135
if not downloaded:

0 commit comments

Comments
 (0)