Skip to content

Commit ee824f3

Browse files
committed
feat: update index_download_advice to recommend cc-downloader and check local files
1 parent 1548807 commit ee824f3

1 file changed

Lines changed: 16 additions & 5 deletions

File tree

duck.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from pathlib import Path
22

33
import time
4-
import glob
54
import json
65
import os.path
76
import sys
@@ -12,11 +11,13 @@
1211
import duckdb
1312

1413

15-
def index_download_advice(prefix, crawl):
14+
def index_download_advice(local_prefix, crawl):
1615
print('Do you need to download this index?')
17-
print(f' mkdir -p {prefix}/commmoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/')
18-
print(f' cd {prefix}/commmoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/')
19-
print(f' aws s3 sync s3://commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/ .')
16+
print('The recommended way is to use cc-downloader https://github.com/commoncrawl/cc-downloader')
17+
print('If you have cargo and Rust already installed: `cargo install cc-downloader` '
18+
'(alternatively, the binaries are available on the GitHub repository) , and then, ')
19+
print(f' ~/.cargo/bin/cc-downloader download-paths {crawl} cc-index-table {local_prefix}')
20+
print(f' ~/.cargo/bin/cc-downloader download {local_prefix}/cc-index-table.paths.gz --progress {local_prefix}')
2021

2122

2223
def print_row_as_cdxj(row):
@@ -57,8 +58,18 @@ def get_files(algo, crawl, local_prefix=None):
5758
raise NotImplementedError('will cause a 403')
5859
elif algo == 'local_files':
5960
files = [str(f) for f in Path(os.path.expanduser(f'{local_prefix}')).rglob('*.parquet')]
61+
# Check whether the local files have already been downloaded.
62+
# We expect 300 files of about a gigabyte each
63+
if len(files) < 250:
64+
index_download_advice(local_prefix, crawl)
65+
exit(1)
6066
elif algo == 'ccf_local_files':
6167
files = [str(f) for f in Path(f'/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc').rglob('*.parquet')]
68+
# Check whether the local files have already been downloaded
69+
# We expect 300 files of about a gigabyte each
70+
if len(files) < 250:
71+
index_download_advice('/home/cc-pds/', crawl)
72+
exit(1)
6273
elif algo == 'cloudfront_glob':
6374
# duckdb can't glob this, same reason as s3_glob above
6475
files = f'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet'

0 commit comments

Comments
 (0)