|
1 | 1 | from pathlib import Path |
2 | 2 |
|
3 | 3 | import time |
4 | | -import glob |
5 | 4 | import json |
6 | 5 | import os.path |
7 | 6 | import sys |
|
12 | 11 | import duckdb |
13 | 12 |
|
14 | 13 |
|
15 | | -def index_download_advice(prefix, crawl): |
| 14 | +def index_download_advice(local_prefix, crawl): |
16 | 15 | print('Do you need to download this index?') |
17 | | - print(f' mkdir -p {prefix}/commmoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/') |
18 | | - print(f' cd {prefix}/commmoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/') |
19 | | - print(f' aws s3 sync s3://commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/ .') |
| 16 | + print('The recommended way is to use cc-downloader https://github.com/commoncrawl/cc-downloader') |
| 17 | + print('If you have cargo and Rust already installed: `cargo install cc-downloader` ' |
| 18 | + '(alternatively, the binaries are available on the GitHub repository) , and then, ') |
| 19 | + print(f' ~/.cargo/bin/cc-downloader download-paths {crawl} cc-index-table {local_prefix}') |
| 20 | + print(f' ~/.cargo/bin/cc-downloader download {local_prefix}/cc-index-table.paths.gz --progress {local_prefix}') |
20 | 21 |
|
21 | 22 |
|
22 | 23 | def print_row_as_cdxj(row): |
@@ -57,8 +58,18 @@ def get_files(algo, crawl, local_prefix=None): |
57 | 58 | raise NotImplementedError('will cause a 403') |
58 | 59 | elif algo == 'local_files': |
59 | 60 | files = [str(f) for f in Path(os.path.expanduser(f'{local_prefix}')).rglob('*.parquet')] |
| 61 | + # Check whether the local files have already been downloaded. |
| 62 | + # We expect 300 files of about a gigabyte each |
| 63 | + if len(files) < 250: |
| 64 | + index_download_advice(local_prefix, crawl) |
| 65 | + exit(1) |
60 | 66 | elif algo == 'ccf_local_files': |
61 | 67 | files = [str(f) for f in Path(f'/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc').rglob('*.parquet')] |
| 68 | + # Check whether the local files have already been downloaded |
| 69 | + # We expect 300 files of about a gigabyte each |
| 70 | + if len(files) < 250: |
| 71 | + index_download_advice('/home/cc-pds/', crawl) |
| 72 | + exit(1) |
62 | 73 | elif algo == 'cloudfront_glob': |
63 | 74 | # duckdb can't glob this, same reason as s3_glob above |
64 | 75 | files = f'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet' |
|
0 commit comments