|
5 | 5 |
|
6 | 6 | import asyncio |
7 | 7 | import logging |
| 8 | +import os |
8 | 9 | from contextlib import suppress |
9 | 10 | from enum import Enum |
10 | 11 | from io import BytesIO |
|
35 | 36 | THREADS = {} |
36 | 37 | LOGGER = logging.getLogger('ccb.task_fetcher') |
37 | 38 | FILES_INDEXING_BATCH_SIZE = 64 # todo: config? |
| 39 | +# number of chunks each batch is split into; must not exceed FILES_INDEXING_BATCH_SIZE, otherwise the computed chunk size is 0 |
| 40 | +PARALLEL_FILE_PARSING = max(1, (os.cpu_count() or 2) - 1) # todo: config? |
38 | 41 | # max concurrent fetches to avoid overloading the NC server or hitting rate limits |
39 | 42 | CONCURRENT_FILE_FETCHES = 10 # todo: config? |
40 | 43 | MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? |
@@ -217,8 +220,18 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro |
217 | 220 | else: |
218 | 221 | source_errors[file_id] = result |
219 | 222 |
|
220 | | - files_result = _load_sources(source_files) |
221 | | - providers_result = _load_sources(q_items.content_providers) |
| 223 | + files_result = {} |
| 224 | + providers_result = {} |
| 225 | + chunk_size = FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING |
| 226 | + |
| 227 | + # split the batch into smaller chunks for better file operation parallelism |
| 228 | + for i in range(0, len(source_files), chunk_size): |
| 229 | + chunk = dict(list(source_files.items())[i:i+chunk_size]) |
| 230 | + files_result.update(_load_sources(chunk)) |
| 231 | + |
| 232 | + for i in range(0, len(q_items.content_providers), chunk_size): |
| 233 | + chunk = dict(list(q_items.content_providers.items())[i:i+chunk_size]) |
| 234 | + providers_result.update(_load_sources(chunk)) |
222 | 235 |
|
223 | 236 | if ( |
224 | 237 | any(isinstance(res, IndexingError) for res in files_result.values()) |
|
0 commit comments