Parallel async streaming

Pamparampam · Pamparampam · commit f8c3b4d4f01d · 2025-08-13T13:15:15.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,6 @@
 /out/
 /files/
 /dist/
+/.test-commands
+/tester2.py
+/.xtest/
diff --git a/README.md b/README.md
@@ -167,6 +167,17 @@ If resume ZipFly instance has different files than pause ZipFly instance there w
 > [!CAUTION]
 > You mustn't reuse `GenFile` instances. 
 
+
+## Parallel async streaming
+
+If your `GenFile`s rely on network requests to fetch data, per-request latency can keep throughput
+well below the available bandwidth. To address this, use `async_stream_parallel`, which prefetches several files concurrently.
+
+```python
+zipFly = ZipFly(files)
+zipFly.async_stream_parallel(prefetch_files=20, max_chunks_per_file=2)
+```
+
 ### Other
 Python is not optimized for async I/O operations, thus to speed up the async streaming the chunk_size is changed to 4MB, you can override this by passing `chunksize` as argument to LocalFile.
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "zipFly64"
-version = "1.2.3"
+version = "1.3.1"
 description = "Stream zip64 archives on the fly."
 readme = "README.md"
 authors = [{ name = "Pamparampampam" }]
diff --git a/src/zipFly/BaseFile.py b/src/zipFly/BaseFile.py
@@ -1,18 +1,19 @@
 import time
 from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Generator
+from typing import Optional
 
 from . import consts
 from .Compressor import Compressor
 
 
 class BaseFile(ABC):
-    def __init__(self, name: str, compression_method: int):
+    def __init__(self, name: str, compression_method: int = consts.NO_COMPRESSION):
         self.__used = False
         self.__compressed_size = 0
         self.__offset = 0  # Offset to local file header
         self.__crc = 0
-        self.__compression_method = compression_method or consts.NO_COMPRESSION
+        self.__compression_method = compression_method
         self.__flags = 0b00001000  # flag about using data descriptor is always on
         self.__byte_offset_mode = False
         if name == "":
diff --git a/src/zipFly/Compressor.py b/src/zipFly/Compressor.py
@@ -1,19 +1,20 @@
 import zlib
 
+from . import consts
+
 
 class Compressor:
     def __init__(self, file: 'BaseFile'):
         self.file = file
-
-        if file.compression_method == 0:
+        if file.compression_method == consts.NO_COMPRESSION:  # no compression
             self.process = self._process_through
             self.tail = self._no_tail
-        elif file.compression_method == 8:  # deflate compression
+        elif file.compression_method == consts.COMPRESSION_DEFLATE:  # deflate compression
             self.compr = zlib.compressobj(5, zlib.DEFLATED, -15)
             self.process = self._process_deflate
             self.tail = self._tail_deflate
         else:
-            raise KeyError("Unknown compression method in compressor")
+            raise KeyError(f"Unknown compression method in compressor: {file.compression_method}")
 
     # no compression
     def _process_through(self, chunk):
diff --git a/src/zipFly/FilePrefetcher.py b/src/zipFly/FilePrefetcher.py
@@ -0,0 +1,75 @@
+import asyncio
+from typing import List, Optional, AsyncGenerator
+
+class FilePrefetcher:
+    """Encapsulates a list of files, a sliding prefetch window, and streaming queues."""
+
+    def __init__(self, files: List, prefetch_files: int = 20, queue_maxsize: int = 2):
+        self.files = files
+        self.prefetch_files = prefetch_files
+        self.queue_maxsize = queue_maxsize
+
+        self.n = len(files)
+        self.prefetchers: List[Optional[_SingleFilePrefetch]] = [None] * self.n
+
+        self.inflight = 0
+        self.next_to_start = 0
+
+    async def _start_prefetch(self, idx: int):
+        pf = _SingleFilePrefetch(self.files[idx], self.queue_maxsize)
+        self.prefetchers[idx] = pf
+        await pf.start()
+        self.inflight += 1
+        self.next_to_start = max(self.next_to_start, idx + 1)
+
+    async def ensure_prefetch(self, idx: int):
+        """Ensure the prefetcher for file `idx` is started, refill window as needed."""
+        if self.prefetchers[idx] is None:
+            await self._start_prefetch(idx)
+
+        # Refill window
+        while self.next_to_start < self.n and self.inflight < self.prefetch_files:
+            await self._start_prefetch(self.next_to_start)
+
+    async def stream_file_data(self, idx: int) -> AsyncGenerator[bytes, None]:
+        """Yields chunks of a single file in order, managing prefetch completion."""
+        pf = self.prefetchers[idx]
+        while True:
+            chunk = await pf.queue.get()
+            if chunk is None:
+                if pf.task:
+                    await pf.task
+                    pf.task = None
+                self.inflight -= 1
+
+                # keep window full
+                while self.next_to_start < self.n and self.inflight < self.prefetch_files:
+                    await self._start_prefetch(self.next_to_start)
+                break
+            yield chunk
+
+
+class _SingleFilePrefetch:
+    """Handles a single file's async queue and task."""
+
+    def __init__(self, file, queue_maxsize: int = 2):
+        self.file = file
+        self.queue = asyncio.Queue(maxsize=queue_maxsize)
+        self.task: Optional[asyncio.Task] = None
+
+    async def start(self):
+        self.task = asyncio.create_task(self._prefetch())
+
+    async def _prefetch(self):
+        agen = self.file.async_generate_processed_file_data()
+        try:
+            async for chunk in agen:
+                await self.queue.put(chunk)
+        except (GeneratorExit, asyncio.CancelledError):
+            await agen.aclose()
+            raise
+        except Exception:
+            await self.queue.put(None)
+            raise
+        else:
+            await self.queue.put(None)
diff --git a/src/zipFly/ZipFly.py b/src/zipFly/ZipFly.py
@@ -1,11 +1,14 @@
-from typing import Generator, AsyncGenerator, Union
+import asyncio
+from typing import Generator, AsyncGenerator, Union, Optional
 
 from . import consts
 from .BaseFile import BaseFile
+from .FilePrefetcher import FilePrefetcher
 from .ZipBase import ZipBase
 import copy
 import types
 
+
 def process_file_names(files) -> list[BaseFile]:
     """Renames duplicated file names"""
     seen_names = set()
@@ -185,7 +188,7 @@ def _make_end_structures(self) -> Generator[bytes, None, None]:
         yield self._apply_remaining_offset(self._make_end_of_cdir_record())
 
     async def _async_stream_single_file(self, file: BaseFile) -> AsyncGenerator[bytes, None]:
-        """This function streams a single file, it also applies running_offset is needed"""
+        """This function streams a single file, it also applies remaining_offset if needed"""
 
         yield self._apply_remaining_offset(self._make_local_file_header(file))
 
@@ -225,6 +228,51 @@ async def async_stream(self) -> AsyncGenerator[bytes, None]:
 
         # self._cleanup()
 
+    async def async_stream_parallel(self, prefetch_files: int = 20, max_chunks_per_file: int = 2):
+        """
+        Stream files in parallel.
+        - prefetch_files: number of files' DATA to read ahead concurrently
+        - max_chunks_per_file: per-file buffered DATA chunks (backpressure)
+        """
+        self._check_if_can_stream()
+        start_idx, remaining_offset = self._find_starting_file()
+        self._remaining_offset = remaining_offset
+        self._set_offset(self._byte_offset - remaining_offset)
+
+        if start_idx is not None:
+
+            files = self._files[start_idx:]
+            prefetch_mgr = FilePrefetcher(files, prefetch_files, max_chunks_per_file)
+
+            for i, file in enumerate(files):
+                await prefetch_mgr.ensure_prefetch(i)
+
+                # 1) Local File Header
+                file.set_offset(self._get_offset())
+                header = self._make_local_file_header(file)
+                header = self._apply_remaining_offset(header)
+                self._add_offset(len(header))
+                if header:
+                    yield header
+
+                # 2) Stream DATA
+                async for chunk in prefetch_mgr.stream_file_data(i):
+                    out = self._apply_remaining_offset(chunk)
+                    if out:
+                        self._add_offset(len(out))
+                        yield out
+
+                # 3) Data Descriptor
+                dd = self._make_data_descriptor(file)
+                dd = self._apply_remaining_offset(dd)
+                self._add_offset(len(dd))
+                if dd:
+                    yield dd
+
+        # 4) Central directory & end records
+        for chunk in self._make_end_structures():
+            yield chunk
+
     def stream(self) -> Generator[bytes, None, None]:
         self._check_if_can_stream()
 
@@ -243,8 +291,6 @@ def stream(self) -> Generator[bytes, None, None]:
         for chunk in self._make_end_structures():
             yield chunk
 
-        # self._cleanup()
-
     def _check_if_can_stream(self):
         if self.__used:
             raise RuntimeError("Do not re-use zipFly instances. Recreate it.")
@@ -266,10 +312,3 @@ def _apply_remaining_offset(self, data):
         self._add_offset(self._remaining_offset)
         self._remaining_offset = 0  # Offset is fully applied
         return result
-
-    # def _cleanup(self):
-    #     pass
-    #     """Clean all data after streaming"""
-    #     super()._cleanup()
-    #     self._remaining_offset = 0
-    #     # self.__used = False
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -55,3 +55,9 @@ async def sized_zeros_generator_async(size: int) -> AsyncGenerator[bytes]:
     """Yield zeros up to a certain size."""
     for zeros in sized_zeros_generator(size):
         yield zeros
+
+def generate_data_async(data: bytes, repeat: int):
+    async def generator():
+        for _ in range(repeat):
+            yield data
+    return generator
diff --git a/tests/test_zipfly.py b/tests/test_zipfly.py
@@ -12,9 +12,46 @@
 
 from src.zipFly import GenFile, LocalFile, ZipFly, consts
 from src.zipFly.EmptyFolder import EmptyFolder
-from tests.test_utils import lorem_ipsum_generator, lorem_ipsum, single_archive_size, lorem_ipsum_generator_async, multifile_archive_size
+from tests.test_utils import lorem_ipsum_generator, lorem_ipsum, single_archive_size, lorem_ipsum_generator_async, multifile_archive_size, generate_data_async
 
 
+@pytest.mark.asyncio
+async def test_GenFile_multiple_files_async(tmp_path):
+    """Test async ZIP with multiple small files (~10KB each)."""
+    n = 10
+    chunk = b"x" * 1024  # 1KB
+    chunks_per_file = 10
+    expected_content = chunk * chunks_per_file
+
+    files = []
+    for i in range(n):
+        generator_func = generate_data_async(chunk, chunks_per_file)
+        file = GenFile(
+            name=f"file_{i}.txt",
+            generator=generator_func(),
+            modification_time=time.time(),
+            compression_method=consts.COMPRESSION_DEFLATE,
+        )
+        files.append(file)
+
+    zip_fly = ZipFly(files)
+    zip_path = tmp_path / "multi_file_async.zip"
+
+    with zip_path.open("wb") as fp:
+        async for zip_chunk in zip_fly.async_stream():
+            fp.write(zip_chunk)
+
+    with zipfile.ZipFile(zip_path) as zfp:
+        namelist = zfp.namelist()
+        assert len(namelist) == n
+
+        for i in range(n):
+            fname = f"file_{i}.txt"
+            assert fname in namelist
+            with zfp.open(fname) as tfp:
+                data = tfp.read()
+                assert data == expected_content
+
 def test_GenFile_COMPRESSION_DEFLATE(tmp_path):
     """Test GenFile with compression."""
     file1 = GenFile(