Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ version: 2
updates:

# Maintain dependencies for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
- package-ecosystem: github-actions
directory: /
schedule:
interval: weekly

# Maintain dependencies for PIP
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "weekly"
- package-ecosystem: pip
directory: /
schedule:
interval: weekly
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,5 @@ build/**
test/
examples/*_output/
examples/ultra_fast_extraction_demo.py
*.prof
/tests/profiling/prof
4 changes: 1 addition & 3 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,6 @@ confidence=

# Disable docstring errors
disable=
# C0114,
# C0115,
# C0116,
raw-checker-failed,
bad-inline-option,
locally-disabled,
Expand All @@ -77,6 +74,7 @@ disable=
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
useless-return, # mypy expects return/return None


# Enable the message, report, category or checker with the given id(s). You can
Expand Down
16 changes: 6 additions & 10 deletions INTEGRATION_SUMMARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ relic sga unpack archive.sga ./output
- **86x faster** than original
- **3-4 seconds** for 7,815 files

#### **`--compatible`**
#### **`--legacy`**

- Fallback to fs-based extraction
- Use if fast mode has issues
Expand All @@ -42,10 +42,10 @@ relic sga unpack archive.sga ./output

## 📊 Performance Comparison

| Mode | Command | Time | Speed | Use Case |
|------|---------|------|-------|----------|
| **Fast (NEW DEFAULT)** | `relic sga unpack file.sga out/` | **3.5s** | **2,248 files/s** | Production use |
| Compatible (Legacy) | `relic sga unpack file.sga out/ --compatible` | 300s | 26 files/s | Compatibility |
| Mode | Command | Time | Speed | Use Case |
|------|-------------------------------------------|------|-------|----------|
| **Fast (NEW DEFAULT)** | `relic sga unpack file.sga out/` | **3.5s** | **2,248 files/s** | Production use |
| Compatible (Legacy) | `relic sga unpack file.sga out/ --legacy` | 300s | 26 files/s | Compatibility |

**Performance Gain: 86x faster!** 🚀

Expand Down Expand Up @@ -98,11 +98,7 @@ relic sga unpack archive.sga ./output --isolate

```python
# Use the advanced parallel unpacker
from relic.sga.core.parallel_advanced import AdvancedParallelUnpacker

# Standard extraction (still fast)
unpacker = AdvancedParallelUnpacker(num_workers=4)
stats = unpacker.extract_streaming(sga_path, output_dir)
from relic.sga.core.native.parallel_advanced import AdvancedParallelUnpacker

# Fast extraction (86x faster!)
unpacker = AdvancedParallelUnpacker(num_workers=15)
Expand Down
100 changes: 62 additions & 38 deletions src/relic/sga/core/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,24 @@
from fs.base import FS
from fs.copy import copy_fs
from relic.core.cli import CliPluginGroup, _SubParsersAction, CliPlugin, RelicArgParser

from relic.sga.core.definitions import MAGIC_WORD
from relic.sga.core.essencefs import EssenceFS
from relic.sga.core.essencefs.opener import registry as sga_registry
from relic.sga.core.serialization import VersionSerializer
from relic.core.logmsg import BraceMessage
from relic.core.cli import (
get_file_type_validator,
get_dir_type_validator,
get_path_validator,
)
from relic.sga.core.parallel_advanced import AdvancedParallelUnpacker
from relic.core.logmsg import BraceMessage

from relic.sga.core.definitions import MAGIC_WORD
from relic.sga.core.essencefs import EssenceFS
from relic.sga.core.essencefs.opener import registry as sga_registry
from relic.sga.core.native.parallel_advanced import (
AdvancedParallelUnpacker,
UnpackerConfig,
)
from relic.sga.core.serialization import VersionSerializer

_SUCCESS = 0
_FAIL = 1

# Backwards compatibility aliases for v2 package
_get_file_type_validator = get_file_type_validator
Expand Down Expand Up @@ -56,7 +60,9 @@ def _create_parser(
desc = """Unpack an SGA archive to the filesystem.
If only one root is present in the SGA, '--merge' is implied.
If multiple roots are in the SGA '--isolate' is implied.
Manually specify the flags to override this behaviour."""
Manually specify the flags to override this behaviour.
"""

if command_group is None:
parser = RelicArgParser("unpack", description=desc)
else:
Expand Down Expand Up @@ -88,17 +94,16 @@ def _create_parser(
)

# Performance options
parser.add_argument(
"--fast",
help="Use Fast native extraction (default, 80x faster)",
sga_legacy_flags = parser.add_mutually_exclusive_group()
sga_legacy_flags.add_argument(
"--legacy",
help="Use fs-based extraction instead of native/parallelized extraction",
action="store_true",
default=True,
)
parser.add_argument(
"--compatible",
help="Use compatible fs-based extraction (slower, more compatible)",
sga_legacy_flags.add_argument(
"--nolegacy",
help="Do not fallback to legacy extraction if native/parallelized extraction fails",
action="store_true",
default=False,
)
parser.add_argument(
"--workers",
Expand All @@ -114,19 +119,28 @@ def command(self, ns: Namespace, *, logger: Logger) -> Optional[int]:
outdir: str = ns.out_dir
merge: bool = ns.merge
isolate: bool = ns.isolate
use_fast: bool = not ns.compatible # Use fast unless --compatible specified
use_legacy: bool = ns.legacy
should_fallback = not ns.nolegacy
num_workers: Optional[int] = ns.workers

if merge and isolate: # pragma: nocover
# This error should be impossible
raise relic.core.cli.RelicArgParserError(
"Isolate and Merge flags are mutually exclusive"
)
if use_legacy and should_fallback: # pragma: nocover
# This error should be impossible
raise relic.core.cli.RelicArgParserError(
"Legacy and NoLegacy flags are mutually exclusive"
)

logger.info(f"Unpacking `{infile}`")

def use_merge_mode(drive_count: int) -> bool:
    """Decide whether the archive should be unpacked in merge mode.

    An explicit ``merge`` flag always wins. Otherwise merging is implied
    only when ``isolate`` was not requested and ``drive_count`` is exactly 1.

    Args:
        drive_count: Number of drives/roots counted by the caller.

    Returns:
        True when merge mode applies, False otherwise.
    """
    if merge:
        return merge
    single_drive = drive_count == 1
    return not isolate and single_drive

# Use Fast native extraction by default
if use_fast:
if not use_legacy:
try:
import multiprocessing

Expand All @@ -135,7 +149,15 @@ def command(self, ns: Namespace, *, logger: Logger) -> Optional[int]:

logger.info(f"Using Fast native extraction ({num_workers} workers)")
unpacker = AdvancedParallelUnpacker(
num_workers=num_workers, enable_delta=False, logger=logger
UnpackerConfig(
num_workers=num_workers,
logger=logger,
disable_gc=True,
native_files=False,
precache_dirs=True,
verbose=True,
should_merge=use_merge_mode,
)
)

# Progress callback
Expand All @@ -145,27 +167,29 @@ def _progress(current: int, total: int) -> None:
f" Progress: {current}/{total} files ({current*100//total}%)"
)

stats = unpacker.extract_native_ultra_fast(
infile, outdir, on_progress=_progress
)
stats = unpacker.extract(infile, outdir, on_progress=_progress)

logger.info(
f"Extraction complete: {stats.extracted_files} files extracted"
)
if stats.failed_files > 0:
logger.warning(f"Failed: {stats.failed_files} files")
return 1
return _FAIL

return _SUCCESS

except Exception as e:
logger.warning(f"Fast extraction failed: {e}")
logger.info("Falling back to compatible mode...")
use_fast = False
logger.warning("Fast extraction failed:")
logger.exception(e)
if use_legacy:
logger.info("Falling back to legacy mode...")
else:
return _FAIL
use_legacy = should_fallback

# Fallback to compatible fs-based extraction
if not use_fast:
logger.info("Using compatible fs-based extraction")
if use_legacy:
logger.info("Using fs-based (legacy) extraction")

def _callback(_1: FS, srcfile: str, _2: FS, dstfile: str) -> None:
logger.info(f"\t\tUnpacking File `{srcfile}`\n\t\tWrote to `{dstfile}`")
Expand All @@ -174,8 +198,9 @@ def _callback(_1: FS, srcfile: str, _2: FS, dstfile: str) -> None:
sga: EssenceFS
with open_fs(infile, default_protocol="sga") as sga: # type: ignore
roots = list(sga.iterate_fs())

# Explicit and Implicit merge; we reuse sga to avoid reopening the filesystem
if merge or (not isolate and len(roots) == 1):
if use_merge_mode(len(roots)):
copy_fs(
sga, f"osfs://{outdir}", on_copy=_callback, preserve_time=True
)
Expand All @@ -198,9 +223,8 @@ def default(self, o: Any) -> Any:
return dataclasses.asdict(o) # type: ignore
try:
return super().default(o)
except (
TypeError
): # Kinda bad; but we don't want to serialize, we want to logger.info; so i think this is an acceptable tradeoff
# Kinda bad; but we don't want to serialize, we want to logger.info; so i think this is an acceptable tradeoff
except TypeError:
return str(o)


Expand Down Expand Up @@ -263,7 +287,7 @@ def command(self, ns: Namespace, *, logger: Logger) -> Optional[int]:
os.makedirs(outjson_dir, exist_ok=True)
outjson = os.path.join(outjson_dir, outjson_file)

with open(outjson, "w", encoding=None) as info_h:
with open(outjson, "w", encoding="utf8") as info_h:
json_kwargs: Dict[str, Any] = (
self._JSON_MINIFY_KWARGS if minify else self._JSON_MAXIFY_KWARGS
)
Expand Down Expand Up @@ -331,16 +355,17 @@ def command(self, ns: Namespace, *, logger: logging.Logger) -> Optional[int]:
with open(sga_file, "rb") as sga:
if not MAGIC_WORD.check(sga, advance=True):
logger.warning("File is not an SGA")
else:
version = VersionSerializer.read(sga)
logger.info(version)
return _FAIL

version = VersionSerializer.read(sga)
logger.info(version)
return _SUCCESS
except IOError: # pragma: nocover
# I don't know how to force an io error here for coverage testing
# we safely handle bad file paths
# So I believe this only occurs when a genuine fatal error occurs
logger.error("Error reading file")
raise
return None


class RelicSgaListCli(CliPlugin):
Expand All @@ -362,5 +387,4 @@ def command(self, ns: Namespace, *, logger: logging.Logger) -> Optional[int]:
logger.info(key)
if len(keys) == 0:
logger.info("No Plugins Found!")

return None
6 changes: 2 additions & 4 deletions src/relic/sga/core/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
from __future__ import annotations

import os
import sys
from dataclasses import dataclass
from functools import total_ordering
from enum import Enum, IntFlag
from functools import total_ordering
from typing import Any, Tuple, Iterable, Union, List, TypeVar

from relic.core.serialization import MagicWord
Expand All @@ -22,8 +21,7 @@
def _has_get_attr(o: Any, name: str, default: _T) -> _T:
if hasattr(o, name):
return getattr(o, name) # type: ignore
else:
return default
return default


# Safe versions of existing flags
Expand Down
6 changes: 3 additions & 3 deletions src/relic/sga/core/errors.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""Error definitions for the SGA API."""

from typing import List, Optional, Generic, TypeVar

from typing import List, Optional, Generic, TypeVar, Sequence
from relic.core.errors import MismatchError, RelicToolError

from relic.sga.core.definitions import Version

_T = TypeVar("_T")
Expand All @@ -27,7 +27,7 @@ def __init__(
class VersionNotSupportedError(RelicToolError):
"""An unknown version was provided."""

def __init__(self, received: Version, allowed: List[Version]):
def __init__(self, received: Version, allowed: Sequence[Version]):
super().__init__()
self.received = received
self.allowed = allowed
Expand Down
3 changes: 1 addition & 2 deletions src/relic/sga/core/essencefs/opener.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@

import fs.opener
from fs.opener import Opener

from fs.opener.parse import ParseResult
from relic.core.entrytools import EntrypointRegistry
from relic.core.errors import RelicToolError
from relic.core.lazyio import BinaryProxy, get_proxy
from relic.core.entrytools import EntrypointRegistry

from relic.sga.core.definitions import Version, MAGIC_WORD
from relic.sga.core.errors import VersionNotSupportedError
Expand Down
8 changes: 5 additions & 3 deletions src/relic/sga/core/hashtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from relic.core.lazyio import read_chunks
from relic.core.logmsg import BraceMessage

from relic.sga.core.errors import (
HashMismatchError,
Md5MismatchError,
Expand Down Expand Up @@ -103,6 +104,7 @@ def validate(
name if name is not None else self._hasher_name, result, expected
)

_HASH_CHUNK_SIZE = 1024 * 1024 * 16

def _md5(
stream: Hashable,
Expand All @@ -116,7 +118,7 @@ def _md5(
if eigen is not None
else hashlib.md5(usedforsecurity=False)
)
for chunk in read_chunks(stream, start, size):
for chunk in read_chunks(stream, start, size, chunk_size=_HASH_CHUNK_SIZE):
hasher.update(chunk)
return hasher.digest()

Expand All @@ -129,7 +131,7 @@ def _crc32(
eigen: Optional[int] = None,
) -> int:
crc = eigen if eigen is not None else 0
for chunk in read_chunks(stream, start, size):
for chunk in read_chunks(stream, start, size, chunk_size=_HASH_CHUNK_SIZE):
crc = zlib.crc32(chunk, crc)
return crc

Expand All @@ -146,7 +148,7 @@ def _sha1(
if eigen is not None
else hashlib.sha1(usedforsecurity=False)
)
for chunk in read_chunks(stream, start, size):
for chunk in read_chunks(stream, start, size, chunk_size=_HASH_CHUNK_SIZE):
hasher.update(chunk)
return hasher.digest()

Expand Down
Empty file.
Loading
Loading