Connect S3

haochengxia · haochengxia · commit 7642dac89e51 · 2025-07-23T14:14:56.000-04:00
diff --git a/libCacheSim-python/libcachesim/__init__.py b/libCacheSim-python/libcachesim/__init__.py
@@ -8,6 +8,8 @@
     ReqOp,
     TraceType,
     SamplerType,
+    AnalysisParam,
+    AnalysisOption,
     __doc__,
     __version__,
 )
@@ -43,6 +45,7 @@
 from .trace_analyzer import TraceAnalyzer
 from .synthetic_reader import SyntheticReader, create_zipf_requests, create_uniform_requests
 from .util import Util
+from .data_loader import DataLoader
 
 __all__ = [
     # Core classes
@@ -51,6 +54,8 @@
     "ReqOp",
     "TraceType",
     "SamplerType",
+    "AnalysisParam",
+    "AnalysisOption",
     # Cache base class
     "CacheBase",
     # Core cache algorithms
@@ -85,6 +90,8 @@
     "create_uniform_requests",
     # Utilities
     "Util",
+    # Data loader
+    "DataLoader",
     # Metadata
     "__doc__",
     "__version__",
diff --git a/libCacheSim-python/libcachesim/data_loader.py b/libCacheSim-python/libcachesim/data_loader.py
@@ -0,0 +1,196 @@
+"""S3 Bucket data loader with local caching (HuggingFace-style)."""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import shutil
+from pathlib import Path
+from typing import Optional, Union
+from urllib.parse import quote
+
+logger = logging.getLogger(__name__)
+
+
+class DataLoader:
+    """Fetch objects from an S3 bucket and keep a copy on local disk.
+
+    Cached objects are stored at ``<cache_dir>/<bucket_name>/<name>``, where
+    ``<name>`` is derived from the object key by ``_cache_path``.
+
+    Args:
+        bucket_name: Bucket to read from.
+        cache_dir: Root directory of the local cache; ``DEFAULT_CACHE_DIR``
+            is used when this is omitted or empty.
+        use_auth: When False, the S3 client is configured for unsigned
+            requests; when True, boto3's default configuration is used.
+    """
+
+    DEFAULT_BUCKET = "cache-datasets"
+    DEFAULT_CACHE_DIR = Path.home() / ".cache/libcachesim_hub"
+
+    def __init__(
+        self,
+        bucket_name: str = DEFAULT_BUCKET,
+        cache_dir: Optional[Union[str, Path]] = None,
+        use_auth: bool = False,
+    ):
+        self.bucket_name = bucket_name
+        self.cache_dir = Path(cache_dir) if cache_dir else self.DEFAULT_CACHE_DIR
+        self.use_auth = use_auth
+        self._s3_client = None  # created on first access of ``s3_client``
+        self._ensure_cache_dir()
+
+    def _ensure_cache_dir(self) -> None:
+        """Create the per-bucket cache directory if it does not exist."""
+        (self.cache_dir / self.bucket_name).mkdir(parents=True, exist_ok=True)
+
+    @property
+    def s3_client(self):
+        """Return the boto3 S3 client, creating it on first access.
+
+        Raises:
+            ImportError: If boto3 or botocore cannot be imported.
+        """
+        if self._s3_client is None:
+            # Imported here rather than at module level, so importing this
+            # module does not require boto3.
+            try:
+                import boto3
+                from botocore import UNSIGNED
+                from botocore.config import Config
+            except ImportError as exc:
+                raise ImportError("Install boto3: pip install boto3") from exc
+
+            config = None if self.use_auth else Config(signature_version=UNSIGNED)
+            self._s3_client = boto3.client("s3", config=config)
+        return self._s3_client
+
+    def _cache_path(self, key: str) -> Path:
+        """Return the local cache location for ``key``.
+
+        The file name is the first 16 hex digits of the key's SHA-256 digest,
+        an underscore, and the key percent-encoded with no characters marked
+        safe (so ``/`` is encoded as well).
+        """
+        digest = hashlib.sha256(key.encode()).hexdigest()[:16]
+        safe_name = f"{digest}_{quote(key, safe='')}"
+        return self.cache_dir / self.bucket_name / safe_name
+
+    def _download(self, key: str, dest: Path) -> None:
+        """Download ``key`` from the bucket into ``dest``.
+
+        The object is streamed into ``<dest>.tmp`` and moved onto ``dest``
+        only after the transfer has finished, so a failed download leaves no
+        partial file at ``dest``.
+
+        Raises:
+            RuntimeError: If any step fails; the original exception is
+                chained as ``__cause__``.
+        """
+        temp = dest.with_name(dest.name + ".tmp")
+        temp.parent.mkdir(parents=True, exist_ok=True)
+
+        try:
+            logger.info("Downloading s3://%s/%s", self.bucket_name, key)
+            obj = self.s3_client.get_object(Bucket=self.bucket_name, Key=key)
+            with open(temp, "wb") as f:
+                # Copy in chunks instead of holding the whole object in memory.
+                shutil.copyfileobj(obj["Body"], f)
+            # Path.replace overwrites an existing file (forced re-download).
+            temp.replace(dest)
+            logger.info("Saved to: %s", dest)
+        except Exception as e:
+            if temp.exists():
+                temp.unlink()
+            raise RuntimeError(
+                f"Download failed for s3://{self.bucket_name}/{key}: {e}"
+            ) from e
+
+    def load(self, key: str, force: bool = False, mode: str = "rb") -> Union[bytes, str]:
+        """Return the contents of ``key``, downloading it first if needed.
+
+        Args:
+            key: Object key inside the bucket.
+            force: Download again even when a cached copy exists.
+            mode: Mode passed to ``open`` when reading the cached file.
+
+        Raises:
+            RuntimeError: If the download fails.
+        """
+        path = self._cache_path(key)
+        if force or not path.exists():
+            self._download(key, path)
+        with open(path, mode) as f:
+            return f.read()
+
+    def is_cached(self, key: str) -> bool:
+        """Return True if a cached copy of ``key`` exists on disk."""
+        return self._cache_path(key).exists()
+
+    def get_cache_path(self, key: str) -> str:
+        """Return the cache location of ``key`` as a POSIX-style string.
+
+        The path is returned whether or not the object has been downloaded.
+        """
+        return self._cache_path(key).as_posix()
+
+    def clear_cache(self, key: Optional[str] = None) -> None:
+        """Delete one cached object, or the whole cache directory.
+
+        Args:
+            key: Object key whose cached copy should be removed. When None,
+                ``cache_dir`` is removed recursively.
+        """
+        # Compare against None: an empty-string key must not wipe everything.
+        if key is not None:
+            path = self._cache_path(key)
+            if path.exists():
+                path.unlink()
+                logger.info("Cleared: %s", path)
+        else:
+            shutil.rmtree(self.cache_dir, ignore_errors=True)
+            logger.info("Cleared entire cache: %s", self.cache_dir)
+
+    def list_cached_files(self) -> list[str]:
+        """Return the paths of all files under ``cache_dir`` except ``*.tmp``."""
+        if not self.cache_dir.exists():
+            return []
+        return [
+            str(p)
+            for p in self.cache_dir.rglob("*")
+            if p.is_file() and not p.name.endswith(".tmp")
+        ]
+
+    def get_cache_size(self) -> int:
+        """Return the total size in bytes of all files under ``cache_dir``."""
+        return sum(
+            p.stat().st_size for p in self.cache_dir.rglob("*") if p.is_file()
+        )
+
+    def list_s3_objects(self, prefix: str = "", delimiter: str = "/") -> dict:
+        """
+        List S3 objects and pseudo-folders under a prefix.
+
+        Args:
+            prefix: The S3 prefix to list under (like folder path)
+            delimiter: Use "/" to simulate folder structure
+
+        Returns:
+            A dict with two keys:
+                - "folders": list of sub-prefixes (folders)
+                - "files": list of object keys (files)
+        """
+        paginator = self.s3_client.get_paginator("list_objects_v2")
+        result = {"folders": [], "files": []}
+
+        for page in paginator.paginate(
+            Bucket=self.bucket_name,
+            Prefix=prefix,
+            Delimiter=delimiter,
+        ):
+            # CommonPrefixes are like subdirectories
+            result["folders"].extend(cp["Prefix"] for cp in page.get("CommonPrefixes", []))
+            result["files"].extend(obj["Key"] for obj in page.get("Contents", []))
+
+        return result
diff --git a/libCacheSim-python/libcachesim/protocols.py b/libCacheSim-python/libcachesim/protocols.py
@@ -6,7 +6,7 @@
 """
 
 from __future__ import annotations
-from typing import Protocol, runtime_checkable, TYPE_CHECKING
+from typing import Iterator, Protocol, runtime_checkable, TYPE_CHECKING
 
 if TYPE_CHECKING:
     from .libcachesim_python import Request
@@ -18,17 +18,16 @@ class ReaderProtocol(Protocol):
 
     This protocol ensures that different reader implementations
     (SyntheticReader, TraceReader) can be used interchangeably.
+
+    Only core methods are defined here.
     """
 
     def get_num_of_req(self) -> int: ...
     def read_one_req(self, req: Request) -> Request: ...
+    def skip_n_req(self, n: int) -> int: ...
     def reset(self) -> None: ...
     def close(self) -> None: ...
     def clone(self) -> "ReaderProtocol": ...
-    def read_first_req(self, req: Request) -> Request: ...
-    def read_last_req(self, req: Request) -> Request: ...
-    def skip_n_req(self, n: int) -> int: ...
-    def read_one_req_above(self, req: Request) -> Request: ...
-    def go_back_one_req(self) -> None: ...
-    def set_read_pos(self, pos: float) -> None: ...
-    def get_read_pos(self) -> float: ...
+    def __iter__(self) -> Iterator[Request]: ...
+    def __next__(self) -> Request: ...
+    def __len__(self) -> int: ...
diff --git a/libCacheSim-python/libcachesim/trace_analyzer.py b/libCacheSim-python/libcachesim/trace_analyzer.py
@@ -12,18 +12,38 @@
     AnalysisParam,
 )
 
+# Defined locally (not imported); raised by TraceAnalyzer.__init__ below.
+class ReaderException(Exception):
+    """Raised when the given reader does not expose a usable ``c_reader``."""
+    pass
 
 class TraceAnalyzer:
     _analyzer: Analyzer
 
     def __init__(
         self,
-        analyzer: Analyzer,
         reader: ReaderProtocol,
         output_path: str,
-        analysis_param: AnalysisParam,
-        analysis_option: AnalysisOption,
+        analysis_param: AnalysisParam = None,
+        analysis_option: AnalysisOption = None,
     ):
+        """
+        Initialize trace analyzer.
+
+        Args:
+            reader: Reader protocol
+            output_path: Path to output file
+            analysis_param: Analysis parameters
+            analysis_option: Analysis options
+        """
+        if not hasattr(reader, 'c_reader') or not reader.c_reader:
+            raise ReaderException("Only C/C++ reader is supported")
+
+        if analysis_param is None:
+            analysis_param = AnalysisParam()
+        if analysis_option is None:
+            analysis_option = AnalysisOption()
+
         self._analyzer = Analyzer(reader._reader, output_path, analysis_option, analysis_param)
 
     def run(self) -> None:
diff --git a/libCacheSim-python/src/export_analyzer.cpp b/libCacheSim-python/src/export_analyzer.cpp
@@ -92,8 +92,8 @@ void export_analyzer(py::module& m) {
                                    AnalysisOptionDeleter>(
                 new traceAnalyzer::analysis_option_t(option));
           }),
-          "req_rate"_a = false, "access_pattern"_a = false, "size"_a = false,
-          "reuse"_a = false, "popularity"_a = false, "ttl"_a = false,
+          "req_rate"_a = true, "access_pattern"_a = true, "size"_a = true,
+          "reuse"_a = true, "popularity"_a = true, "ttl"_a = false,
           "popularity_decay"_a = false, "lifetime"_a = false,
           "create_future_reuse_ccdf"_a = false, "prob_at_age"_a = false,
           "size_change"_a = false)
@@ -119,18 +119,17 @@ void export_analyzer(py::module& m) {
   py::class_<traceAnalyzer::TraceAnalyzer,
              std::unique_ptr<traceAnalyzer::TraceAnalyzer>>(m, "Analyzer")
       .def(py::init([](reader_t* reader, std::string output_path,
-                       const traceAnalyzer::analysis_param_t& param,
-                       const traceAnalyzer::analysis_option_t& option) {
+                       const traceAnalyzer::analysis_option_t& option,
+                       const traceAnalyzer::analysis_param_t& param) {
              traceAnalyzer::TraceAnalyzer* analyzer =
                  new traceAnalyzer::TraceAnalyzer(reader, output_path, option,
                                                   param);
              return std::unique_ptr<traceAnalyzer::TraceAnalyzer>(analyzer);
            }),
            "reader"_a, "output_path"_a,
-           "param"_a = traceAnalyzer::default_param(),
-           "option"_a = traceAnalyzer::default_option())
-      .def("run", &traceAnalyzer::TraceAnalyzer::run)
-      .def("cleanup", &traceAnalyzer::TraceAnalyzer::cleanup);
+           "option"_a = traceAnalyzer::default_option(),
+           "param"_a = traceAnalyzer::default_param())
+      .def("run", &traceAnalyzer::TraceAnalyzer::run);
 }
 
 }  // namespace libcachesim
diff --git a/libCacheSim-python/tests/test_analyzer.py b/libCacheSim-python/tests/test_analyzer.py
@@ -0,0 +1,21 @@
+import os
+
+from libcachesim import DataLoader, TraceAnalyzer, TraceReader
+
+TRACE_KEY = "cache_dataset_oracleGeneral/2020_tencentBlock/1K/tencentBlock_1621.oracleGeneral.zst"
+
+
+def test_analyzer_common(tmp_path):
+    """Run the analyzer with default options on a trace fetched via DataLoader."""
+    loader = DataLoader()
+    # load() fetches the trace, so the path returned below exists on disk.
+    loader.load(TRACE_KEY)
+    file_path = loader.get_cache_path(TRACE_KEY)
+
+    reader = TraceReader(file_path)
+
+    # Same trailing-separator shape as "./", but inside pytest's temporary
+    # directory so analysis output does not land in the working directory.
+    analyzer = TraceAnalyzer(reader, output_path=f"{tmp_path}{os.sep}")
+
+    analyzer.run()
diff --git a/libCacheSim-python/tests/test_data_loader.py b/libCacheSim-python/tests/test_data_loader.py
@@ -0,0 +1,21 @@
+from pathlib import Path
+
+from libcachesim import DataLoader
+
+MSR_PREFIX = "cache_dataset_oracleGeneral/2007_msr/"
+MSR_KEY = MSR_PREFIX + "msr_hm_0.oracleGeneral.zst"
+
+
+def test_data_loader_common():
+    """Download one object, then check the local cache and the S3 listing."""
+    loader = DataLoader()
+    data = loader.load(MSR_KEY)
+    assert data, "downloaded object should not be empty"
+
+    # After load() the object must be present in the local cache.
+    assert loader.is_cached(MSR_KEY)
+    assert Path(loader.get_cache_path(MSR_KEY)).is_file()
+
+    # The key sits directly under the prefix, so it is listed as a file.
+    files = loader.list_s3_objects(MSR_PREFIX)["files"]
+    assert MSR_KEY in files