Skip to content

Commit 7642dac

Browse files
committed
Connect S3
1 parent 7f11a0a commit 7642dac

7 files changed

Lines changed: 198 additions & 19 deletions

File tree

libCacheSim-python/libcachesim/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
ReqOp,
99
TraceType,
1010
SamplerType,
11+
AnalysisParam,
12+
AnalysisOption,
1113
__doc__,
1214
__version__,
1315
)
@@ -43,6 +45,7 @@
4345
from .trace_analyzer import TraceAnalyzer
4446
from .synthetic_reader import SyntheticReader, create_zipf_requests, create_uniform_requests
4547
from .util import Util
48+
from .data_loader import DataLoader
4649

4750
__all__ = [
4851
# Core classes
@@ -51,6 +54,8 @@
5154
"ReqOp",
5255
"TraceType",
5356
"SamplerType",
57+
"AnalysisParam",
58+
"AnalysisOption",
5459
# Cache base class
5560
"CacheBase",
5661
# Core cache algorithms
@@ -85,6 +90,8 @@
8590
"create_uniform_requests",
8691
# Utilities
8792
"Util",
93+
# Data loader
94+
"DataLoader",
8895
# Metadata
8996
"__doc__",
9097
"__version__",
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
"""S3 Bucket data loader with local caching (HuggingFace-style)."""
2+
3+
from __future__ import annotations
4+
5+
import hashlib
6+
import logging
7+
import shutil
8+
from pathlib import Path
9+
from typing import Optional, Union
10+
from urllib.parse import quote
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
class DataLoader:
    """Download files from an S3 bucket with a local on-disk cache.

    Objects are fetched once and stored under ``cache_dir/bucket_name`` with a
    hash-prefixed, URL-quoted file name; subsequent loads are served from disk
    (HuggingFace-hub style).
    """

    DEFAULT_BUCKET = "cache-datasets"
    DEFAULT_CACHE_DIR = Path.home() / ".cache/libcachesim_hub"

    def __init__(
        self,
        bucket_name: str = DEFAULT_BUCKET,
        cache_dir: Optional[Union[str, Path]] = None,
        use_auth: bool = False,
    ):
        """
        Args:
            bucket_name: S3 bucket to read from.
            cache_dir: Local cache root; defaults to ``~/.cache/libcachesim_hub``.
            use_auth: When True, use the default boto3 credential chain;
                when False, create an unsigned client (public buckets only).
        """
        self.bucket_name = bucket_name
        self.cache_dir = Path(cache_dir) if cache_dir else self.DEFAULT_CACHE_DIR
        self.use_auth = use_auth
        # Created lazily so boto3 is only required when S3 is actually used.
        self._s3_client = None
        self._ensure_cache_dir()

    def _ensure_cache_dir(self) -> None:
        """Create the per-bucket cache directory if it does not exist."""
        (self.cache_dir / self.bucket_name).mkdir(parents=True, exist_ok=True)

    @property
    def s3_client(self):
        """Lazily constructed boto3 S3 client (unsigned unless ``use_auth``)."""
        if self._s3_client is None:
            try:
                import boto3
                from botocore.config import Config
                from botocore import UNSIGNED
            except ImportError as e:
                # Chain the cause so the original import failure is visible.
                raise ImportError("Install boto3: pip install boto3") from e

            self._s3_client = boto3.client(
                's3',
                config=None if self.use_auth else Config(signature_version=UNSIGNED)
            )
        return self._s3_client

    def _cache_path(self, key: str) -> Path:
        # The hash prefix keeps names unique and short even when the quoted
        # key collides or is truncated by the filesystem.
        safe_name = hashlib.sha256(key.encode()).hexdigest()[:16] + "_" + quote(key, safe='')
        return self.cache_dir / self.bucket_name / safe_name

    def _download(self, key: str, dest: Path) -> None:
        """Download s3://bucket/key to ``dest`` atomically (tmp file + rename).

        Raises:
            RuntimeError: If the download fails for any reason.
        """
        temp = dest.with_suffix(dest.suffix + '.tmp')
        temp.parent.mkdir(parents=True, exist_ok=True)

        try:
            logger.info(f"Downloading s3://{self.bucket_name}/{key}")
            obj = self.s3_client.get_object(Bucket=self.bucket_name, Key=key)
            # Stream in chunks rather than buffering the whole object in
            # memory (obj['Body'].read() would hold the full file at once).
            with open(temp, 'wb') as f:
                shutil.copyfileobj(obj['Body'], f)
            shutil.move(str(temp), str(dest))
            logger.info(f"Saved to: {dest}")
        except Exception as e:
            if temp.exists():
                temp.unlink()
            # Chain the cause for easier debugging of S3/permission errors.
            raise RuntimeError(
                f"Download failed for s3://{self.bucket_name}/{key}: {e}"
            ) from e

    def load(self, key: str, force: bool = False, mode: str = 'rb') -> Union[bytes, str]:
        """Return the object's contents, downloading on a cache miss.

        Args:
            key: S3 object key.
            force: Re-download even if a cached copy exists.
            mode: File mode used to read the cached copy ('rb' -> bytes).
        """
        path = self._cache_path(key)
        if force or not path.exists():
            self._download(key, path)
        with open(path, mode) as f:
            return f.read()

    def is_cached(self, key: str) -> bool:
        """Return True if ``key`` already has a local cached copy."""
        return self._cache_path(key).exists()

    def get_cache_path(self, key: str) -> str:
        """Return the local cache location for ``key`` as a POSIX-style string.

        The file may not exist yet; call :meth:`load` first to fetch it.
        """
        # Annotation fixed: as_posix() returns a str, not a Path.
        return self._cache_path(key).as_posix()

    def clear_cache(self, key: Optional[str] = None) -> None:
        """Remove one cached file, or the entire cache when ``key`` is None."""
        if key:
            path = self._cache_path(key)
            if path.exists():
                path.unlink()
                logger.info(f"Cleared: {path}")
        else:
            shutil.rmtree(self.cache_dir, ignore_errors=True)
            logger.info(f"Cleared entire cache: {self.cache_dir}")
            # Recreate the directory so the loader stays usable afterwards.
            self._ensure_cache_dir()

    def list_cached_files(self) -> list[str]:
        """List paths of all cached files, skipping in-flight ``.tmp`` files."""
        if not self.cache_dir.exists():
            return []
        return [
            str(p) for p in self.cache_dir.rglob('*')
            if p.is_file() and not p.name.endswith('.tmp')
        ]

    def get_cache_size(self) -> int:
        """Return the total size in bytes of everything under the cache root."""
        return sum(
            p.stat().st_size for p in self.cache_dir.rglob('*') if p.is_file()
        )

    def list_s3_objects(self, prefix: str = "", delimiter: str = "/") -> dict:
        """
        List S3 objects and pseudo-folders under a prefix.

        Args:
            prefix: The S3 prefix to list under (like folder path)
            delimiter: Use "/" to simulate folder structure

        Returns:
            A dict with two keys:
                - "folders": list of sub-prefixes (folders)
                - "files": list of object keys (files)
        """
        paginator = self.s3_client.get_paginator('list_objects_v2')
        result = {"folders": [], "files": []}

        for page in paginator.paginate(
            Bucket=self.bucket_name,
            Prefix=prefix,
            Delimiter=delimiter
        ):
            # CommonPrefixes are like subdirectories
            result["folders"].extend(cp["Prefix"] for cp in page.get("CommonPrefixes", []))
            result["files"].extend(obj["Key"] for obj in page.get("Contents", []))

        return result

libCacheSim-python/libcachesim/protocols.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"""
77

88
from __future__ import annotations
9-
from typing import Protocol, runtime_checkable, TYPE_CHECKING
9+
from typing import Iterator, Protocol, runtime_checkable, TYPE_CHECKING
1010

1111
if TYPE_CHECKING:
1212
from .libcachesim_python import Request
@@ -18,17 +18,16 @@ class ReaderProtocol(Protocol):
1818
1919
This protocol ensures that different reader implementations
2020
(SyntheticReader, TraceReader) can be used interchangeably.
21+
22+
Only core methods are defined here.
2123
"""
2224

2325
def get_num_of_req(self) -> int: ...
2426
def read_one_req(self, req: Request) -> Request: ...
27+
def skip_n_req(self, n: int) -> int: ...
2528
def reset(self) -> None: ...
2629
def close(self) -> None: ...
2730
def clone(self) -> "ReaderProtocol": ...
28-
def read_first_req(self, req: Request) -> Request: ...
29-
def read_last_req(self, req: Request) -> Request: ...
30-
def skip_n_req(self, n: int) -> int: ...
31-
def read_one_req_above(self, req: Request) -> Request: ...
32-
def go_back_one_req(self) -> None: ...
33-
def set_read_pos(self, pos: float) -> None: ...
34-
def get_read_pos(self) -> float: ...
31+
def __iter__(self) -> Iterator[Request]: ...
32+
def __next__(self) -> Request: ...
33+
def __len__(self) -> int: ...

libCacheSim-python/libcachesim/trace_analyzer.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,38 @@
1212
AnalysisParam,
1313
)
1414

15+
class ReaderException(Exception):
    """Raised when a reader is not compatible with the trace analyzer.

    The analyzer requires a reader backed by the C/C++ implementation
    (i.e. one whose ``c_reader`` attribute is truthy).
    """
1519

1620
class TraceAnalyzer:
1721
_analyzer: Analyzer
1822

1923
def __init__(
2024
self,
21-
analyzer: Analyzer,
2225
reader: ReaderProtocol,
2326
output_path: str,
24-
analysis_param: AnalysisParam,
25-
analysis_option: AnalysisOption,
27+
analysis_param: AnalysisParam = None,
28+
analysis_option: AnalysisOption = None,
2629
):
30+
"""
31+
Initialize trace analyzer.
32+
33+
Args:
34+
reader: Reader protocol
35+
output_path: Path to output file
36+
analysis_param: Analysis parameters
37+
analysis_option: Analysis options
38+
"""
39+
if not hasattr(reader, 'c_reader') or not reader.c_reader:
40+
raise ReaderException("Only C/C++ reader is supported")
41+
42+
if analysis_param is None:
43+
analysis_param = AnalysisParam()
44+
if analysis_option is None:
45+
analysis_option = AnalysisOption()
46+
2747
self._analyzer = Analyzer(reader._reader, output_path, analysis_option, analysis_param)
2848

2949
def run(self) -> None:

libCacheSim-python/src/export_analyzer.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@ void export_analyzer(py::module& m) {
9292
AnalysisOptionDeleter>(
9393
new traceAnalyzer::analysis_option_t(option));
9494
}),
95-
"req_rate"_a = false, "access_pattern"_a = false, "size"_a = false,
96-
"reuse"_a = false, "popularity"_a = false, "ttl"_a = false,
95+
"req_rate"_a = true, "access_pattern"_a = true, "size"_a = true,
96+
"reuse"_a = true, "popularity"_a = true, "ttl"_a = false,
9797
"popularity_decay"_a = false, "lifetime"_a = false,
9898
"create_future_reuse_ccdf"_a = false, "prob_at_age"_a = false,
9999
"size_change"_a = false)
@@ -119,18 +119,17 @@ void export_analyzer(py::module& m) {
119119
py::class_<traceAnalyzer::TraceAnalyzer,
120120
std::unique_ptr<traceAnalyzer::TraceAnalyzer>>(m, "Analyzer")
121121
.def(py::init([](reader_t* reader, std::string output_path,
122-
const traceAnalyzer::analysis_param_t& param,
123-
const traceAnalyzer::analysis_option_t& option) {
122+
const traceAnalyzer::analysis_option_t& option,
123+
const traceAnalyzer::analysis_param_t& param) {
124124
traceAnalyzer::TraceAnalyzer* analyzer =
125125
new traceAnalyzer::TraceAnalyzer(reader, output_path, option,
126126
param);
127127
return std::unique_ptr<traceAnalyzer::TraceAnalyzer>(analyzer);
128128
}),
129129
"reader"_a, "output_path"_a,
130-
"param"_a = traceAnalyzer::default_param(),
131-
"option"_a = traceAnalyzer::default_option())
132-
.def("run", &traceAnalyzer::TraceAnalyzer::run)
133-
.def("cleanup", &traceAnalyzer::TraceAnalyzer::cleanup);
130+
"option"_a = traceAnalyzer::default_option(),
131+
"param"_a = traceAnalyzer::default_param())
132+
.def("run", &traceAnalyzer::TraceAnalyzer::run);
134133
}
135134

136135
} // namespace libcachesim
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from libcachesim import TraceAnalyzer, TraceReader, DataLoader
2+
import os
3+
4+
5+
def test_analyzer_common():
    """Download a small trace via DataLoader and run TraceAnalyzer on it."""
    # Single source of truth for the S3 key (was duplicated verbatim).
    key = "cache_dataset_oracleGeneral/2020_tencentBlock/1K/tencentBlock_1621.oracleGeneral.zst"
    loader = DataLoader()
    loader.load(key)

    file_path = loader.get_cache_path(key)
    # The download above must have produced a local cached copy.
    assert os.path.exists(file_path), f"trace not cached at {file_path}"

    reader = TraceReader(file_path)
    analyzer = TraceAnalyzer(reader, output_path="./")
    analyzer.run()
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from libcachesim import DataLoader
2+
3+
4+
def test_data_loader_common():
    """DataLoader downloads an object, caches it locally, and lists S3 keys."""
    key = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst"
    loader = DataLoader()

    data = loader.load(key)
    assert data, "downloaded object should be non-empty"

    # The load above must have populated the local cache.
    assert loader.is_cached(key)
    assert loader.get_cache_path(key)

    # Typo fixed ('filles' -> 'listing') and the result is now asserted on,
    # so the test actually fails if listing breaks.
    listing = loader.list_s3_objects("cache_dataset_oracleGeneral/2007_msr/")
    assert key in listing["files"]

0 commit comments

Comments
 (0)