From 9d0b8a7206fb0d933495fbc615fe1b5a9ba7967d Mon Sep 17 00:00:00 2001 From: haochengxia Date: Sun, 20 Jul 2025 23:33:41 -0400 Subject: [PATCH 1/4] Pybind refactor --- .vscode/settings.json | 8 +- libCacheSim-python/CMakeLists.txt | 32 +- libCacheSim-python/libcachesim/__init__.py | 96 +- libCacheSim-python/libcachesim/__init__.pyi | 472 +++---- libCacheSim-python/libcachesim/cache.py | 396 ++++++ libCacheSim-python/libcachesim/const.py | 1 - libCacheSim-python/libcachesim/eviction.py | 713 ---------- libCacheSim-python/libcachesim/protocols.py | 71 + .../libcachesim/synthetic_reader.py | 408 ++++++ .../libcachesim/trace_analyzer.py | 29 + .../libcachesim/trace_generator.py | 215 --- .../libcachesim/trace_reader.py | 251 ++++ libCacheSim-python/libcachesim/util.py | 50 + libCacheSim-python/src/exception.cpp | 56 + libCacheSim-python/src/exception.h | 33 + libCacheSim-python/src/export.cpp | 38 + libCacheSim-python/src/export.h | 27 + libCacheSim-python/src/export_analyzer.cpp | 136 ++ libCacheSim-python/src/export_cache.cpp | 493 +++++++ libCacheSim-python/src/export_misc.cpp | 30 + libCacheSim-python/src/export_reader.cpp | 312 +++++ libCacheSim-python/src/pylibcachesim.cpp | 1223 ----------------- libCacheSim-python/tests/conftest.py | 26 - libCacheSim-python/tests/test_eviction.py | 62 - libCacheSim-python/tests/test_example.py | 16 + .../tests/test_process_trace.py | 220 --- .../tests/test_python_hook_cache.py | 205 --- .../tests/test_trace_generator.py | 135 -- .../tests/test_unified_interface.py | 181 --- libCacheSim-python/tests/utils.py | 16 - libCacheSim/traceReader/CMakeLists.txt | 6 +- scripts/install_python_dev.sh | 2 +- 32 files changed, 2641 insertions(+), 3318 deletions(-) create mode 100644 libCacheSim-python/libcachesim/cache.py delete mode 100644 libCacheSim-python/libcachesim/const.py delete mode 100644 libCacheSim-python/libcachesim/eviction.py create mode 100644 libCacheSim-python/libcachesim/protocols.py create mode 100644 
libCacheSim-python/libcachesim/synthetic_reader.py create mode 100644 libCacheSim-python/libcachesim/trace_analyzer.py delete mode 100644 libCacheSim-python/libcachesim/trace_generator.py create mode 100644 libCacheSim-python/libcachesim/trace_reader.py create mode 100644 libCacheSim-python/libcachesim/util.py create mode 100644 libCacheSim-python/src/exception.cpp create mode 100644 libCacheSim-python/src/exception.h create mode 100644 libCacheSim-python/src/export.cpp create mode 100644 libCacheSim-python/src/export.h create mode 100644 libCacheSim-python/src/export_analyzer.cpp create mode 100644 libCacheSim-python/src/export_cache.cpp create mode 100644 libCacheSim-python/src/export_misc.cpp create mode 100644 libCacheSim-python/src/export_reader.cpp delete mode 100644 libCacheSim-python/src/pylibcachesim.cpp delete mode 100644 libCacheSim-python/tests/test_eviction.py create mode 100644 libCacheSim-python/tests/test_example.py delete mode 100644 libCacheSim-python/tests/test_process_trace.py delete mode 100644 libCacheSim-python/tests/test_python_hook_cache.py delete mode 100644 libCacheSim-python/tests/test_trace_generator.py delete mode 100644 libCacheSim-python/tests/test_unified_interface.py delete mode 100644 libCacheSim-python/tests/utils.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 407bc808a..986387493 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -108,7 +108,10 @@ "editor.formatOnSave": true, "editor.insertSpaces": true, "editor.detectIndentation": true, - "editor.rulers": [80, 100], + "editor.rulers": [ + 80, + 100 + ], "editor.wordWrap": "wordWrapColumn", "editor.wordWrapColumn": 100, "files.trimTrailingWhitespace": true, @@ -133,5 +136,6 @@ "**/*.code-search": true }, "git.ignoreLimitWarning": true, - "terminal.integrated.cwd": "${workspaceFolder}" + "terminal.integrated.cwd": "${workspaceFolder}", + "python.formatting.provider": "yapf" } diff --git a/libCacheSim-python/CMakeLists.txt 
b/libCacheSim-python/CMakeLists.txt index aebee06c3..a8b76ec79 100644 --- a/libCacheSim-python/CMakeLists.txt +++ b/libCacheSim-python/CMakeLists.txt @@ -76,22 +76,32 @@ else() message(FATAL_ERROR "Pre-built libCacheSim library not found. Please build the main project first: cd .. && cmake -G Ninja -B build && ninja -C build") endif() -python_add_library(_libcachesim MODULE - src/pylibcachesim.cpp +include_directories(src) + +python_add_library(libcachesim_python MODULE + src/export.cpp + src/export_cache.cpp + src/export_reader.cpp + src/export_analyzer.cpp + src/export_misc.cpp + src/exception.cpp ${MAIN_PROJECT_SOURCE_DIR}/libCacheSim/bin/cli_reader_utils.c + ${MAIN_PROJECT_SOURCE_DIR}/libCacheSim/bin/traceUtils/traceConvLCS.cpp + ${MAIN_PROJECT_SOURCE_DIR}/libCacheSim/bin/traceUtils/traceConvOracleGeneral.cpp + ${MAIN_PROJECT_SOURCE_DIR}/libCacheSim/bin/traceUtils/utils.cpp WITH_SOABI ) -set_target_properties(_libcachesim PROPERTIES +set_target_properties(libcachesim_python PROPERTIES POSITION_INDEPENDENT_CODE ON INSTALL_RPATH_USE_LINK_PATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH "$ORIGIN" ) -target_compile_definitions(_libcachesim PRIVATE VERSION_INFO=${PROJECT_VERSION}) +target_compile_definitions(libcachesim_python PRIVATE VERSION_INFO=${PROJECT_VERSION}) -target_link_libraries(_libcachesim PRIVATE +target_link_libraries(libcachesim_python PRIVATE ${LIBCACHESIM_TARGET} pybind11::headers pybind11::module @@ -102,8 +112,8 @@ target_link_libraries(_libcachesim PRIVATE # Add platform-specific link options and libraries if(CMAKE_SYSTEM_NAME STREQUAL "Linux") # GNU ld option, only available on Linux - target_link_options(_libcachesim PRIVATE -Wl,--no-as-needed) - target_link_libraries(_libcachesim PRIVATE dl) + target_link_options(libcachesim_python PRIVATE -Wl,--no-as-needed) + target_link_libraries(libcachesim_python PRIVATE dl) elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") # macOS doesn't need --no-as-needed # dl functions are part of the system library 
on macOS @@ -112,21 +122,21 @@ elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") # Find argp library on macOS find_library(ARGP_LIBRARY argp PATHS /opt/homebrew/lib /usr/local/lib) if(ARGP_LIBRARY) - target_link_libraries(_libcachesim PRIVATE ${ARGP_LIBRARY}) + target_link_libraries(libcachesim_python PRIVATE ${ARGP_LIBRARY}) endif() # Find and link other dependencies that might be needed find_library(INTL_LIBRARY intl PATHS /opt/homebrew/lib /usr/local/lib) if(INTL_LIBRARY) - target_link_libraries(_libcachesim PRIVATE ${INTL_LIBRARY}) + target_link_libraries(libcachesim_python PRIVATE ${INTL_LIBRARY}) endif() else() # Other platforms - try to link dl if available find_library(DL_LIBRARY dl) if(DL_LIBRARY) - target_link_libraries(_libcachesim PRIVATE ${DL_LIBRARY}) + target_link_libraries(libcachesim_python PRIVATE ${DL_LIBRARY}) endif() endif() # install to wheel directory -install(TARGETS _libcachesim LIBRARY DESTINATION libcachesim) +install(TARGETS libcachesim_python LIBRARY DESTINATION libcachesim) diff --git a/libCacheSim-python/libcachesim/__init__.py b/libCacheSim-python/libcachesim/__init__.py index 47e693cde..b9424a37b 100644 --- a/libCacheSim-python/libcachesim/__init__.py +++ b/libCacheSim-python/libcachesim/__init__.py @@ -2,83 +2,89 @@ from __future__ import annotations -from ._libcachesim import ( +from .libcachesim_python import ( Cache, - Reader, - ReaderInitParam, Request, ReqOp, TraceType, + SamplerType, __doc__, __version__, - open_trace, - process_trace, - process_trace_python_hook, ) -from .eviction import ( - ARC, - Belady, - BeladySize, - Cacheus, - Clock, + +from .cache import ( + CacheBase, + # Core algorithms + LRU, FIFO, - LeCaR, LFU, - LFUDA, - LRB, - LRU, - PythonHookCachePolicy, - QDLP, + ARC, + Clock, + Random, + # Advanced algorithms S3FIFO, Sieve, - SLRU, - ThreeLCache, - TinyLFU, + LIRS, TwoQ, + SLRU, WTinyLFU, -) -from .trace_generator import ( - create_zipf_requests, - create_uniform_requests, + LeCaR, + LFUDA, + ClockPro, + Cacheus, 
+ # Optimal algorithms + Belady, + BeladySize, + # Plugin cache + PythonHookCachePolicy, ) +from .trace_reader import TraceReader +from .trace_analyzer import TraceAnalyzer +from .synthetic_reader import SyntheticReader, create_zipf_requests, create_uniform_requests +from .util import Util + __all__ = [ # Core classes "Cache", - "Reader", "Request", - "ReaderInitParam", - # Trace types and operations - "TraceType", "ReqOp", - # Cache policies + "TraceType", + "SamplerType", + # Cache base class + "CacheBase", + # Core cache algorithms "LRU", "FIFO", + "LFU", "ARC", "Clock", - "LFU", - "LFUDA", - "SLRU", + "Random", + # Advanced cache algorithms "S3FIFO", "Sieve", - "TinyLFU", - "WTinyLFU", + "LIRS", "TwoQ", - "ThreeLCache", - "Belady", - "BeladySize", - "LRB", - "QDLP", + "SLRU", + "WTinyLFU", "LeCaR", + "LFUDA", + "ClockPro", "Cacheus", - # Custom cache policy + # Optimal algorithms + "Belady", + "BeladySize", + # Plugin cache "PythonHookCachePolicy", - # Functions - "open_trace", - "process_trace", - "process_trace_python_hook", + # Readers and analyzers + "TraceReader", + "TraceAnalyzer", + "SyntheticReader", + # Trace generators "create_zipf_requests", "create_uniform_requests", + # Utilities + "Util", # Metadata "__doc__", "__version__", diff --git a/libCacheSim-python/libcachesim/__init__.pyi b/libCacheSim-python/libcachesim/__init__.pyi index 6992a74ae..213eb1eb8 100644 --- a/libCacheSim-python/libcachesim/__init__.pyi +++ b/libCacheSim-python/libcachesim/__init__.pyi @@ -1,293 +1,247 @@ -""" -libCacheSim Python bindings --------------------------- - -.. currentmodule:: libcachesim - -.. 
autosummary:: - :toctree: _generate - - open_trace - ARC - Clock - FIFO - LRB - LRU - S3FIFO - Sieve - ThreeLCache - TinyLFU - TwoQ - Cache - Request - Reader - reader_init_param_t - TraceType - PythonHookCachePolicy - process_trace - process_trace_python_hook - create_zipf_requests - create_uniform_requests -""" - -from typing import Any, Callable, Optional, Union, overload +from __future__ import annotations + from collections.abc import Iterator -from _libcachesim import TraceType, ReqOp - -def open_trace( - trace_path: str, - type: Optional[TraceType] = None, - reader_init_param: Optional[Union[dict, reader_init_param_t]] = None, -) -> Reader: ... -def process_trace( - cache: Cache, - reader: Reader, - start_req: int = 0, - max_req: int = -1, -) -> tuple[float, float]: - """ - Process a trace with a cache and return miss ratio. - """ - -def process_trace_python_hook( - cache: PythonHookCache, - reader: Reader, - start_req: int = 0, - max_req: int = -1, -) -> tuple[float, float]: - """ - Process a trace with a Python hook cache and return miss ratio. - """ - -# Trace generation functions -def create_zipf_requests( - num_objects: int, - num_requests: int, - alpha: float = 1.0, - obj_size: int = 4000, - time_span: int = 86400 * 7, - start_obj_id: int = 0, - seed: Optional[int] = None, -) -> Iterator[Request]: - """Create a Zipf-distributed request generator.
- - Args: - num_objects (int): Number of unique objects - num_requests (int): Number of requests to generate - alpha (float): Zipf skewness parameter (alpha >= 0) - obj_size (int): Object size in bytes - time_span (int): Time span in seconds - start_obj_id (int): Starting object ID - seed (int, optional): Random seed for reproducibility - - Returns: - Iterator[Request]: A generator that yields Request objects - """ - -def create_uniform_requests( - num_objects: int, - num_requests: int, - obj_size: int = 4000, - time_span: int = 86400 * 7, - start_obj_id: int = 0, - seed: Optional[int] = None, -) -> Iterator[Request]: - """Create a uniform-distributed request generator. - - Args: - num_objects (int): Number of unique objects - num_requests (int): Number of requests to generate - obj_size (int): Object size in bytes - time_span (int): Time span in seconds - start_obj_id (int): Starting object ID - seed (int, optional): Random seed for reproducibility - - Returns: - Iterator[Request]: A generator that yields Request objects - """ - -class reader_init_param_t: - time_field: int - obj_id_field: int - obj_size_field: int - delimiter: str - has_header: bool - binary_fmt_str: str - -class Cache: - n_req: int - cache_size: int - @property - def n_obj(self) -> int: ... - @property - def occupied_byte(self) -> int: ... - def get(self, req: Request) -> bool: ... +from .libcachesim_python import ReqOp, TraceType, SamplerType +from .protocols import ReaderProtocol, CacheProtocol class Request: clock_time: int hv: int obj_id: int obj_size: int + ttl: int op: ReqOp + valid: bool + next_access_vtime: int - @overload - def __init__(self) -> None: ... - @overload - def __init__( - self, obj_id: int, obj_size: int = 1, clock_time: int = 0, hv: int = 0, op: ReqOp = ReqOp.GET - ) -> None: ... def __init__( - self, obj_id: Optional[int] = None, obj_size: int = 1, clock_time: int = 0, hv: int = 0, op: ReqOp = ReqOp.GET - ) -> None: - """Create a request instance. 
- - Args: - obj_id (int, optional): The object ID. - obj_size (int): The object size. (default: 1) - clock_time (int): The clock time. (default: 0) - hv (int): The hash value. (default: 0) - op (ReqOp): The operation. (default: ReqOp.GET) - - Returns: - Request: A new request instance. - """ - -class Reader: - n_read_req: int - n_total_req: int - trace_path: str - file_size: int - def get_wss(self, ignore_obj_size: bool = False) -> int: ... - def seek(self, offset: int, from_beginning: bool = False) -> None: ... - def __iter__(self) -> Reader: ... - def __next__(self) -> Request: ... - -class PythonHookCache: - n_req: int - n_obj: int - occupied_byte: int + self, + obj_size: int = 1, + op: ReqOp = ReqOp.READ, + valid: bool = True, + obj_id: int = 0, + clock_time: int = 0, + hv: int = 0, + next_access_vtime: int = -2, + ttl: int = 0, + ): ... + + +class CacheObject: + obj_id: int + obj_size: int + +class CommonCacheParams: cache_size: int + default_ttl: int + hashpower: int + consider_obj_metadata: bool - def __init__(self, cache_size: int, cache_name: str = "PythonHookCache") -> None: ... - def set_hooks( - self, - init_hook: Callable[[int], Any], - hit_hook: Callable[[Any, int, int], None], - miss_hook: Callable[[Any, int, int], None], - eviction_hook: Callable[[Any, int, int], int], - remove_hook: Callable[[Any, int], None], - free_hook: Optional[Callable[[Any], None]] = None, - ) -> None: ... - def get(self, req: Request) -> bool: ... +class Cache: + cache_size: int + default_ttl: int + obj_md_size: int + n_req: int + cache_name: str + init_params: CommonCacheParams -# Base class for all eviction policies -class EvictionPolicyBase: - """Abstract base class for all eviction policies.""" + def __init__(self, init_params: CommonCacheParams, cache_specific_params: str = ""): ... def get(self, req: Request) -> bool: ... - def process_trace(self, reader: Reader, start_req: int = 0, max_req: int = -1) -> tuple[float, float]: ...
- @property - def n_req(self) -> int: ... - @property - def n_obj(self) -> int: ... - @property - def occupied_byte(self) -> int: ... + def find(self, req: Request, update_cache: bool = True) -> CacheObject: ... + def can_insert(self, req: Request) -> bool: ... + def insert(self, req: Request) -> CacheObject: ... + def need_eviction(self, req: Request) -> bool: ... + def evict(self, req: Request) -> CacheObject: ... + def remove(self, obj_id: int) -> bool: ... + def to_evict(self, req: Request) -> CacheObject: ... + def get_occupied_byte(self) -> int: ... + def get_n_obj(self) -> int: ... + def print_cache(self) -> str: ... + +class CacheBase(CacheProtocol): + """Base class implementing CacheProtocol""" + def __init__(self, _cache: Cache): ... + def get(self, req: Request) -> bool: ... + def find(self, req: Request, update_cache: bool = True) -> CacheObject: ... + def can_insert(self, req: Request) -> bool: ... + def insert(self, req: Request) -> CacheObject: ... + def need_eviction(self, req: Request) -> bool: ... + def evict(self, req: Request) -> CacheObject: ... + def remove(self, obj_id: int) -> bool: ... + def to_evict(self, req: Request) -> CacheObject: ... + def get_occupied_byte(self) -> int: ... + def get_n_obj(self) -> int: ... + def print_cache(self) -> str: ... + def process_trace(self, reader: ReaderProtocol, start_req: int = 0, max_req: int = -1) -> tuple[float, float]: ... @property def cache_size(self) -> int: ... - def __repr__(self) -> str: ... + @property + def cache_name(self) -> str: ... -# Eviction policy classes -class ARC(EvictionPolicyBase): - """Adaptive Replacement Cache policy.""" - def __init__(self, cache_size: int) -> None: ... +# Core cache algorithms +class LRU(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... 
-class Belady(EvictionPolicyBase): - """Belady replacement policy (optimal offline algorithm).""" - def __init__(self, cache_size: int) -> None: ... +class FIFO(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class BeladySize(EvictionPolicyBase): - """BeladySize replacement policy (optimal offline algorithm with size consideration).""" - def __init__(self, cache_size: int) -> None: ... +class LFU(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class Cacheus(EvictionPolicyBase): - """Cacheus replacement policy.""" - def __init__(self, cache_size: int) -> None: ... +class ARC(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... + +class Clock(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... + +class Random(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... + +# Advanced algorithms +class S3FIFO(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class Clock(EvictionPolicyBase): - """Clock (Second Chance or FIFO-Reinsertion) replacement policy.""" - def __init__(self, cache_size: int, n_bit_counter: int = 1, init_freq: int = 0) -> None: ... +class Sieve(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class FIFO(EvictionPolicyBase): - """First In First Out replacement policy.""" - def __init__(self, cache_size: int) -> None: ... 
+class LIRS(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class LeCaR(EvictionPolicyBase): - """LeCaR (Learning Cache Replacement) adaptive replacement policy.""" - def __init__(self, cache_size: int) -> None: ... +class TwoQ(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class LFU(EvictionPolicyBase): - """LFU (Least Frequently Used) replacement policy.""" - def __init__(self, cache_size: int) -> None: ... +class SLRU(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class LFUDA(EvictionPolicyBase): - """LFUDA (LFU with Dynamic Aging) replacement policy.""" - def __init__(self, cache_size: int) -> None: ... +class WTinyLFU(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class LRB(EvictionPolicyBase): - """LRB (Learning Relaxed Belady) replacement policy.""" - def __init__(self, cache_size: int, objective: str = "byte-miss-ratio") -> None: ... +class LeCaR(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class LRU(EvictionPolicyBase): - """Least Recently Used replacement policy.""" - def __init__(self, cache_size: int) -> None: ... +class LFUDA(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class QDLP(EvictionPolicyBase): - """QDLP (Queue Demotion with Lazy Promotion) replacement policy.""" - def __init__(self, cache_size: int) -> None: ... 
+class ClockPro(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... -class S3FIFO(EvictionPolicyBase): - """S3FIFO replacement policy.""" +class Cacheus(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... + +# Optimal algorithms +class Belady(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... + +class BeladySize(CacheBase): + def __init__( + self, cache_size: int, default_ttl: int = 25920000, hashpower: int = 24, consider_obj_metadata: bool = False + ): ... + +# Plugin cache +class PythonHookCachePolicy(CacheBase): def __init__( self, cache_size: int, - fifo_size_ratio: float = 0.1, - ghost_size_ratio: float = 0.9, - move_to_main_threshold: int = 2, - ) -> None: ... - -class Sieve(EvictionPolicyBase): - """Sieve replacement policy.""" - def __init__(self, cache_size: int) -> None: ... - -class SLRU(EvictionPolicyBase): - """SLRU (Segmented LRU) replacement policy.""" - def __init__(self, cache_size: int) -> None: ... - -class ThreeLCache(EvictionPolicyBase): - """ThreeL cache replacement policy.""" - def __init__(self, cache_size: int, objective: str = "byte-miss-ratio") -> None: ... - -class TinyLFU(EvictionPolicyBase): - """TinyLFU replacement policy.""" - def __init__(self, cache_size: int, main_cache: str = "SLRU", window_size: float = 0.01) -> None: ... - -class TwoQ(EvictionPolicyBase): - """2Q replacement policy.""" - def __init__(self, cache_size: int, ain_size_ratio: float = 0.25, aout_size_ratio: float = 0.5) -> None: ... - -class WTinyLFU(EvictionPolicyBase): - """WTinyLFU (Windowed TinyLFU) replacement policy.""" - def __init__(self, cache_size: int, main_cache: str = "SLRU", window_size: float = 0.01) -> None: ... 
- -class PythonHookCachePolicy(EvictionPolicyBase): - """Python hook-based cache policy.""" - def __init__(self, cache_size: int, cache_name: str = "PythonHookCache") -> None: ... - def set_hooks( + cache_name: str = "PythonHookCache", + default_ttl: int = 25920000, + hashpower: int = 24, + consider_obj_metadata: bool = False, + cache_init_hook=None, + cache_hit_hook=None, + cache_miss_hook=None, + cache_eviction_hook=None, + cache_remove_hook=None, + cache_free_hook=None, + ): ... + def set_hooks(self, init_hook, hit_hook, miss_hook, eviction_hook, remove_hook, free_hook=None): ... + +# Readers +class TraceReader(ReaderProtocol): + c_reader: bool + def __init__(self, trace: str, trace_type: TraceType = TraceType.UNKNOWN_TRACE, **kwargs): ... + +class SyntheticReader(ReaderProtocol): + c_reader: bool + def __init__( self, - init_hook: Callable[[int], Any], - hit_hook: Callable[[Any, int, int], None], - miss_hook: Callable[[Any, int, int], None], - eviction_hook: Callable[[Any, int, int], int], - remove_hook: Callable[[Any, int], None], - free_hook: Optional[Callable[[Any], None]] = None, - ) -> None: ... + num_of_req: int, + obj_size: int = 4000, + time_span: int = 604800, + start_obj_id: int = 0, + seed: int | None = None, + alpha: float = 1.0, + dist: str = "zipf", + num_objects: int | None = None, + ): ... + +# Trace generators +def create_zipf_requests( + num_objects: int, + num_requests: int, + alpha: float = 1.0, + obj_size: int = 4000, + time_span: int = 604800, + start_obj_id: int = 0, + seed: int | None = None, +) -> Iterator[Request]: ... +def create_uniform_requests( + num_objects: int, + num_requests: int, + obj_size: int = 4000, + time_span: int = 604800, + start_obj_id: int = 0, + seed: int | None = None, +) -> Iterator[Request]: ... + +# Analyzer +class TraceAnalyzer: + def __init__(self, analyzer): ... + def analyze(self, reader: ReaderProtocol, output_path: str, analysis_param, analysis_option) -> None: ... 
+ +# Utilities +class Util: + @staticmethod + def convert_to_oracleGeneral(reader, ofilepath, output_txt: bool = False, remove_size_change: bool = False): ... + @staticmethod + def convert_to_lcs( + reader, ofilepath, output_txt: bool = False, remove_size_change: bool = False, lcs_ver: int = 1 + ): ... + @staticmethod + def process_trace( + cache: CacheBase, reader: ReaderProtocol, start_req: int = 0, max_req: int = -1 + ) -> tuple[float, float]: ... diff --git a/libCacheSim-python/libcachesim/cache.py b/libCacheSim-python/libcachesim/cache.py new file mode 100644 index 000000000..3f3a2bd38 --- /dev/null +++ b/libCacheSim-python/libcachesim/cache.py @@ -0,0 +1,396 @@ +from abc import ABC +from typing import Protocol +from .libcachesim_python import ( + CommonCacheParams, + Request, + CacheObject, + Cache, + # Core cache algorithms + LRU_init, + FIFO_init, + LFU_init, + ARC_init, + Clock_init, + Random_init, + LIRS_init, + TwoQ_init, + SLRU_init, + # Advanced algorithms + S3FIFO_init, + Sieve_init, + WTinyLFU_init, + LeCaR_init, + LFUDA_init, + ClockPro_init, + Cacheus_init, + # Optimal algorithms + Belady_init, + BeladySize_init, + # Probabilistic algorithms + LRU_Prob_init, + flashProb_init, + # Size-based algorithms + Size_init, + GDSF_init, + # Hyperbolic algorithms + Hyperbolic_init, + # Plugin cache + pypluginCache_init, + # Process trace function + c_process_trace, +) + +from .protocols import CacheProtocol, ReaderProtocol + + +class CacheBase(CacheProtocol): + """Base class for all cache implementations that implements CacheProtocol""" + + _cache: Cache # Internal C++ cache object + + def __init__(self, _cache: Cache): + self._cache = _cache + + def get(self, req: Request) -> bool: + return self._cache.get(req) + + def find(self, req: Request, update_cache: bool = True) -> CacheObject: + return self._cache.find(req, update_cache) + + def can_insert(self, req: Request) -> bool: + return self._cache.can_insert(req) + + def insert(self, req: Request) -> 
CacheObject: + return self._cache.insert(req) + + def need_eviction(self, req: Request) -> bool: + return self._cache.need_eviction(req) + + def evict(self, req: Request) -> CacheObject: + return self._cache.evict(req) + + def remove(self, obj_id: int) -> bool: + return self._cache.remove(obj_id) + + def to_evict(self, req: Request) -> CacheObject: + return self._cache.to_evict(req) + + def get_occupied_byte(self) -> int: + return self._cache.get_occupied_byte() + + def get_n_obj(self) -> int: + return self._cache.get_n_obj() + + def print_cache(self) -> str: + return self._cache.print_cache() + + def process_trace(self, reader: ReaderProtocol, start_req: int = 0, max_req: int = -1) -> tuple[float, float]: + """Process trace with this cache and return miss ratios""" + if hasattr(reader, "c_reader") and reader.c_reader: + # C++ reader with _reader attribute + if hasattr(reader, "_reader"): + return c_process_trace(self._cache, reader._reader, start_req, max_req) + else: + raise ValueError("C++ reader missing _reader attribute") + else: + # Python reader - use Python implementation + return self._process_trace_python(reader, start_req, max_req) + + def _process_trace_python( + self, reader: ReaderProtocol, start_req: int = 0, max_req: int = -1 + ) -> tuple[float, float]: + """Python fallback for processing traces""" + reader.reset() + if start_req > 0: + reader.skip_n_req(start_req) + + n_req = 0 + n_hit = 0 + bytes_req = 0 + bytes_hit = 0 + + for req in reader: + if not req.valid: + break + + n_req += 1 + bytes_req += req.obj_size + + if self.get(req): + n_hit += 1 + bytes_hit += req.obj_size + + if max_req > 0 and n_req >= max_req: + break + + obj_miss_ratio = 1.0 - (n_hit / n_req) if n_req > 0 else 0.0 + byte_miss_ratio = 1.0 - (bytes_hit / bytes_req) if bytes_req > 0 else 0.0 + return obj_miss_ratio, byte_miss_ratio + + # Properties + @property + def cache_size(self) -> int: + return self._cache.cache_size + + @property + def cache_name(self) -> str: + return 
self._cache.cache_name + + +def _create_common_params( + cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False +) -> CommonCacheParams: + """Helper to create common cache parameters""" + return CommonCacheParams( + cache_size=cache_size, + default_ttl=default_ttl, + hashpower=hashpower, + consider_obj_metadata=consider_obj_metadata, + ) + + +# Core cache algorithms +class LRU(CacheBase): + """Least Recently Used cache""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=LRU_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class FIFO(CacheBase): + """First In First Out cache""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=FIFO_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class LFU(CacheBase): + """Least Frequently Used cache""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=LFU_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class ARC(CacheBase): + """Adaptive Replacement Cache""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=ARC_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class Clock(CacheBase): + """Clock replacement algorithm""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=Clock_init(_create_common_params(cache_size, default_ttl, 
hashpower, consider_obj_metadata)) + ) + + +class Random(CacheBase): + """Random replacement cache""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=Random_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +# Advanced algorithms +class S3FIFO(CacheBase): + """S3-FIFO cache algorithm""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=S3FIFO_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class Sieve(CacheBase): + """Sieve cache algorithm""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=Sieve_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class LIRS(CacheBase): + """Low Inter-reference Recency Set""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=LIRS_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class TwoQ(CacheBase): + """2Q replacement algorithm""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=TwoQ_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class SLRU(CacheBase): + """Segmented LRU""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=SLRU_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + 
) + + +class WTinyLFU(CacheBase): + """Window TinyLFU""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=WTinyLFU_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class LeCaR(CacheBase): + """Learning Cache Replacement""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=LeCaR_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class LFUDA(CacheBase): + """LFU with Dynamic Aging""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=LFUDA_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class ClockPro(CacheBase): + """Clock-Pro replacement algorithm""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=ClockPro_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class Cacheus(CacheBase): + """Cacheus algorithm""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=Cacheus_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +# Optimal algorithms +class Belady(CacheBase): + """Belady's optimal algorithm""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=Belady_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +class 
BeladySize(CacheBase): + """Belady's optimal algorithm with size consideration""" + + def __init__( + self, cache_size: int, default_ttl: int = 86400 * 300, hashpower: int = 24, consider_obj_metadata: bool = False + ): + super().__init__( + _cache=BeladySize_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata)) + ) + + +# Plugin cache for custom Python implementations +def nop_method(*args, **kwargs): + """No-operation method for default hooks""" + pass + + +class PythonHookCachePolicy(CacheBase): + """Python plugin cache for custom implementations""" + + def __init__( + self, + cache_size: int, + cache_name: str = "PythonHookCache", + default_ttl: int = 86400 * 300, + hashpower: int = 24, + consider_obj_metadata: bool = False, + cache_init_hook=nop_method, + cache_hit_hook=nop_method, + cache_miss_hook=nop_method, + cache_eviction_hook=nop_method, + cache_remove_hook=nop_method, + cache_free_hook=nop_method, + ): + self.cache_name = cache_name + self.common_cache_params = _create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata) + + super().__init__( + _cache=pypluginCache_init( + self.common_cache_params, + cache_name, + cache_init_hook, + cache_hit_hook, + cache_miss_hook, + cache_eviction_hook, + cache_remove_hook, + cache_free_hook, + ) + ) + + def set_hooks(self, init_hook, hit_hook, miss_hook, eviction_hook, remove_hook, free_hook=nop_method): + """Set the cache hooks after initialization""" + # Note: This would require C++ side support to change hooks after creation + # For now, hooks should be set during initialization + pass diff --git a/libCacheSim-python/libcachesim/const.py b/libCacheSim-python/libcachesim/const.py deleted file mode 100644 index 9d48db4f9..000000000 --- a/libCacheSim-python/libcachesim/const.py +++ /dev/null @@ -1 +0,0 @@ -from __future__ import annotations diff --git a/libCacheSim-python/libcachesim/eviction.py b/libCacheSim-python/libcachesim/eviction.py deleted file mode 100644 
index 63599ec0f..000000000 --- a/libCacheSim-python/libcachesim/eviction.py +++ /dev/null @@ -1,713 +0,0 @@ -"""Registry of eviction policies.""" - -from __future__ import annotations - -from abc import ABC, abstractmethod - -from ._libcachesim import ( - ARC_init, - Belady_init, - BeladySize_init, - Cacheus_init, - Cache, - Clock_init, - FIFO_init, - LeCaR_init, - LFU_init, - LFUDA_init, - LRB_init, - LRU_init, - QDLP_init, - Reader, - Request, - S3FIFO_init, - Sieve_init, - SLRU_init, - ThreeLCache_init, - TinyLFU_init, - TwoQ_init, - WTinyLFU_init, - PythonHookCache, -) - -from .trace_generator import _ZipfRequestGenerator, _UniformRequestGenerator - -# Define generator types once to avoid repeated tuple creation -_GENERATOR_TYPES = (_ZipfRequestGenerator, _UniformRequestGenerator) - - -class EvictionPolicyBase(ABC): - """Abstract base class for all eviction policies.""" - - @abstractmethod - def get(self, req: Request) -> bool: - pass - - @abstractmethod - def __repr__(self) -> str: - pass - - @abstractmethod - def process_trace(self, reader, start_req=0, max_req=-1) -> tuple[float, float]: - """Process a trace with this cache and return miss ratio. - - This method processes trace data entirely on the C++ side to avoid - data movement overhead between Python and C++. 
- - Args: - reader: The trace reader instance - start_req: Start request index (-1 for no limit) - max_req: Number of requests to process (-1 for no limit) - - Returns: - tuple[float, float]: Object miss ratio (0.0 to 1.0) and byte miss ratio (0.0 to 1.0) - """ - pass - - -class EvictionPolicy(EvictionPolicyBase): - """Base class for all eviction policies.""" - - def __init__(self, cache_size: int, **kwargs) -> None: - self.cache: Cache = self.init_cache(cache_size, **kwargs) - - @abstractmethod - def init_cache(self, cache_size: int, **kwargs) -> Cache: - pass - - def get(self, req: Request) -> bool: - return self.cache.get(req) - - def process_trace(self, reader, start_req=0, max_req=-1) -> tuple[float, float]: - """Process a trace with this cache and return miss ratio. - - This method processes trace data entirely on the C++ side to avoid - data movement overhead between Python and C++. - - Args: - reader: The trace reader instance - start_req: Start request index (-1 for no limit) - max_req: Number of requests to process (-1 for no limit) - - Returns: - tuple[float, float]: Object miss ratio (0.0 to 1.0) and byte miss ratio (0.0 to 1.0) - Example: - >>> cache = LRU(1024*1024) - >>> reader = open_trace("trace.csv", TraceType.CSV_TRACE) - >>> obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) - >>> print(f"Obj miss ratio: {obj_miss_ratio:.4f}, byte miss ratio: {byte_miss_ratio:.4f}") - """ - obj_miss_ratio = 0.0 - byte_miss_ratio = 0.0 - if not isinstance(reader, Reader): - # streaming generator - if isinstance(reader, _GENERATOR_TYPES): - miss_cnt = 0 - byte_miss_cnt = 0 - total_byte = 0 - for req in reader: - hit = self.get(req) - total_byte += req.obj_size - if not hit: - miss_cnt += 1 - byte_miss_cnt += req.obj_size - obj_miss_ratio = miss_cnt / len(reader) if len(reader) > 0 else 0.0 - byte_miss_ratio = byte_miss_cnt / total_byte if total_byte > 0 else 0.0 - return obj_miss_ratio, byte_miss_ratio - else: - from ._libcachesim import process_trace - 
- obj_miss_ratio, byte_miss_ratio = process_trace(self.cache, reader, start_req, max_req) - - return obj_miss_ratio, byte_miss_ratio - - def __repr__(self): - return f"{self.__class__.__name__}(cache_size={self.cache.cache_size})" - - @property - def n_req(self): - """Number of requests processed.""" - return self.cache.n_req - - @property - def n_obj(self): - """Number of objects currently in cache.""" - return self.cache.n_obj - - @property - def occupied_byte(self): - """Number of bytes currently occupied in cache.""" - return self.cache.occupied_byte - - @property - def cache_size(self): - """Total cache size in bytes.""" - return self.cache.cache_size - - -class FIFO(EvictionPolicy): - """First In First Out replacement policy. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs) -> Cache: # noqa: ARG002 - return FIFO_init(cache_size) - - -class Clock(EvictionPolicy): - """Clock (Second Chance or FIFO-Reinsertion) replacement policy. - - Args: - cache_size: Size of the cache - n_bit_counter: Number of bits for counter (default: 1) - init_freq: Initial frequency value (default: 0) - """ - - def __init__(self, cache_size: int, n_bit_counter: int = 1, init_freq: int = 0): - super().__init__(cache_size, n_bit_counter=n_bit_counter, init_freq=init_freq) - - def init_cache(self, cache_size: int, **kwargs): - init_freq = kwargs.get("init_freq", 0) - n_bit_counter = kwargs.get("n_bit_counter", 1) - - if n_bit_counter < 1 or n_bit_counter > 32: - msg = "n_bit_counter must be between 1 and 32" - raise ValueError(msg) - if init_freq < 0 or init_freq > 2**n_bit_counter - 1: - msg = "init_freq must be between 0 and 2^n_bit_counter - 1" - raise ValueError(msg) - - self.init_freq = init_freq - self.n_bit_counter = n_bit_counter - - return Clock_init(cache_size, n_bit_counter, init_freq) - - def __repr__(self): - return ( - f"{self.__class__.__name__}(cache_size={self.cache.cache_size}, " - f"n_bit_counter={self.n_bit_counter}, " 
- f"init_freq={self.init_freq})" - ) - - -class TwoQ(EvictionPolicy): - """2Q replacement policy. - - 2Q has three queues: Ain, Aout, Am. When a obj hits in Aout, it will be - inserted into Am otherwise it will be inserted into Ain. - - Args: - cache_size: Total size of the cache - ain_size_ratio: Size ratio for Ain queue (default: 0.25) - aout_size_ratio: Size ratio for Aout queue (default: 0.5) - """ - - def __init__(self, cache_size: int, ain_size_ratio: float = 0.25, aout_size_ratio: float = 0.5): - super().__init__(cache_size, ain_size_ratio=ain_size_ratio, aout_size_ratio=aout_size_ratio) - - def init_cache(self, cache_size: int, **kwargs): - ain_size_ratio = kwargs.get("ain_size_ratio", 0.25) - aout_size_ratio = kwargs.get("aout_size_ratio", 0.5) - - if ain_size_ratio <= 0 or aout_size_ratio <= 0: - msg = "ain_size_ratio and aout_size_ratio must be greater than 0" - raise ValueError(msg) - - self.ain_size_ratio = ain_size_ratio - self.aout_size_ratio = aout_size_ratio - - return TwoQ_init(cache_size, ain_size_ratio, aout_size_ratio) - - def __repr__(self): - return ( - f"{self.__class__.__name__}(cache_size={self.cache.cache_size}, " - f"ain_size_ratio={self.ain_size_ratio}, " - f"aout_size_ratio={self.aout_size_ratio})" - ) - - -class LRB(EvictionPolicy): - """LRB (Learning Relaxed Belady) replacement policy. - - LRB is a learning-based replacement policy that uses a neural network to - predict the future access patterns of the cache, randomly select one obj - outside the Belady boundary to evict. 
- - Args: - cache_size: Size of the cache - objective: Objective function to optimize (default: "byte-miss-ratio") - """ - - def __init__(self, cache_size: int, objective: str = "byte-miss-ratio"): - super().__init__(cache_size, objective=objective) - - def init_cache(self, cache_size: int, **kwargs) -> Cache: - objective = kwargs.get("objective", "byte-miss-ratio") - - if objective not in ["byte-miss-ratio", "byte-hit-ratio"]: - msg = "objective must be either 'byte-miss-ratio' or 'byte-hit-ratio'" - raise ValueError(msg) - - self.objective = objective - - return LRB_init(cache_size, objective) - - def __repr__(self): - return f"{self.__class__.__name__}(cache_size={self.cache.cache_size}, objective={self.objective})" - - -class LRU(EvictionPolicy): - """Least Recently Used replacement policy. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return LRU_init(cache_size) - - -class ARC(EvictionPolicy): - """Adaptive Replacement Cache policy. - - ARC is a two-tiered cache with two LRU caches (T1 and T2) and two ghost - lists (B1 and B2). T1 records the obj accessed only once, T2 records - the obj accessed more than once. ARC has an internal parameter `p` to - learn and dynamically control the size of T1 and T2. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return ARC_init(cache_size) - - -class S3FIFO(EvictionPolicy): - """S3FIFO replacement policy. - - S3FIFO consists of three FIFO queues: Small, Main, and Ghost. Small - queue gets the obj and records the freq. - When small queue is full, if the obj to evict satisfies the threshold, - it will be moved to main queue. Otherwise, it will be evicted from small - queue and inserted into ghost queue. - When main queue is full, the obj to evict will be evicted and reinserted - like Clock. - If obj hits in the ghost queue, it will be moved to main queue. 
- - Args: - cache_size: Size of the cache - fifo_size_ratio: Size ratio for FIFO queue (default: 0.1) - ghost_size_ratio: Size ratio for ghost queue (default: 0.9) - move_to_main_threshold: Threshold for moving obj from ghost to main (default: 2) - """ - - def __init__( - self, - cache_size: int, - fifo_size_ratio: float = 0.1, - ghost_size_ratio: float = 0.9, - move_to_main_threshold: int = 2, - ): - super().__init__( - cache_size, - fifo_size_ratio=fifo_size_ratio, - ghost_size_ratio=ghost_size_ratio, - move_to_main_threshold=move_to_main_threshold, - ) - - def init_cache(self, cache_size: int, **kwargs): - fifo_size_ratio = kwargs.get("fifo_size_ratio", 0.1) - ghost_size_ratio = kwargs.get("ghost_size_ratio", 0.9) - move_to_main_threshold = kwargs.get("move_to_main_threshold", 2) - - if fifo_size_ratio <= 0 or ghost_size_ratio <= 0: - msg = "fifo_size_ratio and ghost_size_ratio must be greater than 0" - raise ValueError(msg) - if move_to_main_threshold < 0: - msg = "move_to_main_threshold must be greater or equal to 0" - raise ValueError(msg) - - self.fifo_size_ratio = fifo_size_ratio - self.ghost_size_ratio = ghost_size_ratio - self.move_to_main_threshold = move_to_main_threshold - - return S3FIFO_init(cache_size, fifo_size_ratio, ghost_size_ratio, move_to_main_threshold) - - def __repr__(self): - return ( - f"{self.__class__.__name__}(cache_size={self.cache.cache_size}, " - f"fifo_size_ratio={self.fifo_size_ratio}, " - f"ghost_size_ratio={self.ghost_size_ratio}, " - f"move_to_main_threshold={self.move_to_main_threshold})" - ) - - -class Sieve(EvictionPolicy): - """Sieve replacement policy. - - FIFO-Reinsertion with check pointer. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return Sieve_init(cache_size) - - -class ThreeLCache(EvictionPolicy): - """3L-Cache replacement policy. 
- - Args: - cache_size: Size of the cache - objective: Objective function to optimize (default: "byte-miss-ratio") - """ - - def __init__(self, cache_size: int, objective: str = "byte-miss-ratio"): - super().__init__(cache_size, objective=objective) - - def init_cache(self, cache_size: int, **kwargs): - objective = kwargs.get("objective", "byte-miss-ratio") - - if objective not in ["byte-miss-ratio", "byte-hit-ratio"]: - msg = "objective must be either 'byte-miss-ratio' or 'byte-hit-ratio'" - raise ValueError(msg) - - self.objective = objective - - return ThreeLCache_init(cache_size, objective) - - def __repr__(self): - return f"{self.__class__.__name__}(cache_size={self.cache.cache_size}, objective={self.objective})" - - -class TinyLFU(EvictionPolicy): - """TinyLFU replacement policy. - - Args: - cache_size: Size of the cache - main_cache: Main cache to use (default: "SLRU") - window_size: Window size for TinyLFU (default: 0.01) - """ - - def __init__(self, cache_size: int, main_cache: str = "SLRU", window_size: float = 0.01): - super().__init__(cache_size, main_cache=main_cache, window_size=window_size) - - def init_cache(self, cache_size: int, **kwargs): - main_cache = kwargs.get("main_cache", "SLRU") - window_size = kwargs.get("window_size", 0.01) - - if window_size <= 0: - msg = "window_size must be greater than 0" - raise ValueError(msg) - - self.main_cache = main_cache - self.window_size = window_size - - return TinyLFU_init(cache_size, main_cache, window_size) - - def __repr__(self): - return ( - f"{self.__class__.__name__}(cache_size={self.cache.cache_size}, " - f"main_cache={self.main_cache}, " - f"window_size={self.window_size})" - ) - - -class LFU(EvictionPolicy): - """LFU (Least Frequently Used) replacement policy. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return LFU_init(cache_size) - - -class LFUDA(EvictionPolicy): - """LFUDA (LFU with Dynamic Aging) replacement policy. 
- - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return LFUDA_init(cache_size) - - -class SLRU(EvictionPolicy): - """SLRU (Segmented LRU) replacement policy. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return SLRU_init(cache_size) - - -class Belady(EvictionPolicy): - """Belady replacement policy (optimal offline algorithm). - - Note: Requires oracle trace with future access information. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return Belady_init(cache_size) - - -class BeladySize(EvictionPolicy): - """BeladySize replacement policy (optimal offline algorithm with size consideration). - - Note: Requires oracle trace with future access information. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return BeladySize_init(cache_size) - - -class QDLP(EvictionPolicy): - """QDLP (Queue Demotion with Lazy Promotion) replacement policy. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return QDLP_init(cache_size) - - -class LeCaR(EvictionPolicy): - """LeCaR (Learning Cache Replacement) adaptive replacement policy. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return LeCaR_init(cache_size) - - -class Cacheus(EvictionPolicy): - """Cacheus replacement policy. - - Args: - cache_size: Size of the cache - """ - - def init_cache(self, cache_size: int, **kwargs): # noqa: ARG002 - return Cacheus_init(cache_size) - - -class WTinyLFU(EvictionPolicy): - """WTinyLFU (Windowed TinyLFU) replacement policy. 
- - Args: - cache_size: Size of the cache - main_cache: Main cache to use (default: "SLRU") - window_size: Window size for TinyLFU (default: 0.01) - """ - - def __init__(self, cache_size: int, main_cache: str = "SLRU", window_size: float = 0.01): - super().__init__(cache_size, main_cache=main_cache, window_size=window_size) - - def init_cache(self, cache_size: int, **kwargs): - main_cache = kwargs.get("main_cache", "SLRU") - window_size = kwargs.get("window_size", 0.01) - - if window_size <= 0: - msg = "window_size must be greater than 0" - raise ValueError(msg) - - self.main_cache = main_cache - self.window_size = window_size - - return WTinyLFU_init(cache_size, main_cache, window_size) - - def __repr__(self): - return ( - f"{self.__class__.__name__}(cache_size={self.cache.cache_size}, " - f"main_cache={self.main_cache}, " - f"window_size={self.window_size})" - ) - - -class PythonHookCachePolicy(EvictionPolicyBase): - """Python hook-based cache that allows defining custom policies using Python functions. - - This cache implementation allows users to define custom cache replacement algorithms - using pure Python functions instead of compiling C/C++ plugins. Users provide hook - functions for cache initialization, hit handling, miss handling, eviction decisions, - and cleanup. - - Args: - cache_size: Size of the cache in bytes - cache_name: Optional name for the cache (default: "PythonHookCache") - - Hook Functions Required: - init_hook(cache_size: int) -> Any: - Initialize plugin data structures. Return any object to be passed to other hooks. - - hit_hook(plugin_data: Any, obj_id: int, obj_size: int) -> None: - Handle cache hit events. Update internal state as needed. - - miss_hook(plugin_data: Any, obj_id: int, obj_size: int) -> None: - Handle cache miss events. Update internal state for new object. - - eviction_hook(plugin_data: Any, obj_id: int, obj_size: int) -> int: - Determine which object to evict. Return the object ID to be evicted. 
- - remove_hook(plugin_data: Any, obj_id: int) -> None: - Clean up when objects are removed from cache. - - free_hook(plugin_data: Any) -> None: [Optional] - Clean up plugin resources when cache is destroyed. - - Example: - >>> from collections import OrderedDict - >>> - >>> cache = PythonHookCachePolicy(1024) - >>> - >>> def init_hook(cache_size): - ... return OrderedDict() # LRU tracking - >>> - >>> def hit_hook(lru_dict, obj_id, obj_size): - ... lru_dict.move_to_end(obj_id) # Move to end (most recent) - >>> - >>> def miss_hook(lru_dict, obj_id, obj_size): - ... lru_dict[obj_id] = True # Add to end - >>> - >>> def eviction_hook(lru_dict, obj_id, obj_size): - ... return next(iter(lru_dict)) # Return least recent - >>> - >>> def remove_hook(lru_dict, obj_id): - ... lru_dict.pop(obj_id, None) - >>> - >>> cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - >>> - >>> req = Request() - >>> req.obj_id = 1 - >>> req.obj_size = 100 - >>> hit = cache.get(req) - """ - - def __init__(self, cache_size: int, cache_name: str = "PythonHookCache"): - self._cache_size = cache_size - self.cache_name = cache_name - self.cache = PythonHookCache(cache_size, cache_name) - self._hooks_set = False - - def set_hooks(self, init_hook, hit_hook, miss_hook, eviction_hook, remove_hook, free_hook=None): - """Set the hook functions for the cache. - - Args: - init_hook: Function called during cache initialization - hit_hook: Function called on cache hit - miss_hook: Function called on cache miss - eviction_hook: Function called to select eviction candidate - remove_hook: Function called when object is removed - free_hook: Optional function called during cache cleanup - """ - self.cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook, free_hook) - self._hooks_set = True - - def get(self, req: Request) -> bool: - """Process a cache request. 
- - Args: - req: The cache request to process - - Returns: - True if cache hit, False if cache miss - - Raises: - RuntimeError: If hooks have not been set - """ - if not self._hooks_set: - raise RuntimeError("Hooks must be set before using the cache. Call set_hooks() first.") - return self.cache.get(req) - - def process_trace(self, reader, start_req=0, max_req=-1) -> tuple[float, float]: - """Process a trace with this cache and return miss ratio. - - This method processes trace data entirely on the C++ side to avoid - data movement overhead between Python and C++. - - Args: - reader: The trace reader instance - start_req: Start request index (-1 for no limit) - n_req: Number of requests to process (-1 for no limit) - - Returns: - tuple[float, float]: Object miss ratio (0.0 to 1.0) and byte miss ratio (0.0 to 1.0) - - Raises: - RuntimeError: If hooks have not been set - - Example: - >>> cache = PythonHookCachePolicy(1024*1024) - >>> cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - >>> reader = open_trace("trace.csv", TraceType.CSV_TRACE) - >>> obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) - >>> print(f"Obj miss ratio: {obj_miss_ratio:.4f}, byte miss ratio: {byte_miss_ratio:.4f}") - """ - if not self._hooks_set: - raise RuntimeError("Hooks must be set before processing trace. 
Call set_hooks() first.") - obj_miss_ratio = 0.0 - byte_miss_ratio = 0.0 - if not isinstance(reader, Reader): - # streaming generator - if isinstance(reader, _GENERATOR_TYPES): - miss_cnt = 0 - byte_miss_cnt = 0 - total_byte = 0 - for req in reader: - hit = self.get(req) - total_byte += req.obj_size - if not hit: - miss_cnt += 1 - byte_miss_cnt += req.obj_size - obj_miss_ratio = miss_cnt / len(reader) if len(reader) > 0 else 0.0 - byte_miss_ratio = byte_miss_cnt / total_byte if total_byte > 0 else 0.0 - return obj_miss_ratio, byte_miss_ratio - else: - from ._libcachesim import process_trace_python_hook - - obj_miss_ratio, byte_miss_ratio = process_trace_python_hook(self.cache, reader, start_req, max_req) - return obj_miss_ratio, byte_miss_ratio - - @property - def n_req(self): - """Number of requests processed.""" - return self.cache.n_req - - @property - def n_obj(self): - """Number of objects currently in cache.""" - return self.cache.n_obj - - @property - def occupied_byte(self): - """Number of bytes currently occupied in cache.""" - return self.cache.occupied_byte - - @property - def cache_size(self): - """Total cache size in bytes.""" - return self.cache.cache_size - - def __repr__(self): - return ( - f"{self.__class__.__name__}(cache_size={self._cache_size}, " - f"cache_name='{self.cache_name}', hooks_set={self._hooks_set})" - ) diff --git a/libCacheSim-python/libcachesim/protocols.py b/libCacheSim-python/libcachesim/protocols.py new file mode 100644 index 000000000..d362946a0 --- /dev/null +++ b/libCacheSim-python/libcachesim/protocols.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from typing import Protocol, TYPE_CHECKING + +if TYPE_CHECKING: + from .libcachesim_python import Request, CacheObject, Reader, Analyzer + + +class CacheProtocol(Protocol): + def get(self, req: Request) -> bool: ... + + def find(self, req: Request, update_cache: bool = True) -> CacheObject: ... + + def can_insert(self, req: Request) -> bool: ... 
+ + def insert(self, req: Request) -> CacheObject: ... + + def need_eviction(self, req: Request) -> bool: ... + + def evict(self, req: Request) -> CacheObject: ... + + def remove(self, obj_id: int) -> bool: ... + + def to_evict(self, req: Request) -> CacheObject: ... + + def get_occupied_byte(self) -> int: ... + + def get_n_obj(self) -> int: ... + + def print_cache(self) -> str: ... + + def process_trace(self, reader: "ReaderProtocol", start_req: int = 0, max_req: int = -1) -> tuple[float, float]: ... + + # Properties + @property + def cache_size(self) -> int: ... + + @property + def cache_name(self) -> str: ... + + +class ReaderProtocol(Protocol): + def get_num_of_req(self) -> int: ... + + def read_one_req(self, req: Request) -> Request: ... + + def reset(self) -> None: ... + + def close(self) -> None: ... + + def clone(self) -> ReaderProtocol: ... + + def read_first_req(self, req: Request) -> Request: ... + + def read_last_req(self, req: Request) -> Request: ... + + def skip_n_req(self, n: int) -> int: ... + + def read_one_req_above(self, req: Request) -> Request: ... + + def go_back_one_req(self) -> None: ... + + def set_read_pos(self, pos: float) -> None: ... + + def get_read_pos(self) -> float: ... + + +class AnalyzerProtocol(Protocol): + def run(self) -> None: ... + + def cleanup(self) -> None: ... diff --git a/libCacheSim-python/libcachesim/synthetic_reader.py b/libCacheSim-python/libcachesim/synthetic_reader.py new file mode 100644 index 000000000..c9d3575fc --- /dev/null +++ b/libCacheSim-python/libcachesim/synthetic_reader.py @@ -0,0 +1,408 @@ +""" +Trace generator module for libCacheSim Python bindings. + +This module provides functions to generate synthetic traces with different distributions. 
+""" + +import numpy as np +import random +from typing import Optional, Union, Any +from collections.abc import Iterator +from .libcachesim_python import Request, ReqOp + +from .protocols import ReaderProtocol + + +class SyntheticReader(ReaderProtocol): + """Efficient synthetic request generator supporting multiple distributions""" + + def __init__( + self, + num_of_req: int, + obj_size: int = 4000, + time_span: int = 86400 * 7, + start_obj_id: int = 0, + seed: Optional[int] = None, + alpha: float = 1.0, + dist: str = "zipf", + num_objects: Optional[int] = None, + ): + """ + Initialize synthetic reader. + + Args: + num_of_req: Number of requests to generate + obj_size: Object size in bytes + time_span: Time span in seconds + start_obj_id: Starting object ID + seed: Random seed for reproducibility + alpha: Zipf skewness parameter (only for dist="zipf") + dist: Distribution type ("zipf" or "uniform") + num_objects: Number of unique objects (defaults to num_of_req) + """ + if num_of_req <= 0: + raise ValueError("num_of_req must be positive") + if obj_size <= 0: + raise ValueError("obj_size must be positive") + if time_span <= 0: + raise ValueError("time_span must be positive") + if alpha < 0: + raise ValueError("alpha must be non-negative") + if dist not in ["zipf", "uniform"]: + raise ValueError(f"Unsupported distribution: {dist}") + + self.num_of_req = num_of_req + self.obj_size = obj_size + self.time_span = time_span + self.start_obj_id = start_obj_id + self.seed = seed + self.alpha = alpha + self.dist = dist + self.num_objects = num_objects or num_of_req + self.current_pos = 0 + + # Set the reader type - this is a Python reader, not C++ + self.c_reader = False + + # Set random seed for reproducibility + if seed is not None: + np.random.seed(seed) + random.seed(seed) + + # Lazy generation: generate object IDs only when needed + self._obj_ids: Optional[np.ndarray] = None + + @property + def obj_ids(self) -> np.ndarray: + """Lazy generation of object ID array""" + if 
self._obj_ids is None: + if self.dist == "zipf": + self._obj_ids = _gen_zipf(self.num_objects, self.alpha, self.num_of_req, self.start_obj_id) + elif self.dist == "uniform": + self._obj_ids = _gen_uniform(self.num_objects, self.num_of_req, self.start_obj_id) + return self._obj_ids + + def get_num_of_req(self) -> int: + return self.num_of_req + + def read_one_req(self, req: Request) -> Request: + """Read one request and fill Request object""" + if self.current_pos >= self.num_of_req: + req.valid = False + return req + + obj_id = self.obj_ids[self.current_pos] + req.obj_id = obj_id + req.obj_size = self.obj_size + req.clock_time = self.current_pos * self.time_span // self.num_of_req + req.op = ReqOp.OP_NOP + req.valid = True + + self.current_pos += 1 + return req + + def reset(self) -> None: + """Reset read position to beginning""" + self.current_pos = 0 + + def close(self) -> None: + """Close reader and release resources""" + self._obj_ids = None + + def clone(self) -> "SyntheticReader": + """Create a copy of the reader""" + return SyntheticReader( + num_of_req=self.num_of_req, + obj_size=self.obj_size, + time_span=self.time_span, + start_obj_id=self.start_obj_id, + seed=self.seed, + alpha=self.alpha, + dist=self.dist, + num_objects=self.num_objects, + ) + + def read_first_req(self, req: Request) -> Request: + """Read the first request""" + if self.num_of_req == 0: + req.valid = False + return req + + obj_id = self.obj_ids[0] + req.obj_id = obj_id + req.obj_size = self.obj_size + req.clock_time = 0 + req.op = ReqOp.OP_NOP + req.valid = True + return req + + def read_last_req(self, req: Request) -> Request: + """Read the last request""" + if self.num_of_req == 0: + req.valid = False + return req + + obj_id = self.obj_ids[-1] + req.obj_id = obj_id + req.obj_size = self.obj_size + req.clock_time = (self.num_of_req - 1) * self.time_span // self.num_of_req + req.op = ReqOp.OP_NOP + req.valid = True + return req + + def skip_n_req(self, n: int) -> int: + """Skip n 
requests""" + self.current_pos = min(self.current_pos + n, self.num_of_req) + return self.current_pos + + def read_one_req_above(self, req: Request) -> Request: + """Read one request above current position""" + if self.current_pos + 1 >= self.num_of_req: + req.valid = False + return req + + obj_id = self.obj_ids[self.current_pos + 1] + req.obj_id = obj_id + req.obj_size = self.obj_size + req.clock_time = (self.current_pos + 1) * self.time_span // self.num_of_req + req.op = ReqOp.OP_NOP + req.valid = True + return req + + def go_back_one_req(self) -> None: + """Go back one request""" + self.current_pos = max(0, self.current_pos - 1) + + def set_read_pos(self, pos: float) -> None: + """Set read position""" + self.current_pos = max(0, min(int(pos), self.num_of_req)) + + def get_read_pos(self) -> float: + """Get current read position""" + return float(self.current_pos) + + def __iter__(self) -> Iterator[Request]: + """Iterator implementation""" + self.reset() + return self + + def __len__(self) -> int: + return self.num_of_req + + def __next__(self) -> Request: + """Next element for iterator""" + if self.current_pos >= self.num_of_req: + raise StopIteration + + req = Request() + return self.read_one_req(req) + + def __getitem__(self, index: int) -> Request: + """Support index access""" + if index < 0 or index >= self.num_of_req: + raise IndexError("Index out of range") + + req = Request() + obj_id = self.obj_ids[index] + req.obj_id = obj_id + req.obj_size = self.obj_size + req.clock_time = index * self.time_span // self.num_of_req + req.op = ReqOp.OP_NOP + req.valid = True + return req + + +def _gen_zipf(m: int, alpha: float, n: int, start: int = 0) -> np.ndarray: + """Generate Zipf-distributed workload. 
+ + Args: + m: Number of objects + alpha: Skewness parameter (alpha >= 0) + n: Number of requests + start: Starting object ID + + Returns: + Array of object IDs following Zipf distribution + """ + if m <= 0 or n <= 0: + raise ValueError("num_objects and num_requests must be positive") + if alpha < 0: + raise ValueError("alpha must be non-negative") + + # Optimization: for alpha=0 (uniform), use uniform distribution directly + if alpha == 0: + return _gen_uniform(m, n, start) + + # Calculate Zipf distribution PMF + np_tmp = np.power(np.arange(1, m + 1), -alpha) + np_zeta = np.cumsum(np_tmp) + dist_map = np_zeta / np_zeta[-1] + + # Generate random samples + r = np.random.uniform(0, 1, n) + return np.searchsorted(dist_map, r) + start + + +def _gen_uniform(m: int, n: int, start: int = 0) -> np.ndarray: + """Generate uniform-distributed workload. + + Args: + m: Number of objects + n: Number of requests + start: Starting object ID + + Returns: + Array of object IDs following uniform distribution + """ + if m <= 0 or n <= 0: + raise ValueError("num_objects and num_requests must be positive") + return np.random.randint(0, m, n) + start + + +class _BaseRequestGenerator: + """Base class for request generators to reduce code duplication""" + + def __init__( + self, + num_objects: int, + num_requests: int, + obj_size: int = 4000, + time_span: int = 86400 * 7, + start_obj_id: int = 0, + seed: Optional[int] = None, + ): + """Initialize base request generator.""" + if num_objects <= 0 or num_requests <= 0: + raise ValueError("num_objects and num_requests must be positive") + if obj_size <= 0: + raise ValueError("obj_size must be positive") + if time_span <= 0: + raise ValueError("time_span must be positive") + + self.num_requests = num_requests + self.obj_size = obj_size + self.time_span = time_span + + # Set random seed + if seed is not None: + np.random.seed(seed) + random.seed(seed) + + # Subclasses must implement this method + self.obj_ids = 
self._generate_obj_ids(num_objects, num_requests, start_obj_id) + + def _generate_obj_ids(self, num_objects: int, num_requests: int, start_obj_id: int) -> np.ndarray: + """Subclasses must implement this method to generate object IDs""" + raise NotImplementedError("Subclasses must implement _generate_obj_ids") + + def __iter__(self) -> Iterator[Request]: + """Iterate over generated requests""" + for i, obj_id in enumerate(self.obj_ids): + req = Request() + req.clock_time = i * self.time_span // self.num_requests + req.obj_id = obj_id + req.obj_size = self.obj_size + req.op = ReqOp.OP_NOP + req.valid = True + yield req + + def __len__(self) -> int: + """Return number of requests""" + return self.num_requests + + +class _ZipfRequestGenerator(_BaseRequestGenerator): + """Zipf-distributed request generator""" + + def __init__( + self, + num_objects: int, + num_requests: int, + alpha: float = 1.0, + obj_size: int = 4000, + time_span: int = 86400 * 7, + start_obj_id: int = 0, + seed: Optional[int] = None, + ): + """Initialize Zipf request generator.""" + if alpha < 0: + raise ValueError("alpha must be non-negative") + self.alpha = alpha + super().__init__(num_objects, num_requests, obj_size, time_span, start_obj_id, seed) + + def _generate_obj_ids(self, num_objects: int, num_requests: int, start_obj_id: int) -> np.ndarray: + """Generate Zipf-distributed object IDs""" + return _gen_zipf(num_objects, self.alpha, num_requests, start_obj_id) + + +class _UniformRequestGenerator(_BaseRequestGenerator): + """Uniform-distributed request generator""" + + def _generate_obj_ids(self, num_objects: int, num_requests: int, start_obj_id: int) -> np.ndarray: + """Generate uniformly-distributed object IDs""" + return _gen_uniform(num_objects, num_requests, start_obj_id) + + +def create_zipf_requests( + num_objects: int, + num_requests: int, + alpha: float = 1.0, + obj_size: int = 4000, + time_span: int = 86400 * 7, + start_obj_id: int = 0, + seed: Optional[int] = None, +) -> 
_ZipfRequestGenerator: + """Create a Zipf-distributed request generator. + + Args: + num_objects: Number of unique objects + num_requests: Number of requests to generate + alpha: Zipf skewness parameter (alpha >= 0) + obj_size: Object size in bytes + time_span: Time span in seconds + start_obj_id: Starting object ID + seed: Random seed for reproducibility + + Returns: + Generator that yields Request objects + """ + return _ZipfRequestGenerator( + num_objects=num_objects, + num_requests=num_requests, + alpha=alpha, + obj_size=obj_size, + time_span=time_span, + start_obj_id=start_obj_id, + seed=seed, + ) + + +def create_uniform_requests( + num_objects: int, + num_requests: int, + obj_size: int = 4000, + time_span: int = 86400 * 7, + start_obj_id: int = 0, + seed: Optional[int] = None, +) -> _UniformRequestGenerator: + """Create a uniform-distributed request generator. + + Args: + num_objects: Number of unique objects + num_requests: Number of requests to generate + obj_size: Object size in bytes + time_span: Time span in seconds + start_obj_id: Starting object ID + seed: Random seed for reproducibility + + Returns: + Generator that yields Request objects + """ + return _UniformRequestGenerator( + num_objects=num_objects, + num_requests=num_requests, + obj_size=obj_size, + time_span=time_span, + start_obj_id=start_obj_id, + seed=seed, + ) diff --git a/libCacheSim-python/libcachesim/trace_analyzer.py b/libCacheSim-python/libcachesim/trace_analyzer.py new file mode 100644 index 000000000..bf598a71b --- /dev/null +++ b/libCacheSim-python/libcachesim/trace_analyzer.py @@ -0,0 +1,29 @@ +"""Wrapper of Analyzer""" + +from .protocols import ReaderProtocol, AnalyzerProtocol + +from .libcachesim_python import ( + Analyzer, + AnalysisOption, + AnalysisParam, +) + + +class TraceAnalyzer(AnalyzerProtocol): + _analyzer: Analyzer + + def __init__( + self, + analyzer: Analyzer, + reader: "ReaderProtocol", + output_path: str, + analysis_param: AnalysisParam, + analysis_option: 
AnalysisOption, + ): + self._analyzer = Analyzer(reader._reader, output_path, analysis_param, analysis_option) + + def run(self) -> None: + self._analyzer.run() + + def cleanup(self) -> None: + self._analyzer.cleanup() diff --git a/libCacheSim-python/libcachesim/trace_generator.py b/libCacheSim-python/libcachesim/trace_generator.py deleted file mode 100644 index 8c5802243..000000000 --- a/libCacheSim-python/libcachesim/trace_generator.py +++ /dev/null @@ -1,215 +0,0 @@ -""" -Trace generator module for libCacheSim Python bindings. - -This module provides functions to generate synthetic traces with different distributions. -""" - -import numpy as np -import random -from typing import Optional -from collections.abc import Iterator -from ._libcachesim import Request, ReqOp - - -def _gen_zipf(m: int, alpha: float, n: int, start: int = 0) -> np.ndarray: - """Generate zipf distributed workload (internal function). - - Args: - m (int): The number of objects - alpha (float): The skewness parameter (alpha >= 0) - n (int): The number of requests - start (int, optional): Start object ID. Defaults to 0. - - Returns: - np.ndarray: Array of object IDs following Zipf distribution - """ - if m <= 0 or n <= 0: - raise ValueError("num_objects and num_requests must be positive") - if alpha < 0: - raise ValueError("alpha must be non-negative") - np_tmp = np.power(np.arange(1, m + 1), -alpha) - np_zeta = np.cumsum(np_tmp) - dist_map = np_zeta / np_zeta[-1] - r = np.random.uniform(0, 1, n) - return np.searchsorted(dist_map, r) + start - - -def _gen_uniform(m: int, n: int, start: int = 0) -> np.ndarray: - """Generate uniform distributed workload (internal function). - - Args: - m (int): The number of objects - n (int): The number of requests - start (int, optional): Start object ID. Defaults to 0.
- - Returns: - np.ndarray: Array of object IDs following uniform distribution - """ - if m <= 0 or n <= 0: - raise ValueError("num_objects and num_requests must be positive") - return np.random.uniform(0, m, n).astype(int) + start - - -class _ZipfRequestGenerator: - """Zipf-distributed request generator (internal class).""" - - def __init__( - self, - num_objects: int, - num_requests: int, - alpha: float = 1.0, - obj_size: int = 4000, - time_span: int = 86400 * 7, - start_obj_id: int = 0, - seed: Optional[int] = None, - ): - """Initialize Zipf request generator. - - Args: - num_objects (int): Number of unique objects - num_requests (int): Number of requests to generate - alpha (float): Zipf skewness parameter (alpha >= 0) - obj_size (int): Object size in bytes - time_span (int): Time span in seconds - start_obj_id (int): Starting object ID - seed (int, optional): Random seed for reproducibility - """ - self.num_requests = num_requests - self.obj_size = obj_size - self.time_span = time_span - - # Set random seed if provided - if seed is not None: - np.random.seed(seed) - random.seed(seed) - - # Pre-generate object IDs - self.obj_ids = _gen_zipf(num_objects, alpha, num_requests, start_obj_id) - - def __iter__(self) -> Iterator[Request]: - """Iterate over generated requests.""" - for i, obj_id in enumerate(self.obj_ids): - req = Request() - req.clock_time = i * self.time_span // self.num_requests - req.obj_id = obj_id - req.obj_size = self.obj_size - req.op = ReqOp.READ # Default operation - yield req - - def __len__(self) -> int: - """Return the number of requests.""" - return self.num_requests - - -class _UniformRequestGenerator: - """Uniform-distributed request generator (internal class).""" - - def __init__( - self, - num_objects: int, - num_requests: int, - obj_size: int = 4000, - time_span: int = 86400 * 7, - start_obj_id: int = 0, - seed: Optional[int] = None, - ): - """Initialize uniform request generator. 
- - Args: - num_objects (int): Number of unique objects - num_requests (int): Number of requests to generate - obj_size (int): Object size in bytes - time_span (int): Time span in seconds - start_obj_id (int): Starting object ID - seed (int, optional): Random seed for reproducibility - """ - self.num_requests = num_requests - self.obj_size = obj_size - self.time_span = time_span - - # Set random seed if provided - if seed is not None: - np.random.seed(seed) - random.seed(seed) - - # Pre-generate object IDs - self.obj_ids = _gen_uniform(num_objects, num_requests, start_obj_id) - - def __iter__(self) -> Iterator[Request]: - """Iterate over generated requests.""" - for i, obj_id in enumerate(self.obj_ids): - req = Request() - req.clock_time = i * self.time_span // self.num_requests - req.obj_id = obj_id - req.obj_size = self.obj_size - req.op = ReqOp.READ # Default operation - yield req - - def __len__(self) -> int: - """Return the number of requests.""" - return self.num_requests - - -def create_zipf_requests( - num_objects: int, - num_requests: int, - alpha: float = 1.0, - obj_size: int = 4000, - time_span: int = 86400 * 7, - start_obj_id: int = 0, - seed: Optional[int] = None, -) -> _ZipfRequestGenerator: - """Create a Zipf-distributed request generator. 
- - Args: - num_objects (int): Number of unique objects - num_requests (int): Number of requests to generate - alpha (float): Zipf skewness parameter (alpha >= 0) - obj_size (int): Object size in bytes - time_span (int): Time span in seconds - start_obj_id (int): Starting object ID - seed (int, optional): Random seed for reproducibility - - Returns: - Generator: A generator that yields Request objects - """ - return _ZipfRequestGenerator( - num_objects=num_objects, - num_requests=num_requests, - alpha=alpha, - obj_size=obj_size, - time_span=time_span, - start_obj_id=start_obj_id, - seed=seed, - ) - - -def create_uniform_requests( - num_objects: int, - num_requests: int, - obj_size: int = 4000, - time_span: int = 86400 * 7, - start_obj_id: int = 0, - seed: Optional[int] = None, -) -> _UniformRequestGenerator: - """Create a uniform-distributed request generator. - - Args: - num_objects (int): Number of unique objects - num_requests (int): Number of requests to generate - obj_size (int): Object size in bytes - time_span (int): Time span in seconds - start_obj_id (int): Starting object ID - seed (int, optional): Random seed for reproducibility - - Returns: - Generator: A generator that yields Request objects - """ - return _UniformRequestGenerator( - num_objects=num_objects, - num_requests=num_requests, - obj_size=obj_size, - time_span=time_span, - start_obj_id=start_obj_id, - seed=seed, - ) diff --git a/libCacheSim-python/libcachesim/trace_reader.py b/libCacheSim-python/libcachesim/trace_reader.py new file mode 100644 index 000000000..d37dead20 --- /dev/null +++ b/libCacheSim-python/libcachesim/trace_reader.py @@ -0,0 +1,251 @@ +"""Wrapper of Reader""" + +import logging +from typing import overload, Union +from collections.abc import Iterator + +from .protocols import ReaderProtocol + +from .libcachesim_python import TraceType, SamplerType, Request, ReaderInitParam, Reader, Sampler, ReadDirection + + +class TraceReader(ReaderProtocol): + _reader: Reader + + # Mark 
this as a C++ reader for c_process_trace compatibility + c_reader: bool = True + + @overload + def __init__(self, trace: Reader) -> None: ... + + def __init__( + self, + trace: Union[Reader, str], + trace_type: TraceType = TraceType.UNKNOWN_TRACE, + ignore_obj_size: bool = False, + ignore_size_zero_req: bool = False, + obj_id_is_num: bool = False, + obj_id_is_num_set: bool = False, + cap_at_n_req: int = -1, + block_size: int = 0, + has_header: bool = False, + has_header_set: bool = False, + delimiter: str = ",", + trace_start_offset: int = 0, + binary_fmt_str: str = "", + sampling_ratio: float = 1.0, + sampling_type: SamplerType = SamplerType.INVALID_SAMPLER, + ): + if isinstance(trace, Reader): + self._reader = trace + return + + # Process sampling_type + if sampling_ratio < 0.0 or sampling_ratio > 1.0: + raise ValueError("Sampling ratio must be between 0.0 and 1.0") + + if sampling_ratio == 1.0: + sampler = None + else: + if sampling_type == SamplerType.INVALID_SAMPLER: + logging.warning("Sampling type is invalid, using SPATIAL_SAMPLER instead") + sampling_type = SamplerType.SPATIAL_SAMPLER + logging.info(f"Sampling ratio: {sampling_ratio}, Sampling type: {sampling_type}") + sampler = Sampler(sampling_ratio, sampling_type) + + # Construct ReaderInitParam + reader_init_params = ReaderInitParam( + binary_fmt_str=binary_fmt_str, + ignore_obj_size=ignore_obj_size, + ignore_size_zero_req=ignore_size_zero_req, + obj_id_is_num=obj_id_is_num, + obj_id_is_num_set=obj_id_is_num_set, + cap_at_n_req=cap_at_n_req, + block_size=block_size, + has_header=has_header, + has_header_set=has_header_set, + delimiter=delimiter, + trace_start_offset=trace_start_offset, + sampler=sampler, + ) + + self._reader = Reader(trace, trace_type, reader_init_params) + + @property + def n_read_req(self) -> int: + return self._reader.n_read_req + + @property + def n_total_req(self) -> int: + return self._reader.n_total_req + + @property + def trace_path(self) -> str: + return self._reader.trace_path 
+ + @property + def file_size(self) -> int: + return self._reader.file_size + + @property + def init_params(self) -> ReaderInitParam: + return self._reader.init_params + + @property + def trace_type(self) -> TraceType: + return self._reader.trace_type + + @property + def trace_format(self) -> str: + return self._reader.trace_format + + @property + def ver(self) -> int: + return self._reader.ver + + @property + def cloned(self) -> bool: + return self._reader.cloned + + @property + def cap_at_n_req(self) -> int: + return self._reader.cap_at_n_req + + @property + def trace_start_offset(self) -> int: + return self._reader.trace_start_offset + + @property + def mapped_file(self) -> bool: + return self._reader.mapped_file + + @property + def mmap_offset(self) -> int: + return self._reader.mmap_offset + + @property + def is_zstd_file(self) -> bool: + return self._reader.is_zstd_file + + @property + def item_size(self) -> int: + return self._reader.item_size + + @property + def line_buf(self) -> str: + return self._reader.line_buf + + @property + def line_buf_size(self) -> int: + return self._reader.line_buf_size + + @property + def csv_delimiter(self) -> str: + return self._reader.csv_delimiter + + @property + def csv_has_header(self) -> bool: + return self._reader.csv_has_header + + @property + def obj_id_is_num(self) -> bool: + return self._reader.obj_id_is_num + + @property + def obj_id_is_num_set(self) -> bool: + return self._reader.obj_id_is_num_set + + @property + def ignore_size_zero_req(self) -> bool: + return self._reader.ignore_size_zero_req + + @property + def ignore_obj_size(self) -> bool: + return self._reader.ignore_obj_size + + @property + def block_size(self) -> int: + return self._reader.block_size + + @ignore_size_zero_req.setter + def ignore_size_zero_req(self, value: bool) -> None: + self._reader.ignore_size_zero_req = value + + @ignore_obj_size.setter + def ignore_obj_size(self, value: bool) -> None: + self._reader.ignore_obj_size = value + + 
@block_size.setter + def block_size(self, value: int) -> None: + self._reader.block_size = value + + @property + def n_req_left(self) -> int: + return self._reader.n_req_left + + @property + def last_req_clock_time(self) -> int: + return self._reader.last_req_clock_time + + @property + def lcs_ver(self) -> int: + return self._reader.lcs_ver + + @property + def sampler(self) -> Sampler: + return self._reader.sampler + + @property + def read_direction(self) -> ReadDirection: + return self._reader.read_direction + + def get_num_of_req(self) -> int: + return self._reader.get_num_of_req() + + def read_one_req(self, req: Request) -> Request: + return self._reader.read_one_req(req) + + def reset(self) -> None: + self._reader.reset() + + def close(self) -> None: + self._reader.close() + + def clone(self) -> "TraceReader": + return TraceReader(self._reader.clone()) + + def read_first_req(self, req: Request) -> Request: + return self._reader.read_first_req(req) + + def read_last_req(self, req: Request) -> Request: + return self._reader.read_last_req(req) + + def skip_n_req(self, n: int) -> int: + return self._reader.skip_n_req(n) + + def read_one_req_above(self) -> Request: + return self._reader.read_one_req_above() + + def go_back_one_req(self) -> None: + self._reader.go_back_one_req() + + def set_read_pos(self, pos: float) -> None: + self._reader.set_read_pos(pos) + + def __iter__(self) -> Iterator[Request]: + return self._reader.__iter__() + + def __len__(self) -> int: + return self._reader.get_num_of_req() + + def __next__(self) -> Request: + if self._reader.n_req_left == 0: + raise StopIteration + return self._reader.read_one_req() + + def __getitem__(self, index: int) -> Request: + if index < 0 or index >= self._reader.get_num_of_req(): + raise IndexError("Index out of range") + self._reader.reset() + self._reader.skip_n_req(index) + return self._reader.read_one_req() diff --git a/libCacheSim-python/libcachesim/util.py b/libCacheSim-python/libcachesim/util.py new file 
mode 100644 index 000000000..0f80a7fb2 --- /dev/null +++ b/libCacheSim-python/libcachesim/util.py @@ -0,0 +1,50 @@ +"""Wrapper misc functions""" + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .protocols import CacheProtocol, ReaderProtocol + +from .libcachesim_python import convert_to_oracleGeneral, convert_to_lcs, c_process_trace + + +class Util: + @staticmethod + def convert_to_oracleGeneral(reader, ofilepath, output_txt=False, remove_size_change=False): + return convert_to_oracleGeneral(reader, ofilepath, output_txt, remove_size_change) + + @staticmethod + def convert_to_lcs(reader, ofilepath, output_txt=False, remove_size_change=False, lcs_ver=1): + """ + Convert a trace to LCS format. + + Args: + reader: The reader to convert. + ofilepath: The path to the output file. + output_txt: Whether to output the trace in text format. + remove_size_change: Whether to remove the size change field. + lcs_ver: The version of LCS format (1, 2, 3, 4, 5, 6, 7, 8). + """ + return convert_to_lcs(reader, ofilepath, output_txt, remove_size_change, lcs_ver) + + @staticmethod + def process_trace( + cache: "CacheProtocol", reader: "ReaderProtocol", start_req: int = 0, max_req: int = -1 + ) -> tuple[float, float]: + """ + Process a trace with a cache. + + Args: + cache: The cache to process the trace with. + reader: The reader to read the trace from. + start_req: The starting request to process. + max_req: The maximum number of requests to process. + + Returns: + tuple[float, float]: The object miss ratio and byte miss ratio. 
+ """ + # Check if reader is C++ reader + if not hasattr(reader, "c_reader") or not reader.c_reader: + raise ValueError("Reader must be a C++ reader") + + return c_process_trace(cache._cache, reader._reader, start_req, max_req) diff --git a/libCacheSim-python/src/exception.cpp b/libCacheSim-python/src/exception.cpp new file mode 100644 index 000000000..078d9c4c0 --- /dev/null +++ b/libCacheSim-python/src/exception.cpp @@ -0,0 +1,56 @@ +// libcachesim_python - libCacheSim Python bindings +// Copyright 2025 The libcachesim Authors. All rights reserved. +// +// Use of this source code is governed by a GPL-3.0 +// license that can be found in the LICENSE file or at +// https://github.com/1a1a11a/libcachesim/blob/develop/LICENSE + +#include "exception.h" + +#include + +namespace libcachesim { + +namespace py = pybind11; + +void register_exception(py::module& m) { + static py::exception exc_cache(m, "CacheException"); + static py::exception exc_reader(m, "ReaderException"); + + py::register_exception_translator([](std::exception_ptr p) { + try { + if (p) std::rethrow_exception(p); + } catch (const CacheException& e) { + py::set_error(exc_cache, e.what()); + } catch (const ReaderException& e) { + py::set_error(exc_reader, e.what()); + } + }); + + py::register_exception_translator([](std::exception_ptr p) { + try { + if (p) std::rethrow_exception(p); + } catch (const std::bad_alloc& e) { + PyErr_SetString(PyExc_MemoryError, e.what()); + } catch (const std::invalid_argument& e) { + PyErr_SetString(PyExc_ValueError, e.what()); + } catch (const std::out_of_range& e) { + PyErr_SetString(PyExc_IndexError, e.what()); + } catch (const std::domain_error& e) { + PyErr_SetString(PyExc_ValueError, + ("Domain error: " + std::string(e.what())).c_str()); + } catch (const std::overflow_error& e) { + PyErr_SetString(PyExc_OverflowError, e.what()); + } catch (const std::range_error& e) { + PyErr_SetString(PyExc_ValueError, + ("Range error: " + std::string(e.what())).c_str()); + } catch 
(const std::runtime_error& e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + } catch (const std::exception& e) { + PyErr_SetString(PyExc_RuntimeError, + ("C++ exception: " + std::string(e.what())).c_str()); + } + }); +} + +} // namespace libcachesim diff --git a/libCacheSim-python/src/exception.h b/libCacheSim-python/src/exception.h new file mode 100644 index 000000000..2749ae078 --- /dev/null +++ b/libCacheSim-python/src/exception.h @@ -0,0 +1,33 @@ +// libcachesim_python - libCacheSim Python bindings +// Copyright 2025 The libcachesim Authors. All rights reserved. +// +// Use of this source code is governed by a GPL-3.0 +// license that can be found in the LICENSE file or at +// https://github.com/1a1a11a/libcachesim/blob/develop/LICENSE + +#pragma once + +#include + +#include +#include + +namespace libcachesim { + +namespace py = pybind11; + +class CacheException : public std::runtime_error { + public: + explicit CacheException(const std::string& message) + : std::runtime_error("CacheException: " + message) {} +}; + +class ReaderException : public std::runtime_error { + public: + explicit ReaderException(const std::string& message) + : std::runtime_error("ReaderException: " + message) {} +}; + +void register_exception(py::module& m); + +} // namespace libcachesim diff --git a/libCacheSim-python/src/export.cpp b/libCacheSim-python/src/export.cpp new file mode 100644 index 000000000..0ef8d8334 --- /dev/null +++ b/libCacheSim-python/src/export.cpp @@ -0,0 +1,38 @@ +// libcachesim_python - libCacheSim Python bindings +// Copyright 2025 The libcachesim Authors. All rights reserved. 
+// +// Use of this source code is governed by a GPL-3.0 +// license that can be found in the LICENSE file or at +// https://github.com/1a1a11a/libcachesim/blob/develop/LICENSE + +#include "export.h" + +#include "exception.h" + +#define STRINGIFY(x) #x +#define MACRO_STRINGIFY(x) STRINGIFY(x) + +namespace libcachesim { + +PYBIND11_MODULE(libcachesim_python, m) { + m.doc() = "libcachesim_python"; + + // NOTE(haocheng): can use decentralized interface holder to export all the + // methods if the codebase is large enough + + export_cache(m); + export_reader(m); + export_analyzer(m); + export_misc(m); + + // NOTE(haocheng): register exception to make it available in Python + register_exception(m); + +#ifdef VERSION_INFO + m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO); +#else + m.attr("__version__") = "dev"; +#endif +} + +} // namespace libcachesim diff --git a/libCacheSim-python/src/export.h b/libCacheSim-python/src/export.h new file mode 100644 index 000000000..121ff97b1 --- /dev/null +++ b/libCacheSim-python/src/export.h @@ -0,0 +1,27 @@ +// libcachesim_python - libCacheSim Python bindings +// Copyright 2025 The libcachesim Authors. All rights reserved. 
+// +// Use of this source code is governed by a GPL-3.0 +// license that can be found in the LICENSE file or at +// https://github.com/1a1a11a/libcachesim/blob/develop/LICENSE + +#pragma once + +#include "pybind11/operators.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace libcachesim { + +namespace py = pybind11; + +using py::literals::operator""_a; + +void export_cache(py::module &m); +void export_pyplugin_cache(py::module &m); + +void export_reader(py::module &m); +void export_analyzer(py::module &m); +void export_misc(py::module &m); + +} // namespace libcachesim diff --git a/libCacheSim-python/src/export_analyzer.cpp b/libCacheSim-python/src/export_analyzer.cpp new file mode 100644 index 000000000..0d8fd6680 --- /dev/null +++ b/libCacheSim-python/src/export_analyzer.cpp @@ -0,0 +1,136 @@ +// libcachesim_python - libCacheSim Python bindings +// Copyright 2025 The libcachesim Authors. All rights reserved. +// +// Use of this source code is governed by a GPL-3.0 +// license that can be found in the LICENSE file or at +// https://github.com/1a1a11a/libcachesim/blob/develop/LICENSE + +#include +#include +#include + +#include +#include + +#include "../libCacheSim/traceAnalyzer/analyzer.h" +#include "export.h" +#include "libCacheSim/cache.h" +#include "libCacheSim/reader.h" +#include "libCacheSim/request.h" + +namespace libcachesim { + +namespace py = pybind11; + +// Custom deleters for smart pointers +struct AnalysisParamDeleter { + void operator()(traceAnalyzer::analysis_param_t* ptr) const { + if (ptr != nullptr) free(ptr); + } +}; + +struct AnalysisOptionDeleter { + void operator()(traceAnalyzer::analysis_option_t* ptr) const { + if (ptr != nullptr) free(ptr); + } +}; + +void export_analyzer(py::module& m) { + py::class_< + traceAnalyzer::analysis_param_t, + std::unique_ptr>( + m, "AnalysisParam") + .def(py::init([](int access_pattern_sample_ratio_inv, int track_n_popular, + int track_n_hit, int time_window, int warmup_time) { + 
traceAnalyzer::analysis_param_t params; + params.access_pattern_sample_ratio_inv = + access_pattern_sample_ratio_inv; + params.track_n_popular = track_n_popular; + params.track_n_hit = track_n_hit; + params.time_window = time_window; + params.warmup_time = warmup_time; + return std::unique_ptr( + new traceAnalyzer::analysis_param_t(params)); + }), + "access_pattern_sample_ratio_inv"_a = 10, "track_n_popular"_a = 10, + "track_n_hit"_a = 5, "time_window"_a = 60, "warmup_time"_a = 0) + .def_readwrite( + "access_pattern_sample_ratio_inv", + &traceAnalyzer::analysis_param_t::access_pattern_sample_ratio_inv) + .def_readwrite("track_n_popular", + &traceAnalyzer::analysis_param_t::track_n_popular) + .def_readwrite("track_n_hit", + &traceAnalyzer::analysis_param_t::track_n_hit) + .def_readwrite("time_window", + &traceAnalyzer::analysis_param_t::time_window) + .def_readwrite("warmup_time", + &traceAnalyzer::analysis_param_t::warmup_time); + + py::class_< + traceAnalyzer::analysis_option_t, + std::unique_ptr>( + m, "AnalysisOption") + .def( + py::init([](bool req_rate, bool access_pattern, bool size, bool reuse, + bool popularity, bool ttl, bool popularity_decay, + bool lifetime, bool create_future_reuse_ccdf, + bool prob_at_age, bool size_change) { + traceAnalyzer::analysis_option_t option; + option.req_rate = req_rate; + option.access_pattern = access_pattern; + option.size = size; + option.reuse = reuse; + option.popularity = popularity; + option.ttl = ttl; + option.popularity_decay = popularity_decay; + option.lifetime = lifetime; + option.create_future_reuse_ccdf = create_future_reuse_ccdf; + option.prob_at_age = prob_at_age; + option.size_change = size_change; + return std::unique_ptr( + new traceAnalyzer::analysis_option_t(option)); + }), + "req_rate"_a = false, "access_pattern"_a = false, "size"_a = false, + "reuse"_a = false, "popularity"_a = false, "ttl"_a = false, + "popularity_decay"_a = false, "lifetime"_a = false, + "create_future_reuse_ccdf"_a = false, 
"prob_at_age"_a = false, + "size_change"_a = false) + .def_readwrite("req_rate", &traceAnalyzer::analysis_option_t::req_rate) + .def_readwrite("access_pattern", + &traceAnalyzer::analysis_option_t::access_pattern) + .def_readwrite("size", &traceAnalyzer::analysis_option_t::size) + .def_readwrite("reuse", &traceAnalyzer::analysis_option_t::reuse) + .def_readwrite("popularity", + &traceAnalyzer::analysis_option_t::popularity) + .def_readwrite("ttl", &traceAnalyzer::analysis_option_t::ttl) + .def_readwrite("popularity_decay", + &traceAnalyzer::analysis_option_t::popularity_decay) + .def_readwrite("lifetime", &traceAnalyzer::analysis_option_t::lifetime) + .def_readwrite( + "create_future_reuse_ccdf", + &traceAnalyzer::analysis_option_t::create_future_reuse_ccdf) + .def_readwrite("prob_at_age", + &traceAnalyzer::analysis_option_t::prob_at_age) + .def_readwrite("size_change", + &traceAnalyzer::analysis_option_t::size_change); + + py::class_>(m, "Analyzer") + .def(py::init([](reader_t* reader, std::string output_path, + const traceAnalyzer::analysis_param_t& param, + const traceAnalyzer::analysis_option_t& option) { + traceAnalyzer::TraceAnalyzer* analyzer = + new traceAnalyzer::TraceAnalyzer(reader, output_path, option, + param); + return std::unique_ptr(analyzer); + }), + "reader"_a, "output_path"_a, + "param"_a = traceAnalyzer::default_param(), + "option"_a = traceAnalyzer::default_option()) + .def("run", &traceAnalyzer::TraceAnalyzer::run) + .def("cleanup", &traceAnalyzer::TraceAnalyzer::cleanup); +} + +} // namespace libcachesim diff --git a/libCacheSim-python/src/export_cache.cpp b/libCacheSim-python/src/export_cache.cpp new file mode 100644 index 000000000..3868866cc --- /dev/null +++ b/libCacheSim-python/src/export_cache.cpp @@ -0,0 +1,493 @@ +// libcachesim_python - libCacheSim Python bindings +// Export cache core functions and classes +// Copyright 2025 The libcachesim Authors. All rights reserved. 
+// +// Use of this source code is governed by a GPL-3.0 +// license that can be found in the LICENSE file or at +// https://github.com/1a1a11a/libcachesim/blob/develop/LICENSE + +#include +#include +#include + +#include +#include + +#include "config.h" +#include "dataStructure/hashtable/hashtable.h" +#include "export.h" +#include "libCacheSim/cache.h" +#include "libCacheSim/cacheObj.h" +#include "libCacheSim/enum.h" +#include "libCacheSim/evictionAlgo.h" +#include "libCacheSim/plugin.h" +#include "libCacheSim/request.h" + +namespace libcachesim { + +namespace py = pybind11; + +// Custom deleters for smart pointers +struct CacheDeleter { + void operator()(cache_t* ptr) const { + if (ptr != nullptr) ptr->cache_free(ptr); + } +}; + +struct CommonCacheParamsDeleter { + void operator()(common_cache_params_t* ptr) const { + if (ptr != nullptr) { + delete ptr; // Simple delete for POD struct + } + } +}; + +struct CacheObjectDeleter { + void operator()(cache_obj_t* ptr) const { + if (ptr != nullptr) free_cache_obj(ptr); + } +}; + +struct RequestDeleter { + void operator()(request_t* ptr) const { + if (ptr != nullptr) free_request(ptr); + } +}; + +// *********************************************************************** +// **** Python plugin cache implementation BEGIN **** +// *********************************************************************** + +typedef struct pypluginCache_params { + py::object data; ///< Plugin's internal data structure (python object) + py::function cache_init_hook; + py::function cache_hit_hook; + py::function cache_miss_hook; + py::function cache_eviction_hook; + py::function cache_remove_hook; + py::function cache_free_hook; + std::string cache_name; +} pypluginCache_params_t; + +static void pypluginCache_free(cache_t* cache); +static bool pypluginCache_get(cache_t* cache, const request_t* req); +static cache_obj_t* pypluginCache_find(cache_t* cache, const request_t* req, + const bool update_cache); +static cache_obj_t* 
pypluginCache_insert(cache_t* cache, const request_t* req); +static cache_obj_t* pypluginCache_to_evict(cache_t* cache, + const request_t* req); +static void pypluginCache_evict(cache_t* cache, const request_t* req); +static bool pypluginCache_remove(cache_t* cache, const obj_id_t obj_id); + +cache_t* pypluginCache_init( + const common_cache_params_t ccache_params, std::string cache_name, + py::function cache_init_hook, py::function cache_hit_hook, + py::function cache_miss_hook, py::function cache_eviction_hook, + py::function cache_remove_hook, py::function cache_free_hook) { + // Initialize base cache structure + cache_t* cache = cache_struct_init(cache_name.c_str(), ccache_params, NULL); + + // Set function pointers for cache operations + cache->cache_init = NULL; + cache->cache_free = pypluginCache_free; + cache->get = pypluginCache_get; + cache->find = pypluginCache_find; + cache->insert = pypluginCache_insert; + cache->evict = pypluginCache_evict; + cache->remove = pypluginCache_remove; + cache->to_evict = pypluginCache_to_evict; + cache->get_occupied_byte = cache_get_occupied_byte_default; + cache->get_n_obj = cache_get_n_obj_default; + cache->can_insert = cache_can_insert_default; + cache->obj_md_size = 0; + + // Allocate and initialize plugin parameters + pypluginCache_params_t* params = new pypluginCache_params_t(); + params->cache_name = cache_name; + params->cache_init_hook = cache_init_hook; + params->cache_hit_hook = cache_hit_hook; + params->cache_miss_hook = cache_miss_hook; + params->cache_eviction_hook = cache_eviction_hook; + params->cache_remove_hook = cache_remove_hook; + params->cache_free_hook = cache_free_hook; + params->data = cache_init_hook(ccache_params); + + cache->eviction_params = params; + + return cache; +} + +static void pypluginCache_free(cache_t* cache) { + pypluginCache_params_t* params = + (pypluginCache_params_t*)cache->eviction_params; + + if (!params->cache_free_hook.is_none()) { + params->cache_free_hook(params->data); + } 
+ delete params; + cache_struct_free(cache); +} + +static bool pypluginCache_get(cache_t* cache, const request_t* req) { + bool hit = cache_get_base(cache, req); + pypluginCache_params_t* params = + (pypluginCache_params_t*)cache->eviction_params; + + if (hit) { + params->cache_hit_hook(params->data, req); + } else { + params->cache_miss_hook(params->data, req); + } + + return hit; +} + +static cache_obj_t* pypluginCache_find(cache_t* cache, const request_t* req, + const bool update_cache) { + return cache_find_base(cache, req, update_cache); +} + +static cache_obj_t* pypluginCache_insert(cache_t* cache, const request_t* req) { + return cache_insert_base(cache, req); +} + +static cache_obj_t* pypluginCache_to_evict(cache_t* cache, + const request_t* req) { + throw std::runtime_error("pypluginCache does not support to_evict function"); +} + +static void pypluginCache_evict(cache_t* cache, const request_t* req) { + pypluginCache_params_t* params = + (pypluginCache_params_t*)cache->eviction_params; + + // Get eviction candidate from plugin + py::object result = params->cache_eviction_hook(params->data, req); + obj_id_t obj_id = result.cast(); + + // Find the object in the cache + cache_obj_t* obj_to_evict = hashtable_find_obj_id(cache->hashtable, obj_id); + if (obj_to_evict == NULL) { + throw std::runtime_error("pypluginCache: object " + std::to_string(obj_id) + + " to be evicted not found in cache"); + } + + // Perform the eviction + cache_evict_base(cache, obj_to_evict, true); +} + +static bool pypluginCache_remove(cache_t* cache, const obj_id_t obj_id) { + pypluginCache_params_t* params = + (pypluginCache_params_t*)cache->eviction_params; + + // Notify plugin of the removal + params->cache_remove_hook(params->data, obj_id); + + // Find the object in the cache + cache_obj_t* obj = hashtable_find_obj_id(cache->hashtable, obj_id); + if (obj == NULL) { + return false; + } + + // Remove the object from the cache + cache_remove_obj_base(cache, obj, true); + return true; 
+} + +// *********************************************************************** +// **** Python plugin cache implementation END **** +// *********************************************************************** + +// Templates +template +auto make_cache_wrapper(const std::string& fn_name) { + return [=](py::module_& m) { + m.def( + fn_name.c_str(), + [](const common_cache_params_t& cc_params, + const std::string& cache_specific_params) { + const char* params_cstr = cache_specific_params.empty() + ? nullptr + : cache_specific_params.c_str(); + cache_t* ptr = InitFn(cc_params, params_cstr); + return std::unique_ptr(ptr); + }, + "cc_params"_a, "cache_specific_params"_a = ""); + }; +} + +void export_cache(py::module& m) { + /** + * @brief Cache structure + */ + py::class_>(m, "Cache") + .def_readonly("cache_size", &cache_t::cache_size) + .def_readonly("default_ttl", &cache_t::default_ttl) + .def_readonly("obj_md_size", &cache_t::obj_md_size) + .def_readonly("n_req", &cache_t::n_req) + .def_readonly("cache_name", &cache_t::cache_name) + .def_readonly("init_params", &cache_t::init_params) + .def( + "get", + [](cache_t& self, const request_t& req) { + return self.get(&self, &req); + }, + "req"_a) + .def( + "find", + [](cache_t& self, const request_t& req, const bool update_cache) { + return self.find(&self, &req, update_cache); + }, + "req"_a, "update_cache"_a = true) + .def( + "can_insert", + [](cache_t& self, const request_t& req) { + return self.can_insert(&self, &req); + }, + "req"_a) + .def( + "insert", + [](cache_t& self, const request_t& req) { + return self.insert(&self, &req); + }, + "req"_a) + .def( + "need_eviction", + [](cache_t& self, const request_t& req) { + return self.need_eviction(&self, &req); + }, + "req"_a) + .def( + "evict", + [](cache_t& self, const request_t& req) { + return self.evict(&self, &req); + }, + "req"_a) + .def( + "remove", + [](cache_t& self, obj_id_t obj_id) { + return self.remove(&self, obj_id); + }, + "obj_id"_a) + .def( + "to_evict", 
+ [](cache_t& self, const request_t& req) { + return self.to_evict(&self, &req); + }, + "req"_a) + .def("get_occupied_byte", + [](cache_t& self) { return self.get_occupied_byte(&self); }) + .def("get_n_obj", [](cache_t& self) { return self.get_n_obj(&self); }) + .def("print_cache", [](cache_t& self) { + // Capture stdout to return as string + std::ostringstream captured_output; + std::streambuf* orig = std::cout.rdbuf(); + std::cout.rdbuf(captured_output.rdbuf()); + + self.print_cache(&self); + + // Restore original stdout + std::cout.rdbuf(orig); + return captured_output.str(); + }); + + /** + * @brief Common cache parameters + */ + py::class_>( + m, "CommonCacheParams") + .def(py::init([](uint64_t cache_size, uint64_t default_ttl, + int32_t hashpower, bool consider_obj_metadata) { + common_cache_params_t* params = new common_cache_params_t(); + params->cache_size = cache_size; + params->default_ttl = default_ttl; + params->hashpower = hashpower; + params->consider_obj_metadata = consider_obj_metadata; + return params; + }), + "cache_size"_a, "default_ttl"_a = 86400 * 300, "hashpower"_a = 24, + "consider_obj_metadata"_a = false) + .def_readwrite("cache_size", &common_cache_params_t::cache_size) + .def_readwrite("default_ttl", &common_cache_params_t::default_ttl) + .def_readwrite("hashpower", &common_cache_params_t::hashpower) + .def_readwrite("consider_obj_metadata", + &common_cache_params_t::consider_obj_metadata); + + /** + * @brief Cache object + * + * TODO: full support for cache object + */ + py::class_>( + m, "CacheObject") + .def_readonly("obj_id", &cache_obj_t::obj_id) + .def_readonly("obj_size", &cache_obj_t::obj_size); + + /** + * @brief Request operation enumeration + */ + py::enum_(m, "ReqOp") + .value("OP_NOP", OP_NOP) + .value("OP_GET", OP_GET) + .value("OP_GETS", OP_GETS) + .value("OP_SET", OP_SET) + .value("OP_ADD", OP_ADD) + .value("OP_CAS", OP_CAS) + .value("OP_REPLACE", OP_REPLACE) + .value("OP_APPEND", OP_APPEND) + .value("OP_PREPEND", 
OP_PREPEND) + .value("OP_DELETE", OP_DELETE) + .value("OP_INCR", OP_INCR) + .value("OP_DECR", OP_DECR) + .value("OP_READ", OP_READ) + .value("OP_WRITE", OP_WRITE) + .value("OP_UPDATE", OP_UPDATE) + .value("OP_INVALID", OP_INVALID) + .export_values(); + + /** + * @brief Request structure + */ + py::class_>(m, + "Request") + .def(py::init([](int64_t obj_size, req_op_e op, bool valid, + obj_id_t obj_id, int64_t clock_time, uint64_t hv, + int64_t next_access_vtime, int32_t ttl) { + request_t* req = new_request(); + req->obj_size = obj_size; + req->op = op; + req->valid = valid; + req->obj_id = obj_id; + req->clock_time = clock_time; + req->hv = hv; + req->next_access_vtime = next_access_vtime; + req->ttl = ttl; + return req; + }), + "obj_size"_a = 1, "op"_a = OP_NOP, "valid"_a = true, "obj_id"_a = 0, + "clock_time"_a = 0, "hv"_a = 0, "next_access_vtime"_a = -2, + "ttl"_a = 0) + .def_readwrite("clock_time", &request_t::clock_time) + .def_readwrite("hv", &request_t::hv) + .def_readwrite("obj_id", &request_t::obj_id) + .def_readwrite("obj_size", &request_t::obj_size) + .def_readwrite("ttl", &request_t::ttl) + .def_readwrite("op", &request_t::op) + .def_readwrite("valid", &request_t::valid) + .def_readwrite("next_access_vtime", &request_t::next_access_vtime); + + /** + * @brief Generic function to create a cache instance. + * + * TODO: add support for general cache creation and add support for cache + * specific parameters this is a backup for cache creation in python. 
+ */ + + // Cache algorithm initialization functions + + make_cache_wrapper("ARC_init")(m); + make_cache_wrapper("ARCv0_init")(m); + make_cache_wrapper("CAR_init")(m); + make_cache_wrapper("Cacheus_init")(m); + make_cache_wrapper("Clock_init")(m); + make_cache_wrapper("ClockPro_init")(m); + make_cache_wrapper("FIFO_init")(m); + make_cache_wrapper("FIFO_Merge_init")(m); + make_cache_wrapper("flashProb_init")(m); + make_cache_wrapper("GDSF_init")(m); + make_cache_wrapper("LHD_init")(m); + make_cache_wrapper("LeCaR_init")(m); + make_cache_wrapper("LeCaRv0_init")(m); + make_cache_wrapper("LFU_init")(m); + make_cache_wrapper("LFUCpp_init")(m); + make_cache_wrapper("LFUDA_init")(m); + make_cache_wrapper("LIRS_init")(m); + make_cache_wrapper("LRU_init")(m); + make_cache_wrapper("LRU_Prob_init")(m); + make_cache_wrapper("nop_init")(m); + + make_cache_wrapper("QDLP_init")(m); + make_cache_wrapper("Random_init")(m); + make_cache_wrapper("RandomLRU_init")(m); + make_cache_wrapper("RandomTwo_init")(m); + make_cache_wrapper("S3FIFO_init")(m); + make_cache_wrapper("S3FIFOv0_init")(m); + make_cache_wrapper("S3FIFOd_init")(m); + make_cache_wrapper("Sieve_init")(m); + make_cache_wrapper("Size_init")(m); + make_cache_wrapper("SLRU_init")(m); + make_cache_wrapper("SLRUv0_init")(m); + make_cache_wrapper("TwoQ_init")(m); + make_cache_wrapper("WTinyLFU_init")(m); + make_cache_wrapper("Hyperbolic_init")(m); + make_cache_wrapper("Belady_init")(m); + make_cache_wrapper("BeladySize_init")(m); + +#ifdef ENABLE_3L_CACHE + make_cache_wrapper("ThreeLCache_init")(m); +#endif + +#ifdef ENABLE_GLCACHE + make_cache_wrapper("GLCache_init")(m); +#endif + +#ifdef ENABLE_LRB + make_cache_wrapper("LRB_init")(m); +#endif + + // *********************************************************************** + // **** **** + // **** Python plugin cache bindings **** + // **** **** + // *********************************************************************** + + m.def("pypluginCache_init", &pypluginCache_init, 
"cc_params"_a, + "cache_name"_a, "cache_init_hook"_a, "cache_hit_hook"_a, + "cache_miss_hook"_a, "cache_eviction_hook"_a, "cache_remove_hook"_a, + "cache_free_hook"_a); + // *********************************************************************** + // **** **** + // **** end functions for python plugin **** + // **** **** + // *********************************************************************** + + m.def( + "c_process_trace", + [](cache_t& cache, reader_t& reader, int64_t start_req = 0, + int64_t max_req = -1) { + reset_reader(&reader); + if (start_req > 0) { + skip_n_req(&reader, start_req); + } + + request_t* req = new_request(); + int64_t n_req = 0, n_hit = 0; + int64_t bytes_req = 0, bytes_hit = 0; + bool hit; + + read_one_req(&reader, req); + while (req->valid) { + n_req += 1; + bytes_req += req->obj_size; + hit = cache.get(&cache, req); + if (hit) { + n_hit += 1; + bytes_hit += req->obj_size; + } + read_one_req(&reader, req); + if (max_req > 0 && n_req >= max_req) { + break; // Stop if we reached the max request limit + } + } + + free_request(req); + // return the miss ratio + double obj_miss_ratio = n_req > 0 ? 1.0 - (double)n_hit / n_req : 0.0; + double byte_miss_ratio = + bytes_req > 0 ? 1.0 - (double)bytes_hit / bytes_req : 0.0; + return std::make_tuple(obj_miss_ratio, byte_miss_ratio); + }, + "cache"_a, "reader"_a, "start_req"_a = 0, "max_req"_a = -1); +} + +} // namespace libcachesim diff --git a/libCacheSim-python/src/export_misc.cpp b/libCacheSim-python/src/export_misc.cpp new file mode 100644 index 000000000..08000590f --- /dev/null +++ b/libCacheSim-python/src/export_misc.cpp @@ -0,0 +1,30 @@ +// libcachesim_python - libCacheSim Python bindings +// Copyright 2025 The libcachesim Authors. All rights reserved. 
+// +// Use of this source code is governed by a GPL-3.0 +// license that can be found in the LICENSE file or at +// https://github.com/1a1a11a/libcachesim/blob/develop/LICENSE + +#include + +#include "../libCacheSim/bin/traceUtils/internal.hpp" +#include "export.h" + +namespace libcachesim { + +namespace py = pybind11; + +void export_misc(py::module& m) { + // NOTE(haocheng): Here we provide some convertion functions and utilities + // - convert_to_oracleGeneral + // - convert_to_lcs: v1 to v8 (default v1) + + m.def("convert_to_oracleGeneral", &traceConv::convert_to_oracleGeneral, + "reader"_a, "ofilepath"_a, "output_txt"_a = false, + "remove_size_change"_a = false); + m.def("convert_to_lcs", &traceConv::convert_to_lcs, "reader"_a, "ofilepath"_a, + "output_txt"_a = false, "remove_size_change"_a = false, + "lcs_ver"_a = 1); +} + +} // namespace libcachesim diff --git a/libCacheSim-python/src/export_reader.cpp b/libCacheSim-python/src/export_reader.cpp new file mode 100644 index 000000000..f9c3789b6 --- /dev/null +++ b/libCacheSim-python/src/export_reader.cpp @@ -0,0 +1,312 @@ +// libcachesim_python - libCacheSim Python bindings +// Copyright 2025 The libcachesim Authors. All rights reserved. 
+// +// Use of this source code is governed by a GPL-3.0 +// license that can be found in the LICENSE file or at +// https://github.com/1a1a11a/libcachesim/blob/develop/LICENSE + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "cli_reader_utils.h" +#include "config.h" +#include "export.h" +#include "libCacheSim/enum.h" +#include "libCacheSim/reader.h" +#include "libCacheSim/request.h" +#include "mystr.h" + +namespace libcachesim { + +namespace py = pybind11; + +// Custom deleters for smart pointers +struct ReaderDeleter { + void operator()(reader_t* ptr) const { + if (ptr != nullptr) close_trace(ptr); + } +}; + +struct RequestDeleter { + void operator()(request_t* ptr) const { + if (ptr != nullptr) free_request(ptr); + } +}; + +struct ReaderInitParamDeleter { + void operator()(reader_init_param_t* ptr) const { + if (ptr != nullptr) free(ptr); + } +}; + +struct SamplerDeleter { + void operator()(sampler_t* ptr) const { + if (ptr != nullptr && ptr->free != nullptr) { + ptr->free(ptr); + } + } +}; + +void export_reader(py::module& m) { + // Sampler type enumeration + py::enum_(m, "SamplerType") + .value("SPATIAL_SAMPLER", sampler_type::SPATIAL_SAMPLER) + .value("TEMPORAL_SAMPLER", sampler_type::TEMPORAL_SAMPLER) + .value("SHARDS_SAMPLER", sampler_type::SHARDS_SAMPLER) + .value("INVALID_SAMPLER", sampler_type::INVALID_SAMPLER) + .export_values(); + + // Trace type enumeration + py::enum_(m, "TraceType") + .value("CSV_TRACE", trace_type_e::CSV_TRACE) + .value("BIN_TRACE", trace_type_e::BIN_TRACE) + .value("PLAIN_TXT_TRACE", trace_type_e::PLAIN_TXT_TRACE) + .value("ORACLE_GENERAL_TRACE", trace_type_e::ORACLE_GENERAL_TRACE) + .value("LCS_TRACE", trace_type_e::LCS_TRACE) + .value("VSCSI_TRACE", trace_type_e::VSCSI_TRACE) + .value("TWR_TRACE", trace_type_e::TWR_TRACE) + .value("TWRNS_TRACE", trace_type_e::TWRNS_TRACE) + .value("ORACLE_SIM_TWR_TRACE", trace_type_e::ORACLE_SIM_TWR_TRACE) + .value("ORACLE_SYS_TWR_TRACE", 
trace_type_e::ORACLE_SYS_TWR_TRACE) + .value("ORACLE_SIM_TWRNS_TRACE", trace_type_e::ORACLE_SIM_TWRNS_TRACE) + .value("ORACLE_SYS_TWRNS_TRACE", trace_type_e::ORACLE_SYS_TWRNS_TRACE) + .value("VALPIN_TRACE", trace_type_e::VALPIN_TRACE) + .value("UNKNOWN_TRACE", trace_type_e::UNKNOWN_TRACE) + .export_values(); + + py::enum_(m, "ReadDirection") + .value("READ_FORWARD", read_direction::READ_FORWARD) + .value("READ_BACKWARD", read_direction::READ_BACKWARD) + .export_values(); + + /** + * @brief Sampler structure + */ + py::class_>(m, + "Sampler") + .def(py::init([](double sample_ratio, enum sampler_type type) + -> std::unique_ptr { + switch (type) { + case sampler_type::SPATIAL_SAMPLER: + return std::unique_ptr( + create_spatial_sampler(sample_ratio)); + case sampler_type::TEMPORAL_SAMPLER: + return std::unique_ptr( + create_temporal_sampler(sample_ratio)); + case sampler_type::SHARDS_SAMPLER: + throw std::invalid_argument("SHARDS_SAMPLER is not added"); + case sampler_type::INVALID_SAMPLER: + default: + throw std::invalid_argument("Unknown sampler type"); + } + }), + "sample_ratio"_a = 0.1, "type"_a = sampler_type::INVALID_SAMPLER) + .def_readwrite("sampling_ratio_inv", &sampler_t::sampling_ratio_inv) + .def_readwrite("sampling_ratio", &sampler_t::sampling_ratio) + .def_readwrite("sampling_salt", &sampler_t::sampling_salt) + .def_readwrite("sampling_type", &sampler_t::type); + + // Reader initialization parameters + py::class_(m, "ReaderInitParam") + .def(py::init([]() { return default_reader_init_params(); })) + .def(py::init([](const std::string& binary_fmt_str, bool ignore_obj_size, + bool ignore_size_zero_req, bool obj_id_is_num, + bool obj_id_is_num_set, int64_t cap_at_n_req, + int64_t block_size, bool has_header, bool has_header_set, + const std::string& delimiter, ssize_t trace_start_offset, + sampler_t* sampler) { + reader_init_param_t params = default_reader_init_params(); + if (!binary_fmt_str.empty()) { + params.binary_fmt_str = 
strdup(binary_fmt_str.c_str()); + } + params.ignore_obj_size = ignore_obj_size; + params.ignore_size_zero_req = ignore_size_zero_req; + params.obj_id_is_num = obj_id_is_num; + params.obj_id_is_num_set = obj_id_is_num_set; + params.cap_at_n_req = cap_at_n_req; + params.block_size = block_size; + params.has_header = has_header; + params.has_header_set = has_header_set; + params.delimiter = delimiter.empty() ? ',' : delimiter[0]; + params.trace_start_offset = trace_start_offset; + params.sampler = sampler; + return params; + }), + "binary_fmt_str"_a = "", "ignore_obj_size"_a = false, + "ignore_size_zero_req"_a = true, "obj_id_is_num"_a = true, + "obj_id_is_num_set"_a = false, "cap_at_n_req"_a = -1, + "block_size"_a = -1, "has_header"_a = false, + "has_header_set"_a = false, "delimiter"_a = ",", + "trace_start_offset"_a = 0, "sampler"_a = nullptr) + .def_readwrite("ignore_obj_size", &reader_init_param_t::ignore_obj_size) + .def_readwrite("ignore_size_zero_req", + &reader_init_param_t::ignore_size_zero_req) + .def_readwrite("obj_id_is_num", &reader_init_param_t::obj_id_is_num) + .def_readwrite("obj_id_is_num_set", + &reader_init_param_t::obj_id_is_num_set) + .def_readwrite("cap_at_n_req", &reader_init_param_t::cap_at_n_req) + .def_readwrite("time_field", &reader_init_param_t::time_field) + .def_readwrite("obj_id_field", &reader_init_param_t::obj_id_field) + .def_readwrite("obj_size_field", &reader_init_param_t::obj_size_field) + .def_readwrite("op_field", &reader_init_param_t::op_field) + .def_readwrite("ttl_field", &reader_init_param_t::ttl_field) + .def_readwrite("cnt_field", &reader_init_param_t::cnt_field) + .def_readwrite("tenant_field", &reader_init_param_t::tenant_field) + .def_readwrite("next_access_vtime_field", + &reader_init_param_t::next_access_vtime_field) + .def_readwrite("n_feature_fields", &reader_init_param_t::n_feature_fields) + // .def_readwrite("feature_fields", &reader_init_param_t::feature_fields) + .def_property( + "feature_fields", + [](const 
reader_init_param_t& self) { + return py::array_t({self.n_feature_fields}, + self.feature_fields); // copy to python + }, + [](reader_init_param_t& self, py::array_t arr) { + if (arr.size() != self.n_feature_fields) + throw std::runtime_error("Expected array of size " + + std::to_string(self.n_feature_fields)); + std::memcpy( + self.feature_fields, arr.data(), + self.n_feature_fields * sizeof(int)); // write to C++ array + }) + .def_readwrite("block_size", &reader_init_param_t::block_size) + .def_readwrite("has_header", &reader_init_param_t::has_header) + .def_readwrite("has_header_set", &reader_init_param_t::has_header_set) + .def_readwrite("delimiter", &reader_init_param_t::delimiter) + .def_readwrite("trace_start_offset", + &reader_init_param_t::trace_start_offset) + .def_readwrite("binary_fmt_str", &reader_init_param_t::binary_fmt_str) + .def_readwrite("sampler", &reader_init_param_t::sampler); + + /** + * @brief Reader structure + */ + py::class_>(m, "Reader") + .def(py::init([](const std::string& trace_path, trace_type_e trace_type, + const reader_init_param_t& init_params) { + trace_type_e final_trace_type = trace_type; + if (final_trace_type == trace_type_e::UNKNOWN_TRACE) { + final_trace_type = detect_trace_type(trace_path.c_str()); + } + reader_t* ptr = setup_reader(trace_path.c_str(), final_trace_type, + &init_params); + if (ptr == nullptr) { + throw std::runtime_error("Failed to create reader for " + + trace_path); + } + return std::unique_ptr(ptr); + }), + "trace_path"_a, "trace_type"_a = trace_type_e::UNKNOWN_TRACE, + "init_params"_a = default_reader_init_params()) + .def_readonly("n_read_req", &reader_t::n_read_req) + .def_readonly("n_total_req", &reader_t::n_total_req) + .def_readonly("trace_path", &reader_t::trace_path) + .def_readonly("file_size", &reader_t::file_size) + .def_readonly("init_params", &reader_t::init_params) + .def_readonly("trace_type", &reader_t::trace_type) + .def_readonly("trace_format", &reader_t::trace_format) + 
.def_readonly("ver", &reader_t::ver) + .def_readonly("cloned", &reader_t::cloned) + .def_readonly("cap_at_n_req", &reader_t::cap_at_n_req) + .def_readonly("trace_start_offset", &reader_t::trace_start_offset) + // For binary traces + .def_readonly("mapped_file", &reader_t::mapped_file) + .def_readonly("mmap_offset", &reader_t::mmap_offset) + // .def_readonly("zstd_reader_p", &reader_t::zstd_reader_p) + .def_readonly("is_zstd_file", &reader_t::is_zstd_file) + .def_readonly("item_size", &reader_t::item_size) + // For text traces + .def_readonly("file", &reader_t::file) + .def_readonly("line_buf", &reader_t::line_buf) + .def_readonly("line_buf_size", &reader_t::line_buf_size) + .def_readonly("csv_delimiter", &reader_t::csv_delimiter) + .def_readonly("csv_has_header", &reader_t::csv_has_header) + .def_readonly("obj_id_is_num", &reader_t::obj_id_is_num) + .def_readonly("obj_id_is_num_set", &reader_t::obj_id_is_num_set) + // Other properties + .def_readwrite("ignore_size_zero_req", &reader_t::ignore_size_zero_req) + .def_readwrite("ignore_obj_size", &reader_t::ignore_obj_size) + .def_readwrite("block_size", &reader_t::block_size) + .def_readonly("n_req_left", &reader_t::n_req_left) + .def_readonly("last_req_clock_time", &reader_t::last_req_clock_time) + .def_readonly("lcs_ver", &reader_t::lcs_ver) + // TODO(haocheng): Fully support sampler in Python bindings + .def_readonly("sampler", &reader_t::sampler) + .def_readonly("read_direction", &reader_t::read_direction) + .def("get_num_of_req", + [](reader_t& self) { return get_num_of_req(&self); }) + .def( + "read_one_req", + [](reader_t& self, request_t& req) { + int ret = read_one_req(&self, &req); + if (ret != 0) { + throw std::runtime_error("Failed to read request"); + } + return req; + }, + "req"_a) + .def("reset", [](reader_t& self) { reset_reader(&self); }) + .def("close", [](reader_t& self) { close_reader(&self); }) + .def("clone", + [](const reader_t& self) { + reader_t* cloned_reader = clone_reader(&self); + if 
(cloned_reader == nullptr) { + throw std::runtime_error("Failed to clone reader"); + } + return std::unique_ptr(cloned_reader); + }) + .def( + "read_first_req", + [](reader_t& self, request_t& req) { + read_first_req(&self, &req); + return req; + }, + "req"_a) + .def( + "read_last_req", + [](reader_t& self, request_t& req) { + read_last_req(&self, &req); + return req; + }, + "req"_a) + .def( + "skip_n_req", + [](reader_t& self, int n) { + int ret = skip_n_req(&self, n); + if (ret != 0) { + throw std::runtime_error("Failed to skip requests"); + } + return ret; + }, + "n"_a) + .def("read_one_req_above", + [](reader_t& self) { + request_t* req = new_request(); + int ret = read_one_req_above(&self, req); + if (ret != 0) { + free_request(req); + throw std::runtime_error("Failed to read one request above"); + } + return std::unique_ptr(req); + }) + .def("go_back_one_req", + [](reader_t& self) { + int ret = go_back_one_req(&self); + if (ret != 0) { + throw std::runtime_error("Failed to go back one request"); + } + }) + .def( + "set_read_pos", + [](reader_t& self, double pos) { reader_set_read_pos(&self, pos); }, + "pos"_a); +} +} // namespace libcachesim diff --git a/libCacheSim-python/src/pylibcachesim.cpp b/libCacheSim-python/src/pylibcachesim.cpp deleted file mode 100644 index 5ca90ca21..000000000 --- a/libCacheSim-python/src/pylibcachesim.cpp +++ /dev/null @@ -1,1223 +0,0 @@ -#include -#include -#include - -// Suppress visibility warnings for pybind11 types -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wattributes" - -#include -#include -#include - -#include "config.h" -#include "libCacheSim/cache.h" -#include "libCacheSim/cacheObj.h" -#include "libCacheSim/const.h" -#include "libCacheSim/enum.h" -#include "libCacheSim/logging.h" -#include "libCacheSim/macro.h" -#include "libCacheSim/reader.h" -#include "libCacheSim/request.h" -#include "libCacheSim/sampling.h" -#include "mystr.h" - -/* admission */ -#include "libCacheSim/admissionAlgo.h" - -/* 
eviction */ -#include "libCacheSim/evictionAlgo.h" - -/* cache simulator */ -#include "libCacheSim/profilerLRU.h" -#include "libCacheSim/simulator.h" - -/* bin */ -#include "cachesim/cache_init.h" -#include "cli_reader_utils.h" - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - -namespace py = pybind11; - -// Helper functions - -// https://stackoverflow.com/questions/874134/find-out-if-string-ends-with-another-string-in-c -static bool ends_with(std::string_view str, std::string_view suffix) { - return str.size() >= suffix.size() && - str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0; -} - -trace_type_e infer_trace_type(const std::string& trace_path) { - // Infer the trace type based on the file extension - if (trace_path.find("oracleGeneral") != std::string::npos) { - return trace_type_e::ORACLE_GENERAL_TRACE; - } else if (ends_with(trace_path, ".csv")) { - return trace_type_e::CSV_TRACE; - } else if (ends_with(trace_path, ".txt")) { - return trace_type_e::PLAIN_TXT_TRACE; - } else if (ends_with(trace_path, ".bin")) { - return trace_type_e::BIN_TRACE; - } else if (ends_with(trace_path, ".vscsi")) { - return trace_type_e::VSCSI_TRACE; - } else if (ends_with(trace_path, ".twr")) { - return trace_type_e::TWR_TRACE; - } else if (ends_with(trace_path, ".twrns")) { - return trace_type_e::TWRNS_TRACE; - } else if (ends_with(trace_path, ".lcs")) { - return trace_type_e::LCS_TRACE; - } else if (ends_with(trace_path, ".valpin")) { - return trace_type_e::VALPIN_TRACE; - } else { - return trace_type_e::UNKNOWN_TRACE; - } -} - -// Python Hook Cache Implementation -class PythonHookCache { - private: - uint64_t cache_size_; - std::string cache_name_; - std::unordered_map objects_; // obj_id -> obj_size - py::object plugin_data_; - - // Hook functions - py::function init_hook_; - py::function hit_hook_; - py::function miss_hook_; - py::function eviction_hook_; - py::function remove_hook_; - py::object free_hook_; // Changed to py::object to 
allow py::none() - - public: - uint64_t n_req = 0; - uint64_t n_obj = 0; - uint64_t occupied_byte = 0; - uint64_t cache_size; - - PythonHookCache(uint64_t cache_size, - const std::string& cache_name = "PythonHookCache") - : cache_size_(cache_size), - cache_name_(cache_name), - cache_size(cache_size), - free_hook_(py::none()) {} - - void set_hooks(py::function init_hook, py::function hit_hook, - py::function miss_hook, py::function eviction_hook, - py::function remove_hook, py::object free_hook = py::none()) { - init_hook_ = init_hook; - hit_hook_ = hit_hook; - miss_hook_ = miss_hook; - eviction_hook_ = eviction_hook; - remove_hook_ = remove_hook; - - // Handle free_hook properly - if (!free_hook.is_none()) { - free_hook_ = free_hook; - } else { - free_hook_ = py::none(); - } - - // Initialize plugin data - plugin_data_ = init_hook_(cache_size_); - } - - bool get(const request_t& req) { - n_req++; - - auto it = objects_.find(req.obj_id); - if (it != objects_.end()) { - // Cache hit - hit_hook_(plugin_data_, req.obj_id, req.obj_size); - return true; - } else { - // Cache miss - call miss hook first - miss_hook_(plugin_data_, req.obj_id, req.obj_size); - - // Check if eviction is needed - while (occupied_byte + req.obj_size > cache_size_ && !objects_.empty()) { - // Need to evict - uint64_t victim_id = - eviction_hook_(plugin_data_, req.obj_id, req.obj_size) - .cast(); - auto victim_it = objects_.find(victim_id); - if (victim_it != objects_.end()) { - occupied_byte -= victim_it->second; - objects_.erase(victim_it); - n_obj--; - remove_hook_(plugin_data_, victim_id); - } else { - // Safety check: if eviction hook returns invalid ID, break to avoid - // infinite loop - break; - } - } - - // Insert new object if there's space - if (occupied_byte + req.obj_size <= cache_size_) { - objects_[req.obj_id] = req.obj_size; - occupied_byte += req.obj_size; - n_obj++; - } - - return false; - } - } - - ~PythonHookCache() { - if (!free_hook_.is_none()) { - py::function free_func = 
free_hook_.cast(); - free_func(plugin_data_); - } - } -}; - -// Restore visibility warnings -#pragma GCC diagnostic pop - -struct CacheDeleter { - void operator()(cache_t* ptr) const { - if (ptr != nullptr) ptr->cache_free(ptr); - } -}; - -struct RequestDeleter { - void operator()(request_t* ptr) const { - if (ptr != nullptr) free_request(ptr); - } -}; - -struct ReaderDeleter { - void operator()(reader_t* ptr) const { - if (ptr != nullptr) close_trace(ptr); - } -}; - -PYBIND11_MODULE(_libcachesim, m) { // NOLINT(readability-named-parameter) - m.doc() = R"pbdoc( - libCacheSim Python bindings - -------------------------- - - .. currentmodule:: libcachesim - - .. autosummary:: - :toctree: _generate - - TODO(haocheng): add meaningful methods - )pbdoc"; - - py::enum_(m, "TraceType") - .value("CSV_TRACE", trace_type_e::CSV_TRACE) - .value("BIN_TRACE", trace_type_e::BIN_TRACE) - .value("PLAIN_TXT_TRACE", trace_type_e::PLAIN_TXT_TRACE) - .value("ORACLE_GENERAL_TRACE", trace_type_e::ORACLE_GENERAL_TRACE) - .value("LCS_TRACE", trace_type_e::LCS_TRACE) - .value("VSCSI_TRACE", trace_type_e::VSCSI_TRACE) - .value("TWR_TRACE", trace_type_e::TWR_TRACE) - .value("TWRNS_TRACE", trace_type_e::TWRNS_TRACE) - .value("ORACLE_SIM_TWR_TRACE", trace_type_e::ORACLE_SIM_TWR_TRACE) - .value("ORACLE_SYS_TWR_TRACE", trace_type_e::ORACLE_SYS_TWR_TRACE) - .value("ORACLE_SIM_TWRNS_TRACE", trace_type_e::ORACLE_SIM_TWRNS_TRACE) - .value("ORACLE_SYS_TWRNS_TRACE", trace_type_e::ORACLE_SYS_TWRNS_TRACE) - .value("VALPIN_TRACE", trace_type_e::VALPIN_TRACE) - .value("UNKNOWN_TRACE", trace_type_e::UNKNOWN_TRACE) - .export_values(); - - py::enum_(m, "ReqOp") - .value("NOP", req_op_e::OP_NOP) - .value("GET", req_op_e::OP_GET) - .value("GETS", req_op_e::OP_GETS) - .value("SET", req_op_e::OP_SET) - .value("ADD", req_op_e::OP_ADD) - .value("CAS", req_op_e::OP_CAS) - .value("REPLACE", req_op_e::OP_REPLACE) - .value("APPEND", req_op_e::OP_APPEND) - .value("PREPEND", req_op_e::OP_PREPEND) - .value("DELETE", 
req_op_e::OP_DELETE) - .value("INCR", req_op_e::OP_INCR) - .value("DECR", req_op_e::OP_DECR) - .value("READ", req_op_e::OP_READ) - .value("WRITE", req_op_e::OP_WRITE) - .value("UPDATE", req_op_e::OP_UPDATE) - .value("INVALID", req_op_e::OP_INVALID) - .export_values(); - - // *************** structs *************** - /** - * @brief Cache structure - */ - py::class_>(m, "Cache") - .def_readwrite("n_req", &cache_t::n_req) - .def_readwrite("cache_size", &cache_t::cache_size) - // Use proper accessor functions for private fields - .def_property_readonly( - "n_obj", [](const cache_t& self) { return self.get_n_obj(&self); }) - .def_property_readonly( - "occupied_byte", - [](const cache_t& self) { return self.get_occupied_byte(&self); }) - // methods - .def("get", [](cache_t& self, const request_t& req) { - return self.get(&self, &req); - }); - - /** - * @brief Request structure - */ - py::class_>(m, - "Request") - .def(py::init([]() { return new_request(); })) - .def(py::init([](uint64_t obj_id, uint64_t obj_size, uint64_t clock_time, - uint64_t hv, req_op_e op) { - request_t* req = new_request(); - req->obj_id = obj_id; - req->obj_size = obj_size; - req->clock_time = clock_time; - req->hv = hv; - req->op = op; - return req; - }), - py::arg("obj_id"), py::arg("obj_size") = 1, - py::arg("clock_time") = 0, py::arg("hv") = 0, - py::arg("op") = req_op_e::OP_GET, - R"pbdoc( - Create a request instance. - - Args: - obj_id (int): The object ID. - obj_size (int): The object size. (default: 1) - clock_time (int): The clock time. (default: 0) - hv (int): The hash value. (default: 0) - op (req_op_e): The operation. (default: OP_GET) - - Returns: - Request: A new request instance. 
- )pbdoc") - .def_readwrite("clock_time", &request_t::clock_time) - .def_readwrite("hv", &request_t::hv) - .def_readwrite("obj_id", &request_t::obj_id) - .def_readwrite("obj_size", &request_t::obj_size) - .def_readwrite("op", &request_t::op); - - /** - * @brief Reader structure - */ - py::class_>(m, "Reader") - .def_readwrite("n_read_req", &reader_t::n_read_req) - .def_readwrite("n_total_req", &reader_t::n_total_req) - .def_readwrite("trace_path", &reader_t::trace_path) - .def_readwrite("file_size", &reader_t::file_size) - .def_readwrite("ignore_obj_size", &reader_t::ignore_obj_size) - // methods - .def( - "get_wss", - [](reader_t& self) { - int64_t wss_obj = 0, wss_byte = 0; - cal_working_set_size(&self, &wss_obj, &wss_byte); - return self.ignore_obj_size ? wss_obj : wss_byte; - }, - R"pbdoc( - Get the working set size of the trace. - - Args: - ignore_obj_size (bool): Whether to ignore the object size. - - Returns: - int: The working set size of the trace. - )pbdoc") - .def( - "seek", - [](reader_t& self, int64_t offset, bool from_beginning = false) { - int64_t offset_from_beginning = offset; - if (!from_beginning) { - offset_from_beginning += self.n_read_req; - } - reset_reader(&self); - skip_n_req(&self, offset_from_beginning); - }, - py::arg("offset"), py::arg("from_beginning") = false, - R"pbdoc( - Seek to a specific offset in the trace file. - We only support seeking from current position or from the beginning. - - Can only move forward, not backward. - - Args: - offset (int): The offset to seek to the beginning. - - Raises: - RuntimeError: If seeking fails. 
- )pbdoc") - .def("__iter__", [](reader_t& self) -> reader_t& { return self; }) - .def("__next__", [](reader_t& self) { - auto req = std::unique_ptr(new_request()); - int ret = read_one_req(&self, req.get()); - if (ret != 0) { - throw py::stop_iteration(); - } - return req; - }); - - // Helper function to apply parameters from dictionary to reader_init_param_t - auto apply_params_from_dict = [](reader_init_param_t& params, - py::dict dict_params) { - // Template field setter with type safety - auto set_if_present = [&](const char* key, auto& field) { - if (dict_params.contains(key)) { - field = - dict_params[key].cast>(); - } - }; - - // Apply all standard fields - set_if_present("time_field", params.time_field); - set_if_present("obj_id_field", params.obj_id_field); - set_if_present("obj_size_field", params.obj_size_field); - set_if_present("has_header", params.has_header); - set_if_present("ignore_obj_size", params.ignore_obj_size); - set_if_present("ignore_size_zero_req", params.ignore_size_zero_req); - set_if_present("obj_id_is_num", params.obj_id_is_num); - set_if_present("obj_id_is_num_set", params.obj_id_is_num_set); - set_if_present("has_header_set", params.has_header_set); - set_if_present("cap_at_n_req", params.cap_at_n_req); - set_if_present("op_field", params.op_field); - set_if_present("ttl_field", params.ttl_field); - set_if_present("cnt_field", params.cnt_field); - set_if_present("tenant_field", params.tenant_field); - set_if_present("next_access_vtime_field", params.next_access_vtime_field); - set_if_present("block_size", params.block_size); - set_if_present("trace_start_offset", params.trace_start_offset); - - // Special fields with custom handling - if (dict_params.contains("delimiter")) { - std::string delim = dict_params["delimiter"].cast(); - params.delimiter = delim.empty() ? 
',' : delim[0]; - } - - if (dict_params.contains("binary_fmt_str")) { - // Free existing memory first to prevent leaks - if (params.binary_fmt_str) { - free(params.binary_fmt_str); - params.binary_fmt_str = nullptr; - } - std::string fmt = dict_params["binary_fmt_str"].cast(); - if (!fmt.empty()) { - // Note: Using strdup for C-compatible memory allocation - // Memory is managed by reader_init_param_t destructor/cleanup - params.binary_fmt_str = strdup(fmt.c_str()); - if (!params.binary_fmt_str) { - throw std::runtime_error( - "Failed to allocate memory for binary_fmt_str"); - } - } - } - - if (dict_params.contains("feature_fields")) { - auto ff = dict_params["feature_fields"].cast>(); - if (ff.size() > N_MAX_FEATURES) { - throw py::value_error("Too many feature fields (max " + - std::to_string(N_MAX_FEATURES) + ")"); - } - params.n_feature_fields = static_cast(ff.size()); - // Use copy_n for explicit bounds checking - std::copy_n(ff.begin(), params.n_feature_fields, params.feature_fields); - } - }; - - py::class_(m, "ReaderInitParam") - .def(py::init([]() { - reader_init_param_t params; - set_default_reader_init_params(¶ms); - return params; - }), - "Create with default parameters") - - .def(py::init([apply_params_from_dict](py::kwargs kwargs) { - reader_init_param_t params; - set_default_reader_init_params(¶ms); - - // Convert kwargs to dict and apply using shared helper - py::dict dict_params = py::dict(kwargs); - apply_params_from_dict(params, dict_params); - - return params; - }), - "Create with keyword arguments") - - .def(py::init([apply_params_from_dict](py::dict dict_params) { - reader_init_param_t params; - set_default_reader_init_params(¶ms); - - // Apply using shared helper function - apply_params_from_dict(params, dict_params); - - return params; - }), - py::arg("params"), "Create from dictionary (backward compatibility)") - .def("__repr__", [](const reader_init_param_t& params) { - std::stringstream ss; - ss << "ReaderInitParam(\n"; - - // Group 1: 
Core fields - ss << " # Core fields\n"; - ss << " time_field=" << params.time_field << ", "; - ss << "obj_id_field=" << params.obj_id_field << ", "; - ss << "obj_size_field=" << params.obj_size_field << ",\n"; - - // Group 2: Flags and options - ss << " # Flags and options\n"; - ss << " has_header=" << params.has_header << ", "; - ss << "ignore_obj_size=" << params.ignore_obj_size << ", "; - ss << "ignore_size_zero_req=" << params.ignore_size_zero_req << ", "; - ss << "obj_id_is_num=" << params.obj_id_is_num << ",\n"; - - // Group 3: Internal state flags - ss << " # Internal state\n"; - ss << " obj_id_is_num_set=" << params.obj_id_is_num_set << ", "; - ss << "has_header_set=" << params.has_header_set << ",\n"; - - // Group 4: Optional fields - ss << " # Optional fields\n"; - ss << " cap_at_n_req=" << params.cap_at_n_req << ", "; - ss << "op_field=" << params.op_field << ", "; - ss << "ttl_field=" << params.ttl_field << ", "; - ss << "cnt_field=" << params.cnt_field << ",\n"; - ss << " tenant_field=" << params.tenant_field << ", "; - ss << "next_access_vtime_field=" << params.next_access_vtime_field - << ",\n"; - - // Group 5: Miscellaneous - ss << " # Miscellaneous\n"; - ss << " block_size=" << params.block_size << ", "; - ss << "trace_start_offset=" << params.trace_start_offset; - ss << "\n)"; - return ss.str(); - }); - - // *************** functions *************** - /** - * @brief Open a trace file for reading - */ - m.def( - "open_trace", - [apply_params_from_dict](const std::string& trace_path, py::object type, - py::object params) { - trace_type_e c_type = UNKNOWN_TRACE; - if (!type.is_none()) { - c_type = type.cast(); - } else { - // If type is None, we can try to infer the type from the file - // extension - c_type = infer_trace_type(trace_path); - if (c_type == UNKNOWN_TRACE) { - throw std::runtime_error("Could not infer trace type from path: " + - trace_path); - } - } - - // Handle different parameter types - reader_init_param_t init_param; - 
set_default_reader_init_params(&init_param); - - if (py::isinstance(params)) { - // Dictionary parameters - use shared helper function - py::dict dict_params = params.cast(); - apply_params_from_dict(init_param, dict_params); - } else if (!params.is_none()) { - // reader_init_param_t object - direct cast (pybind11 handles - // conversion) - init_param = params.cast(); - } - reader_t* ptr = open_trace(trace_path.c_str(), c_type, &init_param); - return std::unique_ptr(ptr); - }, - py::arg("trace_path"), py::arg("type") = py::none(), - py::arg("params") = py::none(), - R"pbdoc( - Open a trace file for reading. - - Args: - trace_path (str): Path to the trace file. - type (Union[trace_type_e, None]): Type of the trace (e.g., CSV_TRACE). If None, the type will be inferred. - params (Union[dict, reader_init_param_t, None]): Initialization parameters for the reader. - - Returns: - Reader: A new reader instance for the trace. - )pbdoc"); - - /** - * @brief Generic function to create a cache instance. - */ - m.def( - "create_cache", - [](const std::string& eviction_algo, const uint64_t cache_size, - const std::string& eviction_params, - bool consider_obj_metadata) { return nullptr; }, - py::arg("eviction_algo"), py::arg("cache_size"), - py::arg("eviction_params"), py::arg("consider_obj_metadata"), - R"pbdoc( - Create a cache instance. - - Args: - eviction_algo (str): Eviction algorithm to use (e.g., "LRU", "FIFO", "Random"). - cache_size (int): Size of the cache in bytes. - eviction_params (str): Additional parameters for the eviction algorithm. - consider_obj_metadata (bool): Whether to consider object metadata in eviction decisions. - - Returns: - Cache: A new cache instance. - )pbdoc"); - - /* TODO(haocheng): should we support all parameters in the - * common_cache_params_t? (hash_power, etc.) 
*/ - - // Currently supported eviction algorithms with direct initialization: - // - "ARC" - // - "Clock" - // - "FIFO" - // - "LRB" - // - "LRU" - // - "S3FIFO" - // - "Sieve" - // - "ThreeLCache" - // - "TinyLFU" - // - "TwoQ" - - /** - * @brief Create a ARC cache instance. - */ - m.def( - "ARC_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = ARC_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a ARC cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - )pbdoc"); - - /** - * @brief Create a Clock cache instance. - */ - m.def( - "Clock_init", - [](uint64_t cache_size, long int n_bit_counter, long int init_freq) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - // assemble the cache specific parameters - std::string cache_specific_params = - "n-bit-counter=" + std::to_string(n_bit_counter) + "," + - "init-freq=" + std::to_string(init_freq); - - cache_t* ptr = Clock_init(cc_params, cache_specific_params.c_str()); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), py::arg("n_bit_counter") = 1, - py::arg("init_freq") = 0, - R"pbdoc( - Create a Clock cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - n_bit_counter (int): Number of bits for counter (default: 1). - init_freq (int): Initial frequency value (default: 0). - - Returns: - Cache: A new Clock cache instance. - )pbdoc"); - - /** - * @brief Create a FIFO cache instance. - */ - m.def( - "FIFO_init", - [](uint64_t cache_size) { - // Construct common cache parameters - common_cache_params_t cc_params = {.cache_size = cache_size}; - // FIFO no specific parameters, so we pass nullptr - cache_t* ptr = FIFO_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a FIFO cache instance. - - Args: - cache_size (int): Size of the cache in bytes. 
- - Returns: - Cache: A new FIFO cache instance. - )pbdoc"); - -#ifdef ENABLE_LRB - /** - * @brief Create a LRB cache instance. - */ - m.def( - "LRB_init", - [](uint64_t cache_size, std::string objective) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = LRB_init(cc_params, ("objective=" + objective).c_str()); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), py::arg("objective") = "byte-miss-ratio", - R"pbdoc( - Create a LRB cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - objective (str): Objective function to optimize (default: "byte-miss-ratio"). - - Returns: - Cache: A new LRB cache instance. - )pbdoc"); -#else - // TODO(haocheng): add a dummy function to avoid the error when LRB is not - // enabled - m.def( - "LRB_init", - [](uint64_t cache_size, std::string objective) { - throw std::runtime_error("LRB is not enabled"); - }, - py::arg("cache_size"), py::arg("objective") = "byte-miss-ratio"); -#endif - - /** - * @brief Create a LRU cache instance. - */ - m.def( - "LRU_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = LRU_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a LRU cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - - Returns: - Cache: A new LRU cache instance. - )pbdoc"); - - /** - * @brief Create a S3FIFO cache instance. 
- */ - m.def( - "S3FIFO_init", - [](uint64_t cache_size, double fifo_size_ratio, double ghost_size_ratio, - int move_to_main_threshold) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = S3FIFO_init( - cc_params, - ("fifo-size-ratio=" + std::to_string(fifo_size_ratio) + "," + - "ghost-size-ratio=" + std::to_string(ghost_size_ratio) + "," + - "move-to-main-threshold=" + std::to_string(move_to_main_threshold)) - .c_str()); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), py::arg("fifo_size_ratio") = 0.10, - py::arg("ghost_size_ratio") = 0.90, py::arg("move_to_main_threshold") = 2, - R"pbdoc( - Create a S3FIFO cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - fifo_size_ratio (float): Ratio of FIFO size to cache size (default: 0.10). - ghost_size_ratio (float): Ratio of ghost size to cache size (default: 0.90). - move_to_main_threshold (int): Threshold for moving to main queue (default: 2). - - Returns: - Cache: A new S3FIFO cache instance. - )pbdoc"); - - /** - * @brief Create a Sieve cache instance. - */ - m.def( - "Sieve_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = Sieve_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a Sieve cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - - Returns: - Cache: A new Sieve cache instance. - )pbdoc"); - -#ifdef ENABLE_3L_CACHE - /** - * @brief Create a ThreeL cache instance. - */ - m.def( - "ThreeLCache_init", - [](uint64_t cache_size, std::string objective) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = - ThreeLCache_init(cc_params, ("objective=" + objective).c_str()); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), py::arg("objective") = "byte-miss-ratio", - R"pbdoc( - Create a ThreeL cache instance. - - Args: - cache_size (int): Size of the cache in bytes. 
- objective (str): Objective function to optimize (default: "byte-miss-ratio"). - - Returns: - Cache: A new ThreeL cache instance. - )pbdoc"); -#else - // TODO(haocheng): add a dummy function to avoid the error when ThreeLCache is - // not enabled - m.def( - "ThreeLCache_init", - [](uint64_t cache_size, std::string objective) { - throw std::runtime_error("ThreeLCache is not enabled"); - }, - py::arg("cache_size"), py::arg("objective") = "byte-miss-ratio"); -#endif - - /** - * @brief Create a TinyLFU cache instance. - */ - // mark evivtion parsing need change - m.def( - "TinyLFU_init", - [](uint64_t cache_size, std::string main_cache, double window_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = WTinyLFU_init( - cc_params, ("main-cache=" + main_cache + "," + - "window-size=" + std::to_string(window_size)) - .c_str()); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), py::arg("main_cache") = "SLRU", - py::arg("window_size") = 0.01, - R"pbdoc( - Create a TinyLFU cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - main_cache (str): Main cache to use (default: "SLRU"). - window_size (float): Window size for TinyLFU (default: 0.01). - - Returns: - Cache: A new TinyLFU cache instance. - )pbdoc"); - - /** - * @brief Create a TwoQ cache instance. - */ - m.def( - "TwoQ_init", - [](uint64_t cache_size, double Ain_size_ratio, double Aout_size_ratio) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = TwoQ_init( - cc_params, - ("Ain-size-ratio=" + std::to_string(Ain_size_ratio) + "," + - "Aout-size-ratio=" + std::to_string(Aout_size_ratio)) - .c_str()); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), py::arg("Ain_size_ratio") = 0.25, - py::arg("Aout_size_ratio") = 0.5, - R"pbdoc( - Create a TwoQ cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - Ain_size_ratio (float): Ratio of A-in size to cache size (default: 0.25). 
- Aout_size_ratio (float): Ratio of A-out size to cache size (default: 0.5). - - Returns: - Cache: A new TwoQ cache instance. - )pbdoc"); - - /** - * @brief Create a LFU cache instance. - */ - m.def( - "LFU_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = LFU_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a LFU cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - - Returns: - Cache: A new LFU cache instance. - )pbdoc"); - - /** - * @brief Create a LFUDA cache instance. - */ - m.def( - "LFUDA_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = LFUDA_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a LFUDA cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - - Returns: - Cache: A new LFUDA cache instance. - )pbdoc"); - - /** - * @brief Create a SLRU cache instance. - */ - m.def( - "SLRU_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = SLRU_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a SLRU cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - - Returns: - Cache: A new SLRU cache instance. - )pbdoc"); - - /** - * @brief Create a Belady cache instance. - */ - m.def( - "Belady_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = Belady_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a Belady cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - - Returns: - Cache: A new Belady cache instance. - )pbdoc"); - - /** - * @brief Create a BeladySize cache instance. 
- */ - m.def( - "BeladySize_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = BeladySize_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a BeladySize cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - - Returns: - Cache: A new BeladySize cache instance. - )pbdoc"); - - /** - * @brief Create a QDLP cache instance. - */ - m.def( - "QDLP_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = QDLP_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a QDLP cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - - Returns: - Cache: A new QDLP cache instance. - )pbdoc"); - - /** - * @brief Create a LeCaR cache instance. - */ - m.def( - "LeCaR_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = LeCaR_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a LeCaR cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - - Returns: - Cache: A new LeCaR cache instance. - )pbdoc"); - - /** - * @brief Create a Cacheus cache instance. - */ - m.def( - "Cacheus_init", - [](uint64_t cache_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = Cacheus_init(cc_params, nullptr); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), - R"pbdoc( - Create a Cacheus cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - - Returns: - Cache: A new Cacheus cache instance. - )pbdoc"); - - /** - * @brief Create a WTinyLFU cache instance. 
- */ - m.def( - "WTinyLFU_init", - [](uint64_t cache_size, std::string main_cache, double window_size) { - common_cache_params_t cc_params = {.cache_size = cache_size}; - cache_t* ptr = WTinyLFU_init( - cc_params, ("main-cache=" + main_cache + "," + - "window-size=" + std::to_string(window_size)) - .c_str()); - return std::unique_ptr(ptr); - }, - py::arg("cache_size"), py::arg("main_cache") = "SLRU", - py::arg("window_size") = 0.01, - R"pbdoc( - Create a WTinyLFU cache instance. - - Args: - cache_size (int): Size of the cache in bytes. - main_cache (str): Main cache to use (default: "SLRU"). - window_size (float): Window size for TinyLFU (default: 0.01). - - Returns: - Cache: A new WTinyLFU cache instance. - )pbdoc"); - - /** - * @brief Create a Python hook-based cache instance. - */ - py::class_(m, "PythonHookCache") - .def(py::init(), py::arg("cache_size"), - py::arg("cache_name") = "PythonHookCache") - .def("set_hooks", &PythonHookCache::set_hooks, py::arg("init_hook"), - py::arg("hit_hook"), py::arg("miss_hook"), py::arg("eviction_hook"), - py::arg("remove_hook"), py::arg("free_hook") = py::none(), - R"pbdoc( - Set the hook functions for the cache. - - Args: - init_hook (callable): Function called during cache initialization. - Signature: init_hook(cache_size: int) -> Any - hit_hook (callable): Function called on cache hit. - Signature: hit_hook(plugin_data: Any, obj_id: int, obj_size: int) -> None - miss_hook (callable): Function called on cache miss. - Signature: miss_hook(plugin_data: Any, obj_id: int, obj_size: int) -> None - eviction_hook (callable): Function called to select eviction candidate. - Signature: eviction_hook(plugin_data: Any, obj_id: int, obj_size: int) -> int - remove_hook (callable): Function called when object is removed. - Signature: remove_hook(plugin_data: Any, obj_id: int) -> None - free_hook (callable, optional): Function called during cache cleanup. 
- Signature: free_hook(plugin_data: Any) -> None - )pbdoc") - .def("get", &PythonHookCache::get, py::arg("req"), - R"pbdoc( - Process a cache request. - - Args: - req (Request): The cache request to process. - - Returns: - bool: True if cache hit, False if cache miss. - )pbdoc") - .def_readwrite("n_req", &PythonHookCache::n_req) - .def_readwrite("n_obj", &PythonHookCache::n_obj) - .def_readwrite("occupied_byte", &PythonHookCache::occupied_byte) - .def_readwrite("cache_size", &PythonHookCache::cache_size); - - /** - * @brief Process a trace with a cache and return miss ratio. - */ - m.def( - "process_trace", - [](cache_t& cache, reader_t& reader, int64_t start_req = 0, - int64_t max_req = -1) { - reset_reader(&reader); - if (start_req > 0) { - skip_n_req(&reader, start_req); - } - - request_t* req = new_request(); - int64_t n_req = 0, n_hit = 0; - int64_t bytes_req = 0, bytes_hit = 0; - bool hit; - - read_one_req(&reader, req); - while (req->valid) { - n_req += 1; - bytes_req += req->obj_size; - hit = cache.get(&cache, req); - if (hit) { - n_hit += 1; - bytes_hit += req->obj_size; - } - read_one_req(&reader, req); - if (max_req > 0 && n_req >= max_req) { - break; // Stop if we reached the max request limit - } - } - - free_request(req); - // return the miss ratio - double obj_miss_ratio = n_req > 0 ? 1.0 - (double)n_hit / n_req : 0.0; - double byte_miss_ratio = - bytes_req > 0 ? 1.0 - (double)bytes_hit / bytes_req : 0.0; - return std::make_tuple(obj_miss_ratio, byte_miss_ratio); - }, - py::arg("cache"), py::arg("reader"), py::arg("start_req") = 0, - py::arg("max_req") = -1, - R"pbdoc( - Process a trace with a cache and return miss ratio. - - This function processes trace data entirely on the C++ side to avoid - data movement overhead between Python and C++. - - Args: - cache (Cache): The cache instance to use for processing. - reader (Reader): The trace reader instance. - start_req (int): The starting request number to process from (default: 0, from the beginning). 
- max_req (int): Maximum number of requests to process (-1 for no limit). - - Returns: - float: Object miss ratio (0.0 to 1.0). - float: Byte miss ratio (0.0 to 1.0). - - Example: - >>> cache = libcachesim.LRU(1024*1024) - >>> reader = libcachesim.open_trace("trace.csv", libcachesim.TraceType.CSV_TRACE) - >>> obj_miss_ratio, byte_miss_ratio = libcachesim.process_trace(cache, reader) - >>> print(f"Obj miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") - )pbdoc"); - - /** - * @brief Process a trace with a Python hook cache and return miss ratio. - */ - m.def( - "process_trace_python_hook", - [](PythonHookCache& cache, reader_t& reader, int64_t start_req = 0, - int64_t max_req = -1) { - reset_reader(&reader); - if (start_req > 0) { - skip_n_req(&reader, start_req); - } - - request_t* req = new_request(); - int64_t n_req = 0, n_hit = 0; - int64_t bytes_req = 0, bytes_hit = 0; - bool hit; - - read_one_req(&reader, req); - while (req->valid) { - n_req += 1; - bytes_req += req->obj_size; - hit = cache.get(*req); - if (hit) { - n_hit += 1; - bytes_hit += req->obj_size; - } - read_one_req(&reader, req); - if (max_req > 0 && n_req >= max_req) { - break; // Stop if we reached the max request limit - } - } - - free_request(req); - // return the miss ratio - double obj_miss_ratio = n_req > 0 ? 1.0 - (double)n_hit / n_req : 0.0; - double byte_miss_ratio = - bytes_req > 0 ? 1.0 - (double)bytes_hit / bytes_req : 0.0; - return std::make_tuple(obj_miss_ratio, byte_miss_ratio); - }, - py::arg("cache"), py::arg("reader"), py::arg("start_req") = 0, - py::arg("max_req") = -1, - R"pbdoc( - Process a trace with a Python hook cache and return miss ratio. - - This function processes trace data entirely on the C++ side to avoid - data movement overhead between Python and C++. Specifically designed - for PythonHookCache instances. - - Args: - cache (PythonHookCache): The Python hook cache instance to use. - reader (Reader): The trace reader instance. 
- start_req (int): The starting request number to process from (0 for beginning). - max_req (int): Maximum number of requests to process (-1 for no limit). - - Returns: - float: Object miss ratio (0.0 to 1.0). - float: Byte miss ratio (0.0 to 1.0). - - Example: - >>> cache = libcachesim.PythonHookCachePolicy(1024*1024) - >>> cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - >>> reader = libcachesim.open_trace("trace.csv", libcachesim.TraceType.CSV_TRACE) - >>> obj_miss_ratio, byte_miss_ratio = libcachesim.process_trace_python_hook(cache.cache, reader) - >>> print(f"Obj miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") - )pbdoc"); - -#ifdef VERSION_INFO - m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO); -#else - m.attr("__version__") = "dev"; -#endif -} diff --git a/libCacheSim-python/tests/conftest.py b/libCacheSim-python/tests/conftest.py index a3e2705e1..42edf9190 100644 --- a/libCacheSim-python/tests/conftest.py +++ b/libCacheSim-python/tests/conftest.py @@ -4,29 +4,3 @@ import gc import pytest - -from libcachesim import Reader, TraceType, open_trace - - -@pytest.fixture -def mock_reader(): - data_file = os.path.join( # noqa: PTH118 - os.path.dirname(os.path.dirname(os.path.dirname(__file__))), # noqa: PTH120 - "data", - "cloudPhysicsIO.oracleGeneral.bin", - ) - reader: Reader = open_trace( - data_file, - type=TraceType.ORACLE_GENERAL_TRACE, - ) - try: - yield reader - finally: - # More careful cleanup - try: - if hasattr(reader, "close"): - reader.close() - except Exception: # Be specific about exception type - pass - # Don't explicitly del reader here, let Python handle it - gc.collect() diff --git a/libCacheSim-python/tests/test_eviction.py b/libCacheSim-python/tests/test_eviction.py deleted file mode 100644 index a51aae860..000000000 --- a/libCacheSim-python/tests/test_eviction.py +++ /dev/null @@ -1,62 +0,0 @@ -import pytest - -from libcachesim import ( - ARC, - FIFO, - LRU, - S3FIFO, - Clock, - 
Sieve, - TinyLFU, - TwoQ, -) -from tests.utils import get_reference_data - - -@pytest.mark.parametrize( - "eviction_algo", - [ - FIFO, - ARC, - Clock, - LRU, - S3FIFO, - Sieve, - TinyLFU, - TwoQ, - ], -) -@pytest.mark.parametrize("cache_size_ratio", [0.01]) -def test_eviction_algo(eviction_algo, cache_size_ratio, mock_reader): - cache = None - try: - # create a cache with the eviction policy - cache = eviction_algo(cache_size=int(mock_reader.get_wss() * cache_size_ratio)) - req_count = 0 - miss_count = 0 - - # Limit the number of requests to avoid long test times - # max_requests = 1000 - for i, req in enumerate(mock_reader): - # if i >= max_requests: - # break - hit = cache.get(req) - if not hit: - miss_count += 1 - req_count += 1 - - if req_count == 0: - pytest.skip("No requests processed") - - miss_ratio = miss_count / req_count - reference_miss_ratio = get_reference_data(eviction_algo.__name__, cache_size_ratio) - if reference_miss_ratio is None: - pytest.skip(f"No reference data for {eviction_algo.__name__} with cache size ratio {cache_size_ratio}") - assert abs(miss_ratio - reference_miss_ratio) < 0.01, ( - f"Miss ratio {miss_ratio} is not close to reference {reference_miss_ratio}" - ) - - except Exception as e: - pytest.fail(f"Error in test_eviction_algo: {e}") - finally: - pass diff --git a/libCacheSim-python/tests/test_example.py b/libCacheSim-python/tests/test_example.py new file mode 100644 index 000000000..9cfcb7f3f --- /dev/null +++ b/libCacheSim-python/tests/test_example.py @@ -0,0 +1,16 @@ +from libcachesim import ( + Request, + LRU, + SyntheticReader, + Util, +) + +def test_example(): + reader = SyntheticReader(num_of_req=1000) + cache = LRU(cache_size=1000) + miss_cnt = 0 + for req in reader: + hit = cache.get(req) + if not hit: + miss_cnt += 1 + print(f"Miss ratio: {miss_cnt / reader.num_of_req}") diff --git a/libCacheSim-python/tests/test_process_trace.py b/libCacheSim-python/tests/test_process_trace.py deleted file mode 100644 index 
1dbfb486f..000000000 --- a/libCacheSim-python/tests/test_process_trace.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python3 -""" -Test file for process_trace functionality. -""" - -import sys -import os -import pytest - -# Add the parent directory to the Python path for development testing -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) - -try: - import libcachesim as lcs -except ImportError as e: - pytest.skip(f"libcachesim not available: {e}", allow_module_level=True) - -from collections import OrderedDict - - -def create_trace_reader(): - """Helper function to create a trace reader with binary trace file.""" - data_file = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "cloudPhysicsIO.oracleGeneral.bin" - ) - if not os.path.exists(data_file): - return None - return lcs.open_trace(data_file, lcs.TraceType.ORACLE_GENERAL_TRACE) - - -def test_process_trace_native(): - """Test process_trace with native LRU cache.""" - - # Open trace - reader = create_trace_reader() - if reader is None: - pytest.skip("Test trace file not found, skipping test") - - # Create LRU cache - cache = lcs.LRU(1024 * 1024) # 1MB cache - - # Process trace and get miss ratio - obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader, max_req=1000) - - # Verify miss ratio is reasonable (should be between 0 and 1) - assert 0.0 <= obj_miss_ratio <= 1.0, f"Invalid miss ratio: {obj_miss_ratio}" - - -def test_process_trace_python_hook(): - """Test process_trace with Python hook cache.""" - - # Open trace - reader = create_trace_reader() - if reader is None: - pytest.skip("Test trace file not found, skipping test") - - # Create Python hook LRU cache - cache = lcs.PythonHookCachePolicy(1024 * 1024, "TestLRU") - - # Define LRU hooks - def init_hook(cache_size): - return OrderedDict() - - def hit_hook(lru_dict, obj_id, obj_size): - lru_dict.move_to_end(obj_id) - - def miss_hook(lru_dict, obj_id, obj_size): - lru_dict[obj_id] = True - - def 
eviction_hook(lru_dict, obj_id, obj_size): - return next(iter(lru_dict)) - - def remove_hook(lru_dict, obj_id): - lru_dict.pop(obj_id, None) - - # Set hooks - cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - - # Test both methods - # Method 1: Direct function call - miss_ratio1 = lcs.process_trace_python_hook(cache.cache, reader, max_req=1000)[0] - - # Need to reopen the trace for second test - reader2 = create_trace_reader() - if reader2 is None: - pytest.skip("Warning: Cannot reopen trace file, skipping second test") - # Continue with just the first test result - assert miss_ratio1 is not None and 0.0 <= miss_ratio1 <= 1.0, f"Invalid miss ratio: {miss_ratio1}" - return - - # Reset cache for fair comparison - cache2 = lcs.PythonHookCachePolicy(1024 * 1024, "TestLRU2") - cache2.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - - # Method 2: Convenience method - miss_ratio2 = cache2.process_trace(reader2, max_req=1000)[0] - - # Verify both methods give the same result and miss ratios are reasonable - assert 0.0 <= miss_ratio1 <= 1.0, f"Invalid miss ratio 1: {miss_ratio1}" - assert 0.0 <= miss_ratio2 <= 1.0, f"Invalid miss ratio 2: {miss_ratio2}" - assert abs(miss_ratio1 - miss_ratio2) < 0.001, ( - f"Different results from the two methods: {miss_ratio1} vs {miss_ratio2}" - ) - - -def test_compare_native_vs_python_hook(): - """Compare native LRU vs Python hook LRU using process_trace.""" - - cache_size = 512 * 1024 # 512KB cache - max_requests = 500 - - # Test native LRU - native_cache = lcs.LRU(cache_size) - reader1 = create_trace_reader() - if reader1 is None: - pytest.skip("Test trace file not found, skipping test") - - native_obj_miss_ratio, native_byte_miss_ratio = native_cache.process_trace(reader1, max_req=max_requests) - - # Test Python hook LRU - hook_cache = lcs.PythonHookCachePolicy(cache_size, "HookLRU") - - def init_hook(cache_size): - return OrderedDict() - - def hit_hook(lru_dict, obj_id, obj_size): - 
lru_dict.move_to_end(obj_id) - - def miss_hook(lru_dict, obj_id, obj_size): - lru_dict[obj_id] = True - - def eviction_hook(lru_dict, obj_id, obj_size): - return next(iter(lru_dict)) - - def remove_hook(lru_dict, obj_id): - lru_dict.pop(obj_id, None) - - hook_cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - - reader2 = create_trace_reader() - if reader2 is None: - pytest.skip("Warning: Cannot reopen trace file, skipping comparison") - return # Skip test - - hook_obj_miss_ratio, hook_byte_miss_ratio = hook_cache.process_trace(reader2, max_req=max_requests) - - # They should be very similar (allowing for some small differences due to implementation details) - assert abs(native_obj_miss_ratio - hook_obj_miss_ratio) < 0.05, ( - f"Too much difference: {abs(native_obj_miss_ratio - hook_obj_miss_ratio):.4f}" - ) - - -def test_error_handling(): - """Test error handling for process_trace.""" - - cache = lcs.PythonHookCachePolicy(1024) - - reader = create_trace_reader() - if reader is None: - pytest.skip("Test trace file not found, skipping error test") - - # Try to process trace without setting hooks - should raise RuntimeError - with pytest.raises(RuntimeError, match="Hooks must be set before processing trace"): - cache.process_trace(reader) - - -def test_lru_implementation_accuracy(): - """Test that Python hook LRU implementation matches native LRU closely.""" - - cache_size = 1024 * 1024 # 1MB - max_requests = 100 - - # Create readers - reader1 = create_trace_reader() - reader2 = create_trace_reader() - - if not reader1 or not reader2: - pytest.skip("Cannot open trace files for LRU accuracy test") - - # Test native LRU - native_cache = lcs.LRU(cache_size) - native_obj_miss_ratio, native_byte_miss_ratio = native_cache.process_trace(reader1, max_req=max_requests) - - # Test Python hook LRU - hook_cache = lcs.PythonHookCachePolicy(cache_size, "AccuracyTestLRU") - init_hook, hit_hook, miss_hook, eviction_hook, remove_hook = 
create_optimized_lru_hooks() - hook_cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - - hook_obj_miss_ratio, hook_byte_miss_ratio = hook_cache.process_trace(reader2, max_req=max_requests) - - # Calculate difference - difference = abs(native_obj_miss_ratio - hook_obj_miss_ratio) - percentage_diff = (difference / native_obj_miss_ratio) * 100 if native_obj_miss_ratio > 0 else 0 - - # Assert that the difference is small (< 5%) - assert percentage_diff < 5.0, f"LRU implementation difference too large: {percentage_diff:.4f}%" - - -def create_optimized_lru_hooks(): - """Create optimized LRU hooks that closely match native LRU behavior.""" - - def init_hook(cache_size): - return OrderedDict() - - def hit_hook(lru_dict, obj_id, obj_size): - if obj_id in lru_dict: - lru_dict.move_to_end(obj_id) - - def miss_hook(lru_dict, obj_id, obj_size): - lru_dict[obj_id] = obj_size - - def eviction_hook(lru_dict, obj_id, obj_size): - if lru_dict: - return next(iter(lru_dict)) - return obj_id - - def remove_hook(lru_dict, obj_id): - lru_dict.pop(obj_id, None) - - return init_hook, hit_hook, miss_hook, eviction_hook, remove_hook diff --git a/libCacheSim-python/tests/test_python_hook_cache.py b/libCacheSim-python/tests/test_python_hook_cache.py deleted file mode 100644 index 7af8873dc..000000000 --- a/libCacheSim-python/tests/test_python_hook_cache.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env python3 -""" -Test file for PythonHookCachePolicy functionality. -""" - -import pytest -import libcachesim as lcs -from dataclasses import dataclass -from collections import OrderedDict - - -@dataclass -class CacheTestCase: - """Represents a single test case for cache operations.""" - - request: tuple[int, int] # (obj_id, obj_size) - expected_hit: bool - expected_obj_count: int - description: str = "" - - -def create_lru_hooks(): - """Create standard LRU hooks for testing. 
- - Returns: - tuple: A tuple of (init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - """ - - def init_hook(cache_size): - return OrderedDict() - - def hit_hook(lru_dict, obj_id, obj_size): - lru_dict.move_to_end(obj_id) - - def miss_hook(lru_dict, obj_id, obj_size): - lru_dict[obj_id] = True - - def eviction_hook(lru_dict, obj_id, obj_size): - return next(iter(lru_dict)) - - def remove_hook(lru_dict, obj_id): - lru_dict.pop(obj_id, None) - - return init_hook, hit_hook, miss_hook, eviction_hook, remove_hook - - -def create_test_request(obj_id: int, obj_size: int) -> lcs.Request: - """Create a test request with given parameters. - - Args: - obj_id: Object ID - obj_size: Object size in bytes - - Returns: - Request: A configured request object - """ - req = lcs.Request() - req.obj_id = obj_id - req.obj_size = obj_size - return req - - -def test_python_hook_cache(): - """Test the Python hook cache implementation.""" - cache_size = 300 # 3 objects of size 100 each - cache = lcs.PythonHookCachePolicy(cache_size, "TestLRU") - - # Set up hooks - init_hook, hit_hook, miss_hook, eviction_hook, remove_hook = create_lru_hooks() - cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - - # Define test sequence - test_cases = [ - CacheTestCase((1, 100), False, 1, "Miss - insert 1"), - CacheTestCase((2, 100), False, 2, "Miss - insert 2"), - CacheTestCase((3, 100), False, 3, "Miss - insert 3 (cache full)"), - CacheTestCase((1, 100), True, 3, "Hit - move 1 to end"), - CacheTestCase((4, 100), False, 3, "Miss - should evict 2 (LRU), insert 4"), - CacheTestCase((2, 100), False, 3, "Miss - should evict 3, insert 2"), - CacheTestCase((1, 100), True, 3, "Hit - move 1 to end"), - ] - - # Execute test sequence - for i, test_case in enumerate(test_cases): - obj_id, obj_size = test_case.request - req = create_test_request(obj_id, obj_size) - - result = cache.get(req) - assert result == test_case.expected_hit, f"Request {i + 1} (obj_id={obj_id}):" - f"Expected 
{'hit' if test_case.expected_hit else 'miss'} - {test_case.description}" - assert cache.n_obj == test_case.expected_obj_count, ( - f"Request {i + 1}: Expected {test_case.expected_obj_count} objects - {test_case.description}" - ) - assert cache.occupied_byte <= cache_size, f"Request {i + 1}: Cache size exceeded" - - -def test_error_handling(): - """Test error handling for uninitialized cache.""" - cache = lcs.PythonHookCachePolicy(1000) - - # Try to use cache without setting hooks - req = create_test_request(1, 100) - - with pytest.raises(RuntimeError): - cache.get(req) - - -def test_lru_comparison(): - """Test Python hook LRU against native LRU to verify identical behavior.""" - cache_size = 300 # 3 objects of size 100 each - - # Create native LRU cache - native_lru = lcs.LRU(cache_size) - - # Create Python hook LRU cache - hook_lru = lcs.PythonHookCachePolicy(cache_size, "TestLRU") - init_hook, hit_hook, miss_hook, eviction_hook, remove_hook = create_lru_hooks() - hook_lru.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - - # Define test sequence with various access patterns - test_cases = [ - CacheTestCase((1, 100), False, 1, "Miss - insert 1"), - CacheTestCase((2, 100), False, 2, "Miss - insert 2"), - CacheTestCase((3, 100), False, 3, "Miss - insert 3 (cache full)"), - CacheTestCase((1, 100), True, 3, "Hit - move 1 to end"), - CacheTestCase((4, 100), False, 3, "Miss - should evict 2 (LRU), insert 4"), - CacheTestCase((2, 100), False, 3, "Miss - should evict 3, insert 2"), - CacheTestCase((1, 100), True, 3, "Hit - move 1 to end"), - CacheTestCase((3, 100), False, 3, "Miss - should evict 4, insert 3"), - CacheTestCase((5, 100), False, 3, "Miss - should evict 2, insert 5"), - CacheTestCase((1, 100), True, 3, "Hit - move 1 to end"), - CacheTestCase((3, 100), True, 3, "Hit - move 3 to end"), - CacheTestCase((6, 100), False, 3, "Miss - should evict 5, insert 6"), - ] - - # Test both caches with identical requests - for i, test_case in 
enumerate(test_cases): - obj_id, obj_size = test_case.request - - # Test native LRU - req_native = create_test_request(obj_id, obj_size) - native_result = native_lru.get(req_native) - - # Test hook LRU - req_hook = create_test_request(obj_id, obj_size) - hook_result = hook_lru.get(req_hook) - - # Compare results - assert native_result == hook_result, ( - f"Request {i + 1} (obj_id={obj_id}): Native and hook LRU differ - {test_case.description}" - ) - - # Compare cache statistics - assert native_lru.n_obj == hook_lru.n_obj, f"Request {i + 1}: Object count differs - {test_case.description}" - assert native_lru.occupied_byte == hook_lru.occupied_byte, ( - f"Request {i + 1}: Occupied bytes differ - {test_case.description}" - ) - - -def test_lru_comparison_variable_sizes(): - """Test Python hook LRU vs Native LRU with variable object sizes.""" - cache_size = 1000 # Total cache capacity - - # Create caches - native_lru = lcs.LRU(cache_size) - hook_lru = lcs.PythonHookCachePolicy(cache_size, "VariableSizeLRU") - - init_hook, hit_hook, miss_hook, eviction_hook, remove_hook = create_lru_hooks() - hook_lru.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - - # Define test sequence with variable object sizes - test_cases = [ - CacheTestCase((1, 200), False, 1, "Miss - insert 1 (200 bytes)"), - CacheTestCase((2, 300), False, 2, "Miss - insert 2 (300 bytes)"), - CacheTestCase((3, 400), False, 3, "Miss - insert 3 (400 bytes) - total 900 bytes"), - CacheTestCase((4, 200), False, 3, "Miss - should evict 1, insert 4 (total would be 1100, over limit)"), - CacheTestCase((1, 200), False, 3, "Miss - should evict 2, insert 1"), - CacheTestCase((5, 100), False, 3, "Miss - should evict 3, insert 5"), - CacheTestCase((4, 200), True, 3, "Hit - access 4"), - CacheTestCase((6, 500), False, 2, "Miss - should evict multiple objects to fit"), - CacheTestCase((4, 200), False, 3, "Miss - 4 was evicted"), - ] - - # Test both caches with identical requests - for i, test_case in 
enumerate(test_cases): - obj_id, obj_size = test_case.request - - # Test native LRU - req_native = create_test_request(obj_id, obj_size) - native_result = native_lru.get(req_native) - - # Test hook LRU - req_hook = create_test_request(obj_id, obj_size) - hook_result = hook_lru.get(req_hook) - - # Compare results - assert native_result == hook_result, ( - f"Request {i + 1} (obj_id={obj_id}, size={obj_size}): Results differ - {test_case.description}" - ) - - # Compare cache statistics - assert native_lru.n_obj == hook_lru.n_obj, f"Request {i + 1}: Object count differs - {test_case.description}" - assert native_lru.occupied_byte == hook_lru.occupied_byte, ( - f"Request {i + 1}: Occupied bytes differ - {test_case.description}" - ) diff --git a/libCacheSim-python/tests/test_trace_generator.py b/libCacheSim-python/tests/test_trace_generator.py deleted file mode 100644 index 37040026e..000000000 --- a/libCacheSim-python/tests/test_trace_generator.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python3 -""" -Tests for trace generator module. 
-""" - -import libcachesim as lcs - - -class TestTraceGeneration: - """Test trace generation functions.""" - - # Constants for test readability - NUM_SAMPLE_REQUESTS = 10 # Number of requests to check in detail - - def test_create_zipf_requests_basic(self): - """Test basic Zipf request creation.""" - generator = lcs.create_zipf_requests(num_objects=100, num_requests=1000, alpha=1.0, obj_size=4000, seed=42) - - # Test iteration - requests = list(generator) - assert len(requests) == 1000 - - for req in requests[: self.NUM_SAMPLE_REQUESTS]: # Check first NUM_SAMPLE_REQUESTS - assert isinstance(req, lcs.Request) - assert 0 <= req.obj_id < 100 - assert req.obj_size == 4000 - assert req.clock_time >= 0 - - def test_create_uniform_requests_basic(self): - """Test basic uniform request creation.""" - generator = lcs.create_uniform_requests(num_objects=100, num_requests=1000, obj_size=4000, seed=42) - - # Test iteration - requests = list(generator) - assert len(requests) == 1000 - - for req in requests[: self.NUM_SAMPLE_REQUESTS]: # Check first NUM_SAMPLE_REQUESTS - assert isinstance(req, lcs.Request) - assert 0 <= req.obj_id < 100 - assert req.obj_size == 4000 - assert req.clock_time >= 0 - - def test_zipf_reproducibility(self): - """Test reproducibility with seed.""" - gen1 = lcs.create_zipf_requests(10, 100, alpha=1.0, seed=42) - gen2 = lcs.create_zipf_requests(10, 100, alpha=1.0, seed=42) - - requests1 = list(gen1) - requests2 = list(gen2) - - assert len(requests1) == len(requests2) - for req1, req2 in zip(requests1, requests2): - assert req1.obj_id == req2.obj_id - - def test_uniform_reproducibility(self): - """Test reproducibility with seed.""" - gen1 = lcs.create_uniform_requests(10, 100, seed=42) - gen2 = lcs.create_uniform_requests(10, 100, seed=42) - - requests1 = list(gen1) - requests2 = list(gen2) - - assert len(requests1) == len(requests2) - for req1, req2 in zip(requests1, requests2): - assert req1.obj_id == req2.obj_id - - def test_different_seeds(self): - 
"""Test that different seeds produce different results.""" - gen1 = lcs.create_zipf_requests(10, 100, alpha=1.0, seed=42) - gen2 = lcs.create_zipf_requests(10, 100, alpha=1.0, seed=43) - - requests1 = [req.obj_id for req in gen1] - requests2 = [req.obj_id for req in gen2] - - assert requests1 != requests2 - - def test_zipf_with_cache(self): - """Test Zipf generator with cache simulation.""" - cache = lcs.LRU(cache_size=50 * 1024) # 50KB cache - generator = lcs.create_zipf_requests( - num_objects=100, - num_requests=1000, - alpha=1.0, - obj_size=1000, # 1KB objects - seed=42, - ) - - hit_count = 0 - for req in generator: - if cache.get(req): - hit_count += 1 - - # Should have some hits and some misses - assert 0 <= hit_count <= 1000 - assert hit_count > 0 # Should have some hits - - def test_uniform_with_cache(self): - """Test uniform generator with cache simulation.""" - cache = lcs.LRU(cache_size=50 * 1024) # 50KB cache - generator = lcs.create_uniform_requests( - num_objects=100, - num_requests=1000, - obj_size=1000, # 1KB objects - seed=42, - ) - - hit_count = 0 - for req in generator: - if cache.get(req): - hit_count += 1 - - # Should have some hits and some misses - assert 0 <= hit_count <= 1000 - assert hit_count > 0 # Should have some hits - - def test_custom_parameters(self): - """Test generators with custom parameters.""" - generator = lcs.create_zipf_requests( - num_objects=50, - num_requests=200, - alpha=1.5, - obj_size=2048, - time_span=3600, # 1 hour - start_obj_id=1000, - seed=123, - ) - - requests = list(generator) - assert len(requests) == 200 - - # Check custom parameters - for req in requests[: self.NUM_SAMPLE_REQUESTS // 2]: # Check fewer for shorter test - assert 1000 <= req.obj_id < 1050 # start_obj_id + num_objects - assert req.obj_size == 2048 - assert req.clock_time <= 3600 diff --git a/libCacheSim-python/tests/test_unified_interface.py b/libCacheSim-python/tests/test_unified_interface.py deleted file mode 100644 index a2c7c8c26..000000000 
--- a/libCacheSim-python/tests/test_unified_interface.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python3 -""" -Test the unified interface for all cache policies. -""" - -import sys -import os -import pytest - -# Add the parent directory to the Python path for development testing -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) - -try: - import libcachesim as lcs -except ImportError as e: - pytest.skip(f"libcachesim not available: {e}", allow_module_level=True) - -from collections import OrderedDict - - -def create_trace_reader(): - """Helper function to create a trace reader. - - Returns: - Reader or None: A trace reader instance, or None if trace file not found. - """ - data_file = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "cloudPhysicsIO.oracleGeneral.bin" - ) - if not os.path.exists(data_file): - return None - return lcs.open_trace(data_file, lcs.TraceType.ORACLE_GENERAL_TRACE) - - -def create_test_lru_hooks(): - """Create LRU hooks for testing. 
- - Returns: - tuple: A tuple of (init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - """ - - def init_hook(cache_size): - """Initialize LRU data structure.""" - return OrderedDict() - - def hit_hook(lru_dict, obj_id, obj_size): - """Handle cache hit by moving to end (most recently used).""" - if obj_id in lru_dict: - lru_dict.move_to_end(obj_id) - - def miss_hook(lru_dict, obj_id, obj_size): - """Handle cache miss by adding new object.""" - lru_dict[obj_id] = obj_size - - def eviction_hook(lru_dict, obj_id, obj_size): - """Return the least recently used object ID for eviction.""" - if lru_dict: - return next(iter(lru_dict)) - return obj_id - - def remove_hook(lru_dict, obj_id): - """Remove object from LRU structure.""" - lru_dict.pop(obj_id, None) - - return init_hook, hit_hook, miss_hook, eviction_hook, remove_hook - - -def test_unified_process_trace_interface(): - """Test that all cache policies have the same process_trace interface.""" - - cache_size = 1024 * 1024 # 1MB - max_requests = 100 - - # Create trace reader - reader = create_trace_reader() - if not reader: - pytest.skip("Skipping test: Trace file not available") - - # Test different cache policies - caches = { - "LRU": lcs.LRU(cache_size), - "FIFO": lcs.FIFO(cache_size), - "ARC": lcs.ARC(cache_size), - } - - # Add Python hook cache - python_cache = lcs.PythonHookCachePolicy(cache_size, "TestLRU") - init_hook, hit_hook, miss_hook, eviction_hook, remove_hook = create_test_lru_hooks() - python_cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - caches["Python Hook LRU"] = python_cache - - results = {} - for name, cache in caches.items(): - # Create fresh reader for each test - test_reader = create_trace_reader() - if not test_reader: - pytest.skip(f"Cannot create reader for {name} test") - - # Test process_trace method exists - assert hasattr(cache, "process_trace"), f"{name} missing process_trace method" - - # Test process_trace functionality - obj_miss_ratio, 
byte_miss_ratio = cache.process_trace(test_reader, max_req=max_requests) - results[name] = obj_miss_ratio - - # Verify miss_ratio is valid - assert 0.0 <= obj_miss_ratio <= 1.0, f"{name} returned invalid miss_ratio: {obj_miss_ratio}" - - # Verify we got results for all caches - assert len(results) == len(caches), "Not all caches were tested" - - -def test_unified_properties_interface(): - """Test that all cache policies have the same properties interface.""" - - cache_size = 1024 * 1024 - - # Create different cache types - caches = { - "LRU": lcs.LRU(cache_size), - "FIFO": lcs.FIFO(cache_size), - "Python Hook": lcs.PythonHookCachePolicy(cache_size, "TestCache"), - } - - required_properties = ["cache_size", "n_req", "n_obj", "occupied_byte"] - - for name, cache in caches.items(): - # Test all required properties exist - for prop in required_properties: - assert hasattr(cache, prop), f"{name} missing {prop} property" - - # Test cache_size is correct - assert cache.cache_size == cache_size, f"{name} cache_size mismatch" - - -def test_get_interface_consistency(): - """Test that get() method works consistently across all cache policies.""" - - cache_size = 1024 * 1024 - - # Create caches - caches = { - "LRU": lcs.LRU(cache_size), - "FIFO": lcs.FIFO(cache_size), - } - - # Add Python hook cache - python_cache = lcs.PythonHookCachePolicy(cache_size, "ConsistencyTest") - init_hook, hit_hook, miss_hook, eviction_hook, remove_hook = create_test_lru_hooks() - python_cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - caches["Python Hook"] = python_cache - - # Create a test request using the proper request class - test_req = lcs.Request() - test_req.obj_id = 1 - test_req.obj_size = 1024 - - for name, cache in caches.items(): - # Reset cache state for consistent testing - initial_n_req = cache.n_req - initial_n_obj = cache.n_obj - initial_occupied = cache.occupied_byte - - # Test get method exists - assert hasattr(cache, "get"), f"{name} missing get 
method" - - # Test first access (should be miss for new object) - result = cache.get(test_req) - - # Test properties updated correctly - assert cache.n_req > initial_n_req, f"{name} n_req not updated" - if not result: # If it was a miss, object should be added - assert cache.n_obj > initial_n_obj, f"{name} n_obj not updated after miss" - assert cache.occupied_byte > initial_occupied, f"{name} occupied_byte not updated after miss" - - # Test second access to same object (should be hit) - second_result = cache.get(test_req) - - # Second access should be a hit (unless cache is too small) - if cache.cache_size >= test_req.obj_size: - assert second_result, f"{name} second access should be a hit" diff --git a/libCacheSim-python/tests/utils.py b/libCacheSim-python/tests/utils.py deleted file mode 100644 index 0977cc815..000000000 --- a/libCacheSim-python/tests/utils.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - - -def get_reference_data(eviction_algo, cache_size_ratio): - data_file = os.path.join( # noqa: PTH118 - (os.path.dirname(os.path.dirname(__file__))), # noqa: PTH120 - "tests", - "reference.csv", - ) - with open(data_file) as f: # noqa: PTH123 - lines = f.readlines() - key = "3LCache" if eviction_algo == "ThreeLCache" else eviction_algo - for line in lines: - if line.startswith(f"{key},{cache_size_ratio}"): - return float(line.split(",")[-1]) - return None diff --git a/libCacheSim/traceReader/CMakeLists.txt b/libCacheSim/traceReader/CMakeLists.txt index 6e1b68ced..db28cb9f3 100644 --- a/libCacheSim/traceReader/CMakeLists.txt +++ b/libCacheSim/traceReader/CMakeLists.txt @@ -3,9 +3,9 @@ # ============================================================================== set(traceReader_sources_c - generalReader/binary.c - generalReader/csv.c - generalReader/txt.c + generalReader/binary.c + generalReader/csv.c + generalReader/txt.c generalReader/libcsv.c customizedReader/lcs.c reader.c diff --git a/scripts/install_python_dev.sh b/scripts/install_python_dev.sh index 
d878d89b9..a97159972 100644 --- a/scripts/install_python_dev.sh +++ b/scripts/install_python_dev.sh @@ -39,7 +39,7 @@ echo "Building Python binding..." echo "Sync python version..." python scripts/sync_python_version.py pushd libCacheSim-python -pip install -e . -vvv +python -m pip install -e . -vvv popd # Test that the import works From 0fff17647c286f868bbad1bd29b794372a05ca6b Mon Sep 17 00:00:00 2001 From: haochengxia Date: Mon, 21 Jul 2025 00:03:24 -0400 Subject: [PATCH 2/4] Fix comments by copilot --- .../libcachesim/synthetic_reader.py | 15 ++- libCacheSim-python/src/export_cache.cpp | 121 ++++++++++++------ libCacheSim-python/src/export_reader.cpp | 18 ++- 3 files changed, 107 insertions(+), 47 deletions(-) diff --git a/libCacheSim-python/libcachesim/synthetic_reader.py b/libCacheSim-python/libcachesim/synthetic_reader.py index c9d3575fc..16f8a1046 100644 --- a/libCacheSim-python/libcachesim/synthetic_reader.py +++ b/libCacheSim-python/libcachesim/synthetic_reader.py @@ -95,7 +95,7 @@ def read_one_req(self, req: Request) -> Request: req.obj_id = obj_id req.obj_size = self.obj_size req.clock_time = self.current_pos * self.time_span // self.num_of_req - req.op = ReqOp.OP_NOP + req.op = ReqOp.OP_READ req.valid = True self.current_pos += 1 @@ -132,7 +132,7 @@ def read_first_req(self, req: Request) -> Request: req.obj_id = obj_id req.obj_size = self.obj_size req.clock_time = 0 - req.op = ReqOp.OP_NOP + req.op = ReqOp.OP_READ req.valid = True return req @@ -146,7 +146,7 @@ def read_last_req(self, req: Request) -> Request: req.obj_id = obj_id req.obj_size = self.obj_size req.clock_time = (self.num_of_req - 1) * self.time_span // self.num_of_req - req.op = ReqOp.OP_NOP + req.op = ReqOp.OP_READ req.valid = True return req @@ -165,7 +165,7 @@ def read_one_req_above(self, req: Request) -> Request: req.obj_id = obj_id req.obj_size = self.obj_size req.clock_time = (self.current_pos + 1) * self.time_span // self.num_of_req - req.op = ReqOp.OP_NOP + req.op = ReqOp.OP_READ 
req.valid = True return req @@ -207,7 +207,7 @@ def __getitem__(self, index: int) -> Request: req.obj_id = obj_id req.obj_size = self.obj_size req.clock_time = index * self.time_span // self.num_of_req - req.op = ReqOp.OP_NOP + req.op = ReqOp.OP_READ req.valid = True return req @@ -256,7 +256,8 @@ def _gen_uniform(m: int, n: int, start: int = 0) -> np.ndarray: """ if m <= 0 or n <= 0: raise ValueError("num_objects and num_requests must be positive") - return np.random.randint(0, m, n) + start + # Optimized: directly generate in the target range for better performance + return np.random.randint(start, start + m, n) class _BaseRequestGenerator: @@ -302,7 +303,7 @@ def __iter__(self) -> Iterator[Request]: req.clock_time = i * self.time_span // self.num_requests req.obj_id = obj_id req.obj_size = self.obj_size - req.op = ReqOp.OP_NOP + req.op = ReqOp.OP_READ req.valid = True yield req diff --git a/libCacheSim-python/src/export_cache.cpp b/libCacheSim-python/src/export_cache.cpp index 3868866cc..fb383a236 100644 --- a/libCacheSim-python/src/export_cache.cpp +++ b/libCacheSim-python/src/export_cache.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include "config.h" @@ -58,7 +59,10 @@ struct RequestDeleter { // **** Python plugin cache implementation BEGIN **** // *********************************************************************** -typedef struct pypluginCache_params { +// Forward declaration with appropriate visibility +struct pypluginCache_params; + +typedef struct __attribute__((visibility("hidden"))) pypluginCache_params { py::object data; ///< Plugin's internal data structure (python object) py::function cache_init_hook; py::function cache_hit_hook; @@ -69,6 +73,23 @@ typedef struct pypluginCache_params { std::string cache_name; } pypluginCache_params_t; +// Custom deleter for pypluginCache_params_t +struct PypluginCacheParamsDeleter { + void operator()(pypluginCache_params_t* ptr) const { + if (ptr != nullptr) { + // Call the free hook if available 
before deletion + if (!ptr->cache_free_hook.is_none()) { + try { + ptr->cache_free_hook(ptr->data); + } catch (...) { + // Ignore exceptions during cleanup to prevent double-fault + } + } + delete ptr; + } + } +}; + static void pypluginCache_free(cache_t* cache); static bool pypluginCache_get(cache_t* cache, const request_t* req); static cache_obj_t* pypluginCache_find(cache_t* cache, const request_t* req, @@ -84,47 +105,71 @@ cache_t* pypluginCache_init( py::function cache_init_hook, py::function cache_hit_hook, py::function cache_miss_hook, py::function cache_eviction_hook, py::function cache_remove_hook, py::function cache_free_hook) { - // Initialize base cache structure - cache_t* cache = cache_struct_init(cache_name.c_str(), ccache_params, NULL); - - // Set function pointers for cache operations - cache->cache_init = NULL; - cache->cache_free = pypluginCache_free; - cache->get = pypluginCache_get; - cache->find = pypluginCache_find; - cache->insert = pypluginCache_insert; - cache->evict = pypluginCache_evict; - cache->remove = pypluginCache_remove; - cache->to_evict = pypluginCache_to_evict; - cache->get_occupied_byte = cache_get_occupied_byte_default; - cache->get_n_obj = cache_get_n_obj_default; - cache->can_insert = cache_can_insert_default; - cache->obj_md_size = 0; - - // Allocate and initialize plugin parameters - pypluginCache_params_t* params = new pypluginCache_params_t(); - params->cache_name = cache_name; - params->cache_init_hook = cache_init_hook; - params->cache_hit_hook = cache_hit_hook; - params->cache_miss_hook = cache_miss_hook; - params->cache_eviction_hook = cache_eviction_hook; - params->cache_remove_hook = cache_remove_hook; - params->cache_free_hook = cache_free_hook; - params->data = cache_init_hook(ccache_params); - - cache->eviction_params = params; - - return cache; + // Initialize base cache structure with exception safety + cache_t* cache = nullptr; + std::unique_ptr params; + + try { + cache = 
cache_struct_init(cache_name.c_str(), ccache_params, NULL); + if (!cache) { + throw std::runtime_error("Failed to initialize cache structure"); + } + + // Set function pointers for cache operations + cache->cache_init = NULL; + cache->cache_free = pypluginCache_free; + cache->get = pypluginCache_get; + cache->find = pypluginCache_find; + cache->insert = pypluginCache_insert; + cache->evict = pypluginCache_evict; + cache->remove = pypluginCache_remove; + cache->to_evict = pypluginCache_to_evict; + cache->get_occupied_byte = cache_get_occupied_byte_default; + cache->get_n_obj = cache_get_n_obj_default; + cache->can_insert = cache_can_insert_default; + cache->obj_md_size = 0; + + // Allocate and initialize plugin parameters using smart pointer with custom + // deleter + params = + std::unique_ptr( + new pypluginCache_params_t(), PypluginCacheParamsDeleter()); + params->cache_name = cache_name; + params->cache_init_hook = cache_init_hook; + params->cache_hit_hook = cache_hit_hook; + params->cache_miss_hook = cache_miss_hook; + params->cache_eviction_hook = cache_eviction_hook; + params->cache_remove_hook = cache_remove_hook; + params->cache_free_hook = cache_free_hook; + + // Initialize the cache data - this might throw + params->data = cache_init_hook(ccache_params); + + // Transfer ownership to the cache structure + cache->eviction_params = params.release(); + + return cache; + + } catch (...) 
{ + // Clean up on exception + if (cache) { + cache_struct_free(cache); + } + // params will be automatically cleaned up by smart pointer destructor + throw; // Re-throw the exception + } } static void pypluginCache_free(cache_t* cache) { - pypluginCache_params_t* params = - (pypluginCache_params_t*)cache->eviction_params; - - if (!params->cache_free_hook.is_none()) { - params->cache_free_hook(params->data); + if (!cache || !cache->eviction_params) { + return; } - delete params; + + // Use smart pointer for automatic cleanup + std::unique_ptr params( + static_cast(cache->eviction_params)); + + // The smart pointer destructor will handle cleanup automatically cache_struct_free(cache); } diff --git a/libCacheSim-python/src/export_reader.cpp b/libCacheSim-python/src/export_reader.cpp index f9c3789b6..468f54289 100644 --- a/libCacheSim-python/src/export_reader.cpp +++ b/libCacheSim-python/src/export_reader.cpp @@ -42,7 +42,14 @@ struct RequestDeleter { struct ReaderInitParamDeleter { void operator()(reader_init_param_t* ptr) const { - if (ptr != nullptr) free(ptr); + if (ptr != nullptr) { + // Free the strdup'ed string if it exists + if (ptr->binary_fmt_str != nullptr) { + free(ptr->binary_fmt_str); + ptr->binary_fmt_str = nullptr; + } + free(ptr); + } } }; @@ -123,9 +130,16 @@ void export_reader(py::module& m) { const std::string& delimiter, ssize_t trace_start_offset, sampler_t* sampler) { reader_init_param_t params = default_reader_init_params(); + + // Safe string handling with proper error checking if (!binary_fmt_str.empty()) { - params.binary_fmt_str = strdup(binary_fmt_str.c_str()); + char* fmt_str = strdup(binary_fmt_str.c_str()); + if (!fmt_str) { + throw std::bad_alloc(); + } + params.binary_fmt_str = fmt_str; } + params.ignore_obj_size = ignore_obj_size; params.ignore_size_zero_req = ignore_size_zero_req; params.obj_id_is_num = obj_id_is_num; From 7f11a0aec4f52fb6a5ed7e46f5522119610599ee Mon Sep 17 00:00:00 2001 From: haochengxia Date: Mon, 21 Jul 2025 
00:47:12 -0400 Subject: [PATCH 3/4] Preserve reader_protocol only --- libCacheSim-python/libcachesim/__init__.pyi | 12 ++-- libCacheSim-python/libcachesim/cache.py | 6 +- libCacheSim-python/libcachesim/protocols.py | 69 +++++-------------- .../libcachesim/trace_analyzer.py | 10 ++- libCacheSim-python/libcachesim/util.py | 10 +-- 5 files changed, 38 insertions(+), 69 deletions(-) diff --git a/libCacheSim-python/libcachesim/__init__.pyi b/libCacheSim-python/libcachesim/__init__.pyi index 213eb1eb8..2e2a565e5 100644 --- a/libCacheSim-python/libcachesim/__init__.pyi +++ b/libCacheSim-python/libcachesim/__init__.pyi @@ -3,7 +3,7 @@ from typing import bool, int, str, tuple from collections.abc import Iterator from .libcachesim_python import ReqOp, TraceType, SamplerType -from .protocols import ReaderProtocol, CacheProtocol +from .protocols import ReaderProtocol class Request: clock_time: int @@ -59,8 +59,8 @@ class Cache: def get_n_obj(self) -> int: ... def print_cache(self) -> str: ... -class CacheBase(CacheProtocol): - """Base class implementing CacheProtocol""" +class CacheBase: + """Base class for all cache implementations""" def __init__(self, _cache: Cache): ... def get(self, req: Request) -> bool: ... def find(self, req: Request, update_cache: bool = True) -> CacheObject: ... @@ -219,6 +219,7 @@ def create_zipf_requests( start_obj_id: int = 0, seed: int | None = None, ) -> Iterator[Request]: ... + def create_uniform_requests( num_objects: int, num_requests: int, @@ -230,8 +231,9 @@ def create_uniform_requests( # Analyzer class TraceAnalyzer: - def __init__(self, analyzer): ... - def analyze(self, reader: ReaderProtocol, output_path: str, analysis_param, analysis_option) -> None: ... + def __init__(self, analyzer, reader: ReaderProtocol, output_path: str, analysis_param, analysis_option): ... + def run(self) -> None: ... + def cleanup(self) -> None: ... 
# Utilities class Util: diff --git a/libCacheSim-python/libcachesim/cache.py b/libCacheSim-python/libcachesim/cache.py index 3f3a2bd38..3e40249e1 100644 --- a/libCacheSim-python/libcachesim/cache.py +++ b/libCacheSim-python/libcachesim/cache.py @@ -40,11 +40,11 @@ c_process_trace, ) -from .protocols import CacheProtocol, ReaderProtocol +from .protocols import ReaderProtocol -class CacheBase(CacheProtocol): - """Base class for all cache implementations that implements CacheProtocol""" +class CacheBase(ABC): + """Base class for all cache implementations""" _cache: Cache # Internal C++ cache object diff --git a/libCacheSim-python/libcachesim/protocols.py b/libCacheSim-python/libcachesim/protocols.py index d362946a0..d2e7b8170 100644 --- a/libCacheSim-python/libcachesim/protocols.py +++ b/libCacheSim-python/libcachesim/protocols.py @@ -1,71 +1,34 @@ -from __future__ import annotations - -from typing import Protocol, TYPE_CHECKING - -if TYPE_CHECKING: - from .libcachesim_python import Request, CacheObject, Reader, Analyzer - - -class CacheProtocol(Protocol): - def get(self, req: Request) -> bool: ... - - def find(self, req: Request, update_cache: bool = True) -> CacheObject: ... - - def can_insert(self, req: Request) -> bool: ... +""" +Reader protocol for libCacheSim Python bindings. - def insert(self, req: Request) -> CacheObject: ... +ReaderProtocol defines the interface contract for trace readers, +enabling different implementations (Python/C++) to work interchangeably. +""" - def need_eviction(self, req: Request) -> bool: ... - - def evict(self, req: Request) -> CacheObject: ... - - def remove(self, obj_id: int) -> bool: ... - - def to_evict(self, req: Request) -> CacheObject: ... - - def get_occupied_byte(self) -> int: ... - - def get_n_obj(self) -> int: ... - - def print_cache(self) -> str: ... 
+from __future__ import annotations +from typing import Protocol, runtime_checkable, TYPE_CHECKING - def process_trace(self, reader: "ReaderProtocol", start_req: int = 0, max_req: int = -1) -> tuple[float, float]: ... +if TYPE_CHECKING: + from .libcachesim_python import Request - # Properties - @property - def cache_size(self) -> int: ... - @property - def cache_name(self) -> str: ... +@runtime_checkable +class ReaderProtocol(Protocol): + """Protocol for trace readers + This protocol ensures that different reader implementations + (SyntheticReader, TraceReader) can be used interchangeably. + """ -class ReaderProtocol(Protocol): def get_num_of_req(self) -> int: ... - def read_one_req(self, req: Request) -> Request: ... - def reset(self) -> None: ... - def close(self) -> None: ... - - def clone(self) -> ReaderProtocol: ... - + def clone(self) -> "ReaderProtocol": ... def read_first_req(self, req: Request) -> Request: ... - def read_last_req(self, req: Request) -> Request: ... - def skip_n_req(self, n: int) -> int: ... - def read_one_req_above(self, req: Request) -> Request: ... - def go_back_one_req(self) -> None: ... - def set_read_pos(self, pos: float) -> None: ... - def get_read_pos(self) -> float: ... - - -class AnalyzerProtocol(Protocol): - def run(self) -> None: ... - - def cleanup(self) -> None: ... 
diff --git a/libCacheSim-python/libcachesim/trace_analyzer.py b/libCacheSim-python/libcachesim/trace_analyzer.py index bf598a71b..46c0f63a6 100644 --- a/libCacheSim-python/libcachesim/trace_analyzer.py +++ b/libCacheSim-python/libcachesim/trace_analyzer.py @@ -1,6 +1,10 @@ """Wrapper of Analyzer""" +from __future__ import annotations -from .protocols import ReaderProtocol, AnalyzerProtocol +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .protocols import ReaderProtocol from .libcachesim_python import ( Analyzer, @@ -9,13 +13,13 @@ ) -class TraceAnalyzer(AnalyzerProtocol): +class TraceAnalyzer: _analyzer: Analyzer def __init__( self, analyzer: Analyzer, - reader: "ReaderProtocol", + reader: ReaderProtocol, output_path: str, analysis_param: AnalysisParam, analysis_option: AnalysisOption, diff --git a/libCacheSim-python/libcachesim/util.py b/libCacheSim-python/libcachesim/util.py index 0f80a7fb2..c9c351b35 100644 --- a/libCacheSim-python/libcachesim/util.py +++ b/libCacheSim-python/libcachesim/util.py @@ -1,9 +1,11 @@ """Wrapper misc functions""" +from __future__ import annotations from typing import TYPE_CHECKING if TYPE_CHECKING: - from .protocols import CacheProtocol, ReaderProtocol + from .protocols import ReaderProtocol + from .cache import CacheBase from .libcachesim_python import convert_to_oracleGeneral, convert_to_lcs, c_process_trace @@ -28,9 +30,7 @@ def convert_to_lcs(reader, ofilepath, output_txt=False, remove_size_change=False return convert_to_lcs(reader, ofilepath, output_txt, remove_size_change, lcs_ver) @staticmethod - def process_trace( - cache: "CacheProtocol", reader: "ReaderProtocol", start_req: int = 0, max_req: int = -1 - ) -> tuple[float, float]: + def process_trace(cache: CacheBase, reader: ReaderProtocol, start_req: int = 0, max_req: int = -1) -> tuple[float, float]: """ Process a trace with a cache. @@ -44,7 +44,7 @@ def process_trace( tuple[float, float]: The object miss ratio and byte miss ratio. 
""" # Check if reader is C++ reader - if not hasattr(reader, "c_reader") or not reader.c_reader: + if not hasattr(reader, 'c_reader') or not reader.c_reader: raise ValueError("Reader must be a C++ reader") return c_process_trace(cache._cache, reader._reader, start_req, max_req) From 7642dac89e51f3c92ecfee710be13bb4693abbf2 Mon Sep 17 00:00:00 2001 From: haochengxia Date: Wed, 23 Jul 2025 14:14:56 -0400 Subject: [PATCH 4/4] Connect S3 --- libCacheSim-python/libcachesim/__init__.py | 7 + libCacheSim-python/libcachesim/data_loader.py | 131 ++++++++++++++++++ libCacheSim-python/libcachesim/protocols.py | 15 +- .../libcachesim/trace_analyzer.py | 26 +++- libCacheSim-python/src/export_analyzer.cpp | 15 +- libCacheSim-python/tests/test_analyzer.py | 15 ++ libCacheSim-python/tests/test_data_loader.py | 8 ++ 7 files changed, 198 insertions(+), 19 deletions(-) create mode 100644 libCacheSim-python/libcachesim/data_loader.py create mode 100644 libCacheSim-python/tests/test_analyzer.py create mode 100644 libCacheSim-python/tests/test_data_loader.py diff --git a/libCacheSim-python/libcachesim/__init__.py b/libCacheSim-python/libcachesim/__init__.py index b9424a37b..f71c6ee47 100644 --- a/libCacheSim-python/libcachesim/__init__.py +++ b/libCacheSim-python/libcachesim/__init__.py @@ -8,6 +8,8 @@ ReqOp, TraceType, SamplerType, + AnalysisParam, + AnalysisOption, __doc__, __version__, ) @@ -43,6 +45,7 @@ from .trace_analyzer import TraceAnalyzer from .synthetic_reader import SyntheticReader, create_zipf_requests, create_uniform_requests from .util import Util +from .data_loader import DataLoader __all__ = [ # Core classes @@ -51,6 +54,8 @@ "ReqOp", "TraceType", "SamplerType", + "AnalysisParam", + "AnalysisOption", # Cache base class "CacheBase", # Core cache algorithms @@ -85,6 +90,8 @@ "create_uniform_requests", # Utilities "Util", + # Data loader + "DataLoader", # Metadata "__doc__", "__version__", diff --git a/libCacheSim-python/libcachesim/data_loader.py 
b/libCacheSim-python/libcachesim/data_loader.py new file mode 100644 index 000000000..fee5f9bc3 --- /dev/null +++ b/libCacheSim-python/libcachesim/data_loader.py @@ -0,0 +1,131 @@ +"""S3 Bucket data loader with local caching (HuggingFace-style).""" + +from __future__ import annotations + +import hashlib +import logging +import shutil +from pathlib import Path +from typing import Optional, Union +from urllib.parse import quote + +logger = logging.getLogger(__name__) + + +class DataLoader: + DEFAULT_BUCKET = "cache-datasets" + DEFAULT_CACHE_DIR = Path.home() / ".cache/libcachesim_hub" + + def __init__( + self, + bucket_name: str = DEFAULT_BUCKET, + cache_dir: Optional[Union[str, Path]] = None, + use_auth: bool = False + ): + self.bucket_name = bucket_name + self.cache_dir = Path(cache_dir) if cache_dir else self.DEFAULT_CACHE_DIR + self.use_auth = use_auth + self._s3_client = None + self._ensure_cache_dir() + + def _ensure_cache_dir(self) -> None: + (self.cache_dir / self.bucket_name).mkdir(parents=True, exist_ok=True) + + @property + def s3_client(self): + if self._s3_client is None: + try: + import boto3 + from botocore.config import Config + from botocore import UNSIGNED + + self._s3_client = boto3.client( + 's3', + config=None if self.use_auth else Config(signature_version=UNSIGNED) + ) + except ImportError: + raise ImportError("Install boto3: pip install boto3") + return self._s3_client + + def _cache_path(self, key: str) -> Path: + safe_name = hashlib.sha256(key.encode()).hexdigest()[:16] + "_" + quote(key, safe='') + return self.cache_dir / self.bucket_name / safe_name + + def _download(self, key: str, dest: Path) -> None: + temp = dest.with_suffix(dest.suffix + '.tmp') + temp.parent.mkdir(parents=True, exist_ok=True) + + try: + logger.info(f"Downloading s3://{self.bucket_name}/{key}") + obj = self.s3_client.get_object(Bucket=self.bucket_name, Key=key) + with open(temp, 'wb') as f: + f.write(obj['Body'].read()) + shutil.move(str(temp), str(dest)) + 
logger.info(f"Saved to: {dest}") + except Exception as e: + if temp.exists(): + temp.unlink() + raise RuntimeError(f"Download failed for s3://{self.bucket_name}/{key}: {e}") + + def load(self, key: str, force: bool = False, mode: str = 'rb') -> Union[bytes, str]: + path = self._cache_path(key) + if not path.exists() or force: + self._download(key, path) + with open(path, mode) as f: + return f.read() + + def is_cached(self, key: str) -> bool: + return self._cache_path(key).exists() + + def get_cache_path(self, key: str) -> Path: + return self._cache_path(key).as_posix() + + def clear_cache(self, key: Optional[str] = None) -> None: + if key: + path = self._cache_path(key) + if path.exists(): + path.unlink() + logger.info(f"Cleared: {path}") + else: + shutil.rmtree(self.cache_dir, ignore_errors=True) + logger.info(f"Cleared entire cache: {self.cache_dir}") + + def list_cached_files(self) -> list[str]: + if not self.cache_dir.exists(): + return [] + return [ + str(p) for p in self.cache_dir.rglob('*') + if p.is_file() and not p.name.endswith('.tmp') + ] + + def get_cache_size(self) -> int: + return sum( + p.stat().st_size for p in self.cache_dir.rglob('*') if p.is_file() + ) + + def list_s3_objects(self, prefix: str = "", delimiter: str = "/") -> dict: + """ + List S3 objects and pseudo-folders under a prefix. 
+ + Args: + prefix: The S3 prefix to list under (like folder path) + delimiter: Use "/" to simulate folder structure + + Returns: + A dict with two keys: + - "folders": list of sub-prefixes (folders) + - "files": list of object keys (files) + """ + paginator = self.s3_client.get_paginator('list_objects_v2') + result = {"folders": [], "files": []} + + for page in paginator.paginate( + Bucket=self.bucket_name, + Prefix=prefix, + Delimiter=delimiter + ): + # CommonPrefixes are like subdirectories + result["folders"].extend(cp["Prefix"] for cp in page.get("CommonPrefixes", [])) + result["files"].extend(obj["Key"] for obj in page.get("Contents", [])) + + return result diff --git a/libCacheSim-python/libcachesim/protocols.py b/libCacheSim-python/libcachesim/protocols.py index d2e7b8170..58eeddbff 100644 --- a/libCacheSim-python/libcachesim/protocols.py +++ b/libCacheSim-python/libcachesim/protocols.py @@ -6,7 +6,7 @@ """ from __future__ import annotations -from typing import Protocol, runtime_checkable, TYPE_CHECKING +from typing import Iterator, Protocol, runtime_checkable, TYPE_CHECKING if TYPE_CHECKING: from .libcachesim_python import Request @@ -18,17 +18,16 @@ class ReaderProtocol(Protocol): This protocol ensures that different reader implementations (SyntheticReader, TraceReader) can be used interchangeably. + + Only core methods are defined here. """ def get_num_of_req(self) -> int: ... def read_one_req(self, req: Request) -> Request: ... + def skip_n_req(self, n: int) -> int: ... def reset(self) -> None: ... def close(self) -> None: ... def clone(self) -> "ReaderProtocol": ... - def read_first_req(self, req: Request) -> Request: ... - def read_last_req(self, req: Request) -> Request: ... - def skip_n_req(self, n: int) -> int: ... - def read_one_req_above(self, req: Request) -> Request: ... - def go_back_one_req(self) -> None: ... - def set_read_pos(self, pos: float) -> None: ... - def get_read_pos(self) -> float: ... + def __iter__(self) -> Iterator[Request]: ... 
+ def __next__(self) -> Request: ... + def __len__(self) -> int: ... diff --git a/libCacheSim-python/libcachesim/trace_analyzer.py b/libCacheSim-python/libcachesim/trace_analyzer.py index 46c0f63a6..4e51da41c 100644 --- a/libCacheSim-python/libcachesim/trace_analyzer.py +++ b/libCacheSim-python/libcachesim/trace_analyzer.py @@ -12,18 +12,38 @@ AnalysisParam, ) +# Import ReaderException +class ReaderException(Exception): + """Exception raised when reader is not compatible""" + pass class TraceAnalyzer: _analyzer: Analyzer def __init__( self, - analyzer: Analyzer, reader: ReaderProtocol, output_path: str, - analysis_param: AnalysisParam, - analysis_option: AnalysisOption, + analysis_param: AnalysisParam = None, + analysis_option: AnalysisOption = None, ): + """ + Initialize trace analyzer. + + Args: + reader: Reader protocol + output_path: Path to output file + analysis_param: Analysis parameters + analysis_option: Analysis options + """ + if not hasattr(reader, 'c_reader') or not reader.c_reader: + raise ReaderException("Only C/C++ reader is supported") + + if analysis_param is None: + analysis_param = AnalysisParam() + if analysis_option is None: + analysis_option = AnalysisOption() + self._analyzer = Analyzer(reader._reader, output_path, analysis_option, analysis_param) def run(self) -> None: diff --git a/libCacheSim-python/src/export_analyzer.cpp b/libCacheSim-python/src/export_analyzer.cpp index 0d8fd6680..f05c853ab 100644 --- a/libCacheSim-python/src/export_analyzer.cpp +++ b/libCacheSim-python/src/export_analyzer.cpp @@ -92,8 +92,8 @@ void export_analyzer(py::module& m) { AnalysisOptionDeleter>( new traceAnalyzer::analysis_option_t(option)); }), - "req_rate"_a = false, "access_pattern"_a = false, "size"_a = false, - "reuse"_a = false, "popularity"_a = false, "ttl"_a = false, + "req_rate"_a = true, "access_pattern"_a = true, "size"_a = true, + "reuse"_a = true, "popularity"_a = true, "ttl"_a = false, "popularity_decay"_a = false, "lifetime"_a = false, 
"create_future_reuse_ccdf"_a = false, "prob_at_age"_a = false, "size_change"_a = false) @@ -119,18 +119,17 @@ void export_analyzer(py::module& m) { py::class_>(m, "Analyzer") .def(py::init([](reader_t* reader, std::string output_path, - const traceAnalyzer::analysis_param_t& param, - const traceAnalyzer::analysis_option_t& option) { + const traceAnalyzer::analysis_option_t& option, + const traceAnalyzer::analysis_param_t& param) { traceAnalyzer::TraceAnalyzer* analyzer = new traceAnalyzer::TraceAnalyzer(reader, output_path, option, param); return std::unique_ptr(analyzer); }), "reader"_a, "output_path"_a, - "param"_a = traceAnalyzer::default_param(), - "option"_a = traceAnalyzer::default_option()) - .def("run", &traceAnalyzer::TraceAnalyzer::run) - .def("cleanup", &traceAnalyzer::TraceAnalyzer::cleanup); + "option"_a = traceAnalyzer::default_option(), + "param"_a = traceAnalyzer::default_param()) + .def("run", &traceAnalyzer::TraceAnalyzer::run); } } // namespace libcachesim diff --git a/libCacheSim-python/tests/test_analyzer.py b/libCacheSim-python/tests/test_analyzer.py new file mode 100644 index 000000000..f5d854345 --- /dev/null +++ b/libCacheSim-python/tests/test_analyzer.py @@ -0,0 +1,15 @@ +from libcachesim import TraceAnalyzer, TraceReader, DataLoader +import os + + +def test_analyzer_common(): + # Add debugging and error handling + loader = DataLoader() + loader.load("cache_dataset_oracleGeneral/2020_tencentBlock/1K/tencentBlock_1621.oracleGeneral.zst") + file_path = loader.get_cache_path("cache_dataset_oracleGeneral/2020_tencentBlock/1K/tencentBlock_1621.oracleGeneral.zst") + + reader = TraceReader(file_path) + + analyzer = TraceAnalyzer(reader, output_path="./") + + analyzer.run() diff --git a/libCacheSim-python/tests/test_data_loader.py b/libCacheSim-python/tests/test_data_loader.py new file mode 100644 index 000000000..5aba6f5f2 --- /dev/null +++ b/libCacheSim-python/tests/test_data_loader.py @@ -0,0 +1,8 @@ +from libcachesim import DataLoader + + +def 
test_data_loader_common():
+    loader = DataLoader()
+    loader.load("cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst")
+    path = loader.get_cache_path("cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst")
+    files = loader.list_s3_objects("cache_dataset_oracleGeneral/2007_msr/")