
Commit f25ed6f

For simulation
1 parent 5953c84 commit f25ed6f

11 files changed

Lines changed: 1293 additions & 90 deletions

experiments/k8s.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
import os
import json
from itertools import product
import subprocess
import time

# ==== CONFIGURATION ====

# NOTE: `traces` is not defined in this commit; a placeholder is shown here so the script runs.
traces = []  # e.g. trace file names such as those written to trace_set_files.csv

algos = ["S3FIFO", "ARC"]
params_list = [
    {"fifo_size_ratio": 0.1, "move_to_main_threshold": 2},
    {"fifo_size_ratio": 0.2, "move_to_main_threshold": 3},
]
image = "myrepo/cachesim:latest"  # Replace with your image
volume_path = "/mnt/moofs"

# ==== GENERATE JOB YAML ====
os.makedirs("jobs", exist_ok=True)
job_template = """
apiVersion: batch/v1
kind: Job
metadata:
  name: simulate-{job_id}
spec:
  template:
    spec:
      containers:
      - name: simulator
        image: {image}
        command:
          - "python"
          - "simulate.py"
          - "--trace"
          - "{trace}"
          - "--algo"
          - "{algo}"
          - "--params"
          - '{params}'
        volumeMounts:
        - name: moofs
          mountPath: /mnt/moofs
      restartPolicy: Never
      volumes:
      - name: moofs
        hostPath:
          path: {volume_path}
          type: Directory
  backoffLimit: 1
"""

job_map = []  # List of (job_id, job_name)
job_id = 0
for trace, algo, params in product(traces, algos, params_list):
    job_name = f"simulate-{job_id}"
    yaml = job_template.format(
        job_id=job_id,
        trace=trace,
        algo=algo,
        params=json.dumps(params),
        image=image,
        volume_path=volume_path,
    )
    with open(f"jobs/job-{job_id}.yaml", "w") as f:
        f.write(yaml)
    job_map.append((job_id, job_name))
    job_id += 1

# ==== APPLY JOBS ====
for job_id, job_name in job_map:
    subprocess.run(["kubectl", "apply", "-f", f"jobs/job-{job_id}.yaml"])

# ==== WAIT FOR COMPLETION ====
def all_jobs_done():
    result = subprocess.run(["kubectl", "get", "jobs", "-o", "json"], capture_output=True, text=True)
    jobs = json.loads(result.stdout)["items"]
    completed = sum(1 for job in jobs if job["status"].get("succeeded") == 1)
    return completed == len(job_map)

print("Waiting for jobs to complete...")
while not all_jobs_done():
    time.sleep(5)

# ==== GATHER LOGS ====
results = []
for job_id, job_name in job_map:
    pod_name = subprocess.run(
        ["kubectl", "get", "pods", "--selector=job-name=" + job_name, "-o", "jsonpath={.items[0].metadata.name}"],
        capture_output=True, text=True).stdout.strip()

    log = subprocess.run(["kubectl", "logs", pod_name], capture_output=True, text=True).stdout.strip()
    print(f"[{job_name}] {log}")
    if "Miss ratio:" in log:
        try:
            ratio = float(log.strip().split()[-1])
            results.append((job_name, ratio))
        except (ValueError, IndexError):
            results.append((job_name, None))

print("\nAll gathered results:")
for job_name, ratio in results:
    print(f"{job_name}: {ratio}")

experiments/simulate.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
from libcachesim import open_trace, TraceType, get_trace_file_path, get_trace_file_lists
from libcachesim.eviction import S3FIFO
import pandas as pd
import multiprocessing as mp
from functools import partial


def simulate(eviction_algo, trace_file_name: str, cache_size_ratio: float, ignore_obj_size: bool, eviction_algo_params: dict) -> float:
    # get the trace file path
    trace_file_path = get_trace_file_path(trace_file_name)
    # open the trace file
    reader = open_trace(trace_file_path, type=TraceType.ORACLE_GENERAL_TRACE,
                        ignore_obj_size=ignore_obj_size)
    # create a cache with the eviction policy
    cache = eviction_algo(cache_size=int(reader.get_wss(ignore_obj_size=ignore_obj_size) * cache_size_ratio), **eviction_algo_params)
    # process the trace
    miss_ratio = cache.process_trace(reader)
    # return the miss ratio
    return miss_ratio


def simulate_single_trace(args):
    """Simulate a single trace file; used by the multiprocessing workers."""
    trace_file_name, eviction_algo, cache_size_ratio, ignore_obj_size, eviction_algo_params = args
    try:
        miss_ratio = simulate(eviction_algo, trace_file_name, cache_size_ratio, ignore_obj_size, eviction_algo_params)
        return {"trace_file_name": trace_file_name, "miss_ratio": miss_ratio, "status": "success"}
    except Exception as e:
        return {"trace_file_name": trace_file_name, "miss_ratio": None, "status": f"error: {str(e)}"}


def main():
    ignore_obj_size = True
    cache_size_ratio = 0.01
    eviction_algo = S3FIFO
    eviction_algo_params = {
        "fifo_size_ratio": 0.1,
        "move_to_main_threshold": 2,
    }

    # get the trace files of the "msr" trace set
    files = get_trace_file_lists("msr")
    miss_ratios = []
    for file in files:
        trace_file_path = get_trace_file_path(file)
        # open the trace file
        reader = open_trace(trace_file_path, type=TraceType.ORACLE_GENERAL_TRACE, ignore_obj_size=ignore_obj_size)
        # create a cache with the eviction policy
        cache = eviction_algo(cache_size=int(reader.get_wss(ignore_obj_size=ignore_obj_size) * cache_size_ratio), **eviction_algo_params)
        # process the trace
        miss_ratio = cache.process_trace(reader)
        miss_ratios.append(miss_ratio)

    print(f"Miss ratio: {miss_ratios}")


def simulate_all_traces_parallel(num_processes=None):
    """Process all trace files in parallel with multiprocessing."""
    if num_processes is None:
        num_processes = mp.cpu_count()

    print(f"Using {num_processes} processes for parallel processing...")

    # read the list of trace files
    df = pd.read_csv("trace_set_files.csv")

    # simulation parameters
    eviction_algo = S3FIFO
    cache_size_ratio = 0.01
    ignore_obj_size = True
    eviction_algo_params = {"fifo_size_ratio": 0.1, "move_to_main_threshold": 2}

    # build the per-trace task arguments
    tasks = []
    for index, row in df.iterrows():
        trace_file_name = str(row["trace_file_name"])
        task_args = (trace_file_name, eviction_algo, cache_size_ratio, ignore_obj_size, eviction_algo_params)
        tasks.append(task_args)

    print(f"{len(tasks)} trace files to process in total")

    # run the tasks in a multiprocessing pool
    results = []
    with mp.Pool(processes=num_processes) as pool:
        # use imap so progress can be reported incrementally
        for i, result in enumerate(pool.imap(simulate_single_trace, tasks)):
            results.append(result)
            if (i + 1) % 10 == 0:
                print(f"Processed {i + 1}/{len(tasks)} files")

    # summarize the results
    successful_results = [r for r in results if r["status"] == "success"]
    failed_results = [r for r in results if r["status"] != "success"]

    print("\nProcessing complete!")
    print(f"Succeeded: {len(successful_results)} files")
    print(f"Failed: {len(failed_results)} files")

    if failed_results:
        print("\nFailed files:")
        for result in failed_results:
            print(f"  {result['trace_file_name']}: {result['status']}")

    # extract the miss ratios
    miss_ratios = [r["miss_ratio"] for r in successful_results]
    print(f"\nMiss ratios: {miss_ratios}")

    # save the results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv("simulation_results.csv", index=False)
    print("Results saved to simulation_results.csv")

    return results


def simulate_all_traces():
    """Original single-process version."""
    df = pd.read_csv("trace_set_files.csv")
    miss_ratios = []
    for index, row in df.iterrows():
        trace_file_name = str(row["trace_file_name"])
        miss_ratio = simulate(S3FIFO, trace_file_name, 0.01, True, {"fifo_size_ratio": 0.1, "move_to_main_threshold": 2})
        miss_ratios.append(miss_ratio)
    print(f"Miss ratio: {miss_ratios}")


if __name__ == "__main__":
    # use the multiprocessing version
    simulate_all_traces_parallel()
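
k8s.py launches each container with `python simulate.py --trace ... --algo ... --params ...`, but the script above only runs simulate_all_traces_parallel() and parses no arguments. A minimal sketch of the entry point that driver assumes, with cache_size_ratio and ignore_obj_size fixed to the values used above; the flag names mirror k8s.py, everything else is illustrative and not part of this commit:

import argparse
import json

from libcachesim.eviction import ARC  # S3FIFO is already imported at the top of this file

def cli():
    # Map the --algo string to an eviction class exported by libcachesim.eviction.
    algos = {"S3FIFO": S3FIFO, "ARC": ARC}
    parser = argparse.ArgumentParser()
    parser.add_argument("--trace", required=True)
    parser.add_argument("--algo", required=True, choices=sorted(algos))
    parser.add_argument("--params", default="{}", help="JSON dict of eviction parameters")
    args = parser.parse_args()
    # Reuse simulate() with the cache size ratio (0.01) and ignore_obj_size (True) used elsewhere here.
    miss_ratio = simulate(algos[args.algo], args.trace, 0.01, True, json.loads(args.params))
    print(f"Miss ratio: {miss_ratio}")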

experiments/trace_sel.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
"""
Trace selection
"""

from libcachesim import get_trace_file_lists
import pandas as pd


trace_set_names = ["alibabaBlock", "cloudphysics", "metaCDN", "metaKV", "metaStorage", "msr", "tencentBlock", "tencentPhoto", "twitter", "wiki", "cdn1", "cdn2"]  # fiu and systor are not available


def select_traces():
    df = pd.DataFrame(columns=["trace_set_name", "trace_file_name"])
    trace_set_files = []
    for trace_set_name in trace_set_names:
        try:
            trace_set_files.append(get_trace_file_lists(trace_set_name))
            current_trace_set_files = trace_set_files[-1]
            current_trace_set_files = [trace_file_name for trace_file_name in current_trace_set_files if ("1K" not in trace_file_name and "10K" not in trace_file_name and "100K" not in trace_file_name)]
            for trace_file_name in current_trace_set_files[:200]:
                new_row_df = pd.DataFrame([{"trace_set_name": trace_set_name, "trace_file_name": trace_file_name}])
                df = pd.concat([df, new_row_df], ignore_index=True)
            print(f"Trace set {trace_set_name} has {len(current_trace_set_files)} traces, selected {len(current_trace_set_files[:200])} traces")
        except FileNotFoundError:
            print(f"Trace set {trace_set_name} not found")
            continue
    df.to_csv("trace_set_files.csv", index=False)
    return trace_set_files


if __name__ == "__main__":
    select_traces()
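
As a quick sanity check on the selection, the CSV written above can be read back and grouped by trace set; a small sketch using the two columns trace_sel.py emits:

import pandas as pd

df = pd.read_csv("trace_set_files.csv")
# Each trace set should contribute at most 200 rows.
print(df.groupby("trace_set_name")["trace_file_name"].count())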

libCacheSim-python/libcachesim/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -8,8 +8,8 @@
     __version__,
     create_cache,
     open_trace,
+    TraceType,
 )
-from .const import TraceType
 from .eviction import (
     ARC,
     FIFO,
@@ -23,6 +23,9 @@
     TwoQ,
 )
 
+from .const import HF_CACHE_DIR
+from .dataset import get_trace_file_lists, get_trace_file_path
+
 __all__ = [
     "ARC",
     "FIFO",
@@ -42,5 +45,8 @@
     "__version__",
     "create_cache",
     "open_trace",
+    "get_trace_file_lists",
+    "get_trace_file_path",
+    "HF_CACHE_DIR",
     # TODO(haocheng): add more eviction policies
 ]
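
With this change the trace-type enum and the dataset helpers are importable straight from the package top level, which is what experiments/simulate.py relies on. A quick check, as a sketch:

from libcachesim import TraceType, get_trace_file_lists, get_trace_file_path, HF_CACHE_DIR

print(TraceType.ORACLE_GENERAL_TRACE)  # trace format used by the experiments
print(HF_CACHE_DIR)                    # "/mnt/cfs" per the const change below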

libCacheSim-python/libcachesim/__init__.pyi

Lines changed: 23 additions & 1 deletion
@@ -26,7 +26,24 @@ libCacheSim Python bindings
     TraceType
 """
 
-from .const import TraceType
+import enum
+
+class TraceType(enum.Enum):
+    CSV_TRACE = 0
+    BIN_TRACE = 1
+    PLAIN_TXT_TRACE = 2
+    ORACLE_GENERAL_TRACE = 3
+    LCS_TRACE = 4  # libCacheSim format
+    VSCSI_TRACE = 5
+    TWR_TRACE = 6
+    TWRNS_TRACE = 7
+    ORACLE_SIM_TWR_TRACE = 8
+    ORACLE_SYS_TWR_TRACE = 9
+    ORACLE_SIM_TWRNS_TRACE = 10
+    ORACLE_SYS_TWRNS_TRACE = 11
+    VALPIN_TRACE = 12
+    UNKNOWN_TRACE = 13
+
 
 
 def create_cache(
     eviction_algo: str,
@@ -146,3 +163,8 @@ class Reader:
     def get_wss(self, ignore_obj_size: bool = False) -> int: ...
     def __iter__(self) -> Reader: ...
     def __next__(self) -> Request: ...
+
+
+# -----------------------------
+def get_trace_file_lists(trace_set_name: str) -> list[str]: ...
+def get_trace_file_path(trace_file_name: str) -> str: ...

Lines changed: 2 additions & 9 deletions
@@ -1,11 +1,4 @@
 from __future__ import annotations
 
-import enum
-
-
-class TraceType(enum.Enum):
-    CSV_TRACE = 0
-    BIN_TRACE = 1
-    PLAIN_TXT_TRACE = 2
-    ORACLE_GENERAL_TRACE = 3
-    LCS_TRACE = 4  # libCacheSim format
+HF_CACHE_DIR = "/mnt/cfs"
+DATASET_NAME = "1a1a11a/cache_dataset_oracleGeneral"

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
from huggingface_hub import hf_hub_download, list_repo_files

from .const import DATASET_NAME, HF_CACHE_DIR


def get_trace_file_lists(trace_set_name: str) -> list[str]:
    """
    Get the list of trace files in the dataset.
    """
    repo_files = list_repo_files(repo_id=DATASET_NAME, repo_type="dataset")
    # Apply keyword filter
    keyword = trace_set_name.lower()
    return [file for file in repo_files if keyword in file.lower()]

def get_trace_file_path(trace_file_name: str) -> str:
    """
    Get the local path of a trace file in the dataset. Note that this downloads the file to the cache directory.
    """
    return hf_hub_download(
        repo_id=DATASET_NAME, repo_type="dataset", filename=trace_file_name, cache_dir=HF_CACHE_DIR)
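
Usage of the two helpers mirrors simulate.py's main(): list the dataset files whose names contain a keyword, then download one on demand (indexing the first element here is only for illustration):

from libcachesim import get_trace_file_lists, get_trace_file_path

msr_traces = get_trace_file_lists("msr")          # dataset files whose names contain "msr"
local_path = get_trace_file_path(msr_traces[0])   # downloads into HF_CACHE_DIR, returns the local path
print(local_path)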
