Cipher-unhsiV
diff --git a/metadata.json b/metadata.json
Lines changed: 1 addition & 1 deletion
diff --git a/private/evaluate.py b/private/evaluate.py
Lines changed: 116 additions & 61 deletions
diff --git a/private/result-img.png b/private/result-img.png
268 KB
@@ -7,7 +7,7 @@
   "num_classes": 10,
 
   "federated_learning": {
-    "num_clients": 100,
+    "num_clients": 260,
     "malicious_fraction": 0.2,
     "num_rounds": 20,
     "local_epochs": 1,
 
@@ -1,62 +1,117 @@
+# ==========================
+# Robust Evaluation Script
+# ==========================
+
+import argparse
+import pandas as pd
+import torch
+import numpy as np
+
+from sklearn.metrics import (
+    accuracy_score,
+    f1_score,
+    classification_report,
+    confusion_matrix
+)
+
+# --------------------------
+# Helpers
+# --------------------------
+
+import pandas as pd
 import torch
-import csv
-import sys
-from sklearn.metrics import accuracy_score, f1_score
-
-# -------------------------
-# Paths (adjust if needed)
-# -------------------------
-GROUND_TRUTH_PATH = "ground_truth_client_labels.pt"
-SUBMISSION_PATH = sys.argv[1]  # path to submission.csv
-
-# -------------------------
-# Load ground truth
-# -------------------------
-ground_truth = torch.load(GROUND_TRUTH_PATH)
-
-# Convert to sorted lists
-gt_labels = []
-pred_labels = []
-
-# -------------------------
-# Load submission
-# -------------------------
-submission = {}
-
-with open(SUBMISSION_PATH, "r") as f:
-    reader = csv.DictReader(f)
-    if "client_id" not in reader.fieldnames or "predicted_label" not in reader.fieldnames:
-        raise ValueError("Submission must contain 'client_id' and 'predicted_label' columns.")
-
-    for row in reader:
-        client_id = int(row["client_id"])
-        label = row["predicted_label"].strip().lower()
-
-        if label not in {"honest", "malicious"}:
-            raise ValueError(f"Invalid label '{label}' for client {client_id}")
-
-        submission[client_id] = label
-
-# -------------------------
-# Match predictions to ground truth
-# -------------------------
-for client_id in sorted(ground_truth.keys()):
-    if client_id not in submission:
-        raise ValueError(f"Missing prediction for client_id {client_id}")
-
-    gt_labels.append(ground_truth[client_id])
-    pred_labels.append(submission[client_id])
-
-# -------------------------
-# Compute metrics
-# -------------------------
-accuracy = accuracy_score(gt_labels, pred_labels)
-macro_f1 = f1_score(gt_labels, pred_labels, average="macro")
-
-# -------------------------
-# Output results
-# -------------------------
-print("Evaluation Results")
-print("------------------")
-print(f"Accuracy : {accuracy:.4f}")
-print(f"Macro F1 : {macro_f1:.4f}")
+
+# Import-time sanity check: report the sizes of the default input files.
+# NOTE(review): this looks like leftover debugging output. It runs before the
+# CLI arguments are parsed, so it only ever inspects the default paths --
+# consider removing it entirely.
+try:
+    sub = pd.read_csv("submission.csv")
+    gt = torch.load("ground_truth_client_labels.pt", map_location="cpu")
+except FileNotFoundError:
+    # The default files are legitimately absent when custom paths are given on
+    # the command line; skip the report instead of aborting the whole script.
+    sub = gt = None
+else:
+    print("Submission rows:", len(sub))
+    print("Ground truth len:", len(gt))
+
+
+def _normalize_gt_label(value):
+    """Map one raw ground-truth entry to ``"honest"`` or ``"malicious"``."""
+    if isinstance(value, str):
+        # Already a label string. A plain truthiness test would turn the
+        # non-empty string "honest" into "malicious", so normalize and
+        # validate the text instead.
+        label = value.strip().lower()
+        if label not in {"honest", "malicious"}:
+            raise ValueError(f"Invalid ground-truth label '{value}'")
+        return label
+
+    # Boolean / integer flag: truthy means malicious.
+    return "malicious" if value else "honest"
+
+
+def load_ground_truth(path):
+    """Load the ground-truth file and return one label string per client.
+
+    Accepted payloads (whatever ``torch.load`` returns for ``path``):
+
+    * a ``dict`` -- its values are emitted in ascending key order
+      (presumably keys are integer client ids; verify against the file);
+    * a tensor, list or array of boolean/integer flags, where a truthy
+      entry means ``"malicious"``;
+    * a list or array of ``"honest"`` / ``"malicious"`` strings
+      (case and surrounding whitespace are ignored).
+
+    Args:
+        path: Location of the file to pass to ``torch.load``.
+
+    Returns:
+        A list of ``"honest"`` / ``"malicious"`` strings.
+
+    Raises:
+        ValueError: If a string entry is neither ``"honest"`` nor
+            ``"malicious"``.
+    """
+    gt = torch.load(path, map_location="cpu")
+
+    if isinstance(gt, dict):
+        # np.array(dict) would produce a non-iterable 0-d object array, so
+        # flatten the mapping into a list ordered by key first.
+        gt = [gt[client_id] for client_id in sorted(gt)]
+    elif isinstance(gt, torch.Tensor):
+        gt = gt.cpu().numpy()
+
+    gt = np.array(gt)
+
+    return [_normalize_gt_label(x) for x in gt]
+
+
+def load_submission(path):
+    """Read a submission CSV and return ``(client_ids, labels)``.
+
+    The label column may be named ``predicted_label`` or ``label``
+    (``predicted_label`` wins when both exist). Client ids may be plain
+    integers or strings carrying a ``client_`` prefix. Labels are stripped
+    and lower-cased before validation.
+
+    Args:
+        path: Path of the CSV file to read.
+
+    Returns:
+        A tuple ``(ids, labels)`` of two lists in file order: the integer
+        client ids and the normalized ``"honest"`` / ``"malicious"`` labels.
+
+    Raises:
+        ValueError: If a required column is missing, a client id is empty,
+            or a label is neither ``"honest"`` nor ``"malicious"``.
+    """
+    df = pd.read_csv(path)
+
+    # Accept both column styles
+    if "predicted_label" in df.columns:
+        label_col = "predicted_label"
+    elif "label" in df.columns:
+        label_col = "label"
+    else:
+        raise ValueError("Submission must contain 'label' or 'predicted_label'")
+
+    # Fail with a readable message instead of a bare KeyError.
+    if "client_id" not in df.columns:
+        raise ValueError("Submission must contain a 'client_id' column")
+
+    ids = df["client_id"]
+    if ids.isna().any():
+        raise ValueError("Submission contains rows with a missing client_id")
+
+    # Accept both ID styles. Testing "not numeric" rather than
+    # "dtype == object" also covers pandas' dedicated string dtypes, which
+    # would otherwise leave ids as strings that sort lexicographically.
+    if not pd.api.types.is_numeric_dtype(ids):
+        ids = (
+            ids.astype(str)
+            .str.strip()
+            .str.replace("client_", "", regex=False)
+            .astype(int)
+        )
+
+    # Normalize case/whitespace so " Malicious" is not scored as a third class.
+    labels = df[label_col].astype(str).str.strip().str.lower()
+    invalid = sorted(set(labels) - {"honest", "malicious"})
+    if invalid:
+        raise ValueError(f"Invalid label(s) in submission: {invalid}")
+
+    return ids.tolist(), labels.tolist()
+
+
+# --------------------------
+# Main Evaluation
+# --------------------------
+
+def evaluate(submission_path, gt_path):
+    """Score a submission against the ground truth and print a report.
+
+    Predictions are ordered by ascending client id and compared
+    position-by-position with the list returned by ``load_ground_truth``.
+    NOTE(review): this assumes the ground-truth list is itself ordered by
+    ascending client id -- confirm against how the file is generated.
+
+    Args:
+        submission_path: Path of the submission CSV.
+        gt_path: Path of the ground-truth file.
+
+    Returns:
+        A tuple ``(accuracy, macro_f1)``.
+
+    Raises:
+        ValueError: If a client id appears more than once, or the number of
+            predictions differs from the number of ground-truth labels.
+    """
+    gt_labels = load_ground_truth(gt_path)
+    ids, pred_labels = load_submission(submission_path)
+
+    # A repeated id can hide a missing one while still passing the length
+    # check below, silently shifting every later prediction by one slot.
+    seen = set()
+    duplicates = set()
+    for client_id in ids:
+        if client_id in seen:
+            duplicates.add(client_id)
+        seen.add(client_id)
+    if duplicates:
+        raise ValueError(
+            f"Duplicate client_id(s) in submission: {sorted(duplicates)}"
+        )
+
+    if len(ids) != len(gt_labels):
+        raise ValueError(
+            "Prediction length mismatch with ground truth "
+            f"({len(ids)} predictions vs {len(gt_labels)} ground-truth labels)"
+        )
+
+    # Sort predictions by client_id only; the explicit key keeps the labels
+    # out of the comparison.
+    pred_sorted = [
+        label
+        for _, label in sorted(zip(ids, pred_labels), key=lambda pair: pair[0])
+    ]
+
+    # Metrics
+    acc = accuracy_score(gt_labels, pred_sorted)
+    macro_f1 = f1_score(gt_labels, pred_sorted, average="macro")
+
+    print("\n==============================")
+    print(" Evaluation Results")
+    print("==============================")
+    print(f"Accuracy   : {acc:.4f}")
+    print(f"Macro F1   : {macro_f1:.4f}")
+
+    print("\nConfusion Matrix")
+    print(confusion_matrix(gt_labels, pred_sorted))
+
+    print("\nClassification Report")
+    print(classification_report(gt_labels, pred_sorted))
+
+    return acc, macro_f1
+
+
+# --------------------------
+# CLI
+# --------------------------
+
+if __name__ == "__main__":
+    # One parser handles every invocation, so --gt and --help also work when
+    # the submission path is omitted.
+    parser = argparse.ArgumentParser(
+        description="Evaluate a client-label submission against the ground truth."
+    )
+    parser.add_argument(
+        "submission",
+        nargs="?",
+        default=None,
+        help="path to the submission CSV (default: submission.csv)",
+    )
+    parser.add_argument(
+        "--gt",
+        default="ground_truth_client_labels.pt",
+        help="path to the ground-truth file",
+    )
+    args = parser.parse_args()
+
+    submission_path = args.submission
+    if submission_path is None:
+        # Keep the original notice for the no-argument invocation.
+        print("No submission provided — using submission.csv")
+        submission_path = "submission.csv"
+
+    evaluate(submission_path, args.gt)
+