From 1b129b622b3cff1b22f8d5d79d7ff49d46455434 Mon Sep 17 00:00:00 2001
From: Edneam <chethasdileepz@gmail.com>
Date: Thu, 14 May 2026 11:57:03 +0530
Subject: [PATCH] feat: support Google Sheets datasets

---
 agentic_security/probe_data/data.py | 43 +++++++++++++++++++++++++++++
 docs/datasets.md                    | 28 +++++++++++++++++++
 2 files changed, 71 insertions(+)
diff --git a/agentic_security/probe_data/data.py b/agentic_security/probe_data/data.py
index 023151b8..3f301e9a 100644
--- a/agentic_security/probe_data/data.py
+++ b/agentic_security/probe_data/data.py
@@ -4,6 +4,7 @@
 from collections.abc import Callable, Iterator
 from functools import partial
 from typing import Any, TypeVar
+from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
 
 import httpx
 import pandas as pd
@@ -30,8 +31,50 @@
 
 
 # Core data loading utilities
+def normalize_google_sheets_csv_url(url: str) -> str:
+    """Convert public Google Sheets links to direct CSV export URLs.
+
+    URLs on other hosts or paths, and links that already request CSV, are
+    returned unchanged. The tab ``gid`` is read from the query string, falling
+    back to the ``#gid=`` fragment that browser URLs carry.
+    """
+    parsed = urlparse(url)
+    # hostname is lower-cased and, unlike netloc, excludes port and userinfo.
+    if parsed.hostname not in {"docs.google.com", "www.docs.google.com"}:
+        return url
+
+    path_parts = [part for part in parsed.path.split("/") if part]
+    if path_parts[1:2] == ["u"]:  # drop the /u/<n>/ signed-in account selector
+        del path_parts[1:3]
+    if len(path_parts) < 3 or path_parts[:2] != ["spreadsheets", "d"]:
+        return url
+
+    query = parse_qs(parsed.query)
+    # Links that already ask for CSV keep their extra selectors (range, sheet).
+    wants_csv = ["csv"] in (query.get("format"), query.get("output"))
+    if wants_csv or "out:csv" in query.get("tqx", [""])[0]:
+        return url
+    gid = query.get("gid", parse_qs(parsed.fragment).get("gid", [None]))[0]
+
+    # Published sheets (/d/e/<published-id>/pubhtml) export CSV from the sibling
+    # /pub endpoint with output=csv; regular sheets use /export with format=csv.
+    if len(path_parts) >= 4 and path_parts[2] == "e":
+        path = f"/spreadsheets/d/e/{path_parts[3]}/pub"
+        csv_query = {"output": "csv"}
+    else:
+        path = f"/spreadsheets/d/{path_parts[2]}/export"
+        csv_query = {"format": "csv"}
+    if gid is not None:
+        csv_query["gid"] = gid
+
+    return urlunparse(
+        parsed._replace(path=path, query=urlencode(csv_query), fragment="")
+    )
+
+
 def fetch_csv_content(url: str) -> str:
-    """Fetch CSV content from a URL."""
-    response = httpx.get(url)
+    """Fetch CSV text from a URL, normalizing Google Sheets links first."""
+    url = normalize_google_sheets_csv_url(url)
+    response = httpx.get(url, follow_redirects=True)  # Sheets export redirects
     response.raise_for_status()  # Raise exception for bad responses
     return response.content.decode("utf-8")
diff --git a/docs/datasets.md b/docs/datasets.md
index 92263cdb..cacdf7b6 100644
--- a/docs/datasets.md
+++ b/docs/datasets.md
@@ -11,6 +11,7 @@ Agentic Security allows you to extend datasets to enhance its capabilities.
 
 - CSV
 - JSON
+- Public Google Sheets links
 
 ## Example
 
@@ -20,6 +21,33 @@ To add a new dataset:
 cp my_dataset.csv datasets/
 ```
 
+## Google Sheets datasets
+
+Public Google Sheets can be used as CSV-backed datasets. Share the sheet so that anyone with the link can view it, put the column headers (including a `prompt` column) in the first row, and pass the sheet URL as a custom CSV source.
+
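+Agentic Security accepts the normal browser URL (or a published `/spreadsheets/d/e/<published-id>/pubhtml` link) and converts it to the matching CSV export URL at load time; the `gid` in the URL selects which tab is exported: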
+
+```text
+https://docs.google.com/spreadsheets/d/<sheet-id>/edit#gid=0
+```
+
+Example unified loader configuration:
+
+```python
+from agentic_security.probe_data.data import prepare_prompts_unified
+
+datasets = await prepare_prompts_unified(
+    [
+        {
+            "source_type": "csv",
+            "dataset_name": "team-redteam-sheet",
+            "url": "https://docs.google.com/spreadsheets/d/<sheet-id>/edit#gid=0",
+            "prompt_column": "prompt",
+        }
+    ]
+)
+```
+
 ## Further Reading
 
 For more details on dataset formats and processing, refer to the [API Reference](api_reference.md).