from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

# Hosts that serve the Google Sheets web UI and its export endpoints.
_GOOGLE_SHEETS_HOSTS = frozenset({"docs.google.com", "www.docs.google.com"})


def _is_direct_csv_endpoint(endpoint: list[str], query: dict[str, list[str]]) -> bool:
    """Return True when the URL already targets a Google Sheets CSV download."""
    if not endpoint:
        return False
    action = endpoint[0]
    if action == "export":
        return "csv" in query.get("format", [])
    if action == "pub":
        return "csv" in query.get("output", [])
    if action == "gviz":
        # Visualization API form: /gviz/tq?tqx=out:csv&sheet=<name>
        return any("out:csv" in value for value in query.get("tqx", []))
    return False


def normalize_google_sheets_csv_url(url: str) -> str:
    """Convert public Google Sheets links to direct CSV export URLs.

    The sheet must be shared so it is viewable by link. Recognised inputs:

    * ``/spreadsheets/d/<sheet-id>/edit#gid=0`` (normal browser URL) becomes
      ``/spreadsheets/d/<sheet-id>/export?format=csv&gid=0``.
    * ``/spreadsheets/u/<n>/d/<sheet-id>/...`` (account-scoped browser URL)
      is handled like the form above, with the ``u/<n>`` selector dropped.
    * ``/spreadsheets/d/e/<published-id>/pubhtml`` (published sheet) becomes
      ``/spreadsheets/d/e/<published-id>/pub?output=csv``.

    A ``gid`` found in the query string takes precedence over one in the
    fragment and is carried over to the export URL.

    Args:
        url: Any URL; only Google Sheets links are rewritten.

    Returns:
        The CSV export URL for a recognised Google Sheets link. The input is
        returned unchanged when it is not a Google Sheets link, when it lacks
        a document id, or when it already points at a CSV download (so that
        selectors such as ``range=`` or ``sheet=`` are not lost).
    """
    parsed = urlparse(url)
    # ``hostname`` is lower-cased and excludes any port or userinfo, so
    # ``docs.google.com:443`` is recognised as well.
    if (parsed.hostname or "") not in _GOOGLE_SHEETS_HOSTS:
        return url

    path_parts = [part for part in parsed.path.split("/") if part]

    # Signed-in browsers insert an account selector:
    # /spreadsheets/u/<n>/d/<sheet-id>/...
    if len(path_parts) >= 3 and path_parts[:2] == ["spreadsheets", "u"]:
        path_parts = ["spreadsheets", *path_parts[3:]]

    if len(path_parts) < 3 or path_parts[:2] != ["spreadsheets", "d"]:
        return url

    # Published sheets use /spreadsheets/d/e/<published-id>/pubhtml and export
    # as CSV from the sibling /pub endpoint with output=csv.
    published = path_parts[2] == "e"
    if published and len(path_parts) < 4:
        # "/spreadsheets/d/e" without a published id is not a usable link.
        return url

    query = parse_qs(parsed.query)
    endpoint = path_parts[4:] if published else path_parts[3:]
    if _is_direct_csv_endpoint(endpoint, query):
        return url

    gid_values = query.get("gid") or parse_qs(parsed.fragment).get("gid")
    gid = gid_values[0] if gid_values else None

    if published:
        csv_path = f"/spreadsheets/d/e/{path_parts[3]}/pub"
        csv_query = {"output": "csv"}
    else:
        csv_path = f"/spreadsheets/d/{path_parts[2]}/export"
        csv_query = {"format": "csv"}
    if gid is not None:
        csv_query["gid"] = gid

    return urlunparse(
        parsed._replace(path=csv_path, query=urlencode(csv_query), fragment="")
    )


def fetch_csv_content(url: str) -> str:
    """Fetch CSV content from a URL.

    Google Sheets links are first converted to their CSV export form.

    Args:
        url: Location of the CSV resource, or a public Google Sheets link.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        httpx.HTTPStatusError: If the final response is not successful.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    import httpx

    url = normalize_google_sheets_csv_url(url)
    # httpx does not follow redirects unless asked to, and raise_for_status()
    # raises on an unfollowed 3xx. Google Sheets export endpoints are expected
    # to answer with a redirect to the download host, so follow it.
    response = httpx.get(url, follow_redirects=True)
    response.raise_for_status()  # Raise exception for bad responses
    return response.content.decode("utf-8")