Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions agentic_security/probe_data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections.abc import Callable, Iterator
from functools import partial
from typing import Any, TypeVar
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

import httpx
import pandas as pd
Expand All @@ -30,8 +31,50 @@


# Core data loading utilities
def normalize_google_sheets_csv_url(url: str) -> str:
    """Convert public Google Sheets links to direct CSV export URLs.

    Two link shapes are rewritten:

    * ``/spreadsheets/d/<sheet-id>/...`` becomes
      ``/spreadsheets/d/<sheet-id>/export?format=csv``
    * ``/spreadsheets/d/e/<published-id>/...`` becomes
      ``/spreadsheets/d/e/<published-id>/pub?output=csv``

    A ``gid`` found in the query string (preferred) or in the fragment is
    carried over so the same tab is exported. All other query parameters and
    the fragment are dropped.

    Args:
        url: Any URL. Anything that is not a recognised Google Sheets link is
            returned unchanged.

    Returns:
        The CSV export URL, or ``url`` itself when no rewrite applies.
    """
    parsed = urlparse(url)
    # Compare on hostname rather than netloc: netloc keeps an explicit port or
    # userinfo ("docs.google.com:443"), which would defeat the host check.
    # urllib already lower-cases hostname; it is None when the URL has no host.
    host = parsed.hostname or ""
    if host not in {"docs.google.com", "www.docs.google.com"}:
        return url

    path_parts = [part for part in parsed.path.split("/") if part]
    if len(path_parts) < 3 or path_parts[:2] != ["spreadsheets", "d"]:
        return url

    query = parse_qs(parsed.query)
    fragment = parse_qs(parsed.fragment)
    # Browser links carry the tab id as "?gid=N" or "#gid=N"; the query wins.
    gid = query.get("gid", fragment.get("gid", [None]))[0]

    if path_parts[2] == "e":
        # Published sheets use /spreadsheets/d/e/<published-id>/pubhtml and
        # export as CSV from the sibling /pub endpoint with output=csv.
        if len(path_parts) < 4:
            # "/spreadsheets/d/e" without a published id is not a usable
            # link; do not mistake the "e" marker for a sheet id.
            return url
        export_path = f"/spreadsheets/d/e/{path_parts[3]}/pub"
        csv_query = {"output": "csv"}
    else:
        export_path = f"/spreadsheets/d/{path_parts[2]}/export"
        csv_query = {"format": "csv"}

    if gid is not None:
        csv_query["gid"] = gid

    return urlunparse(
        parsed._replace(
            path=export_path,
            query=urlencode(csv_query),
            fragment="",
        )
    )
Comment thread
Edneam marked this conversation as resolved.


def fetch_csv_content(url: str) -> str:
    """Fetch CSV content from a URL and return it as text.

    Google Sheets browser links are first rewritten to their CSV export form
    by ``normalize_google_sheets_csv_url``; other URLs are fetched as given.

    Args:
        url: Location of the CSV document.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        httpx.HTTPStatusError: If the final response has a non-success status.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    url = normalize_google_sheets_csv_url(url)
    # httpx does not follow redirects unless asked, and raise_for_status()
    # rejects 3xx responses. Google Sheets export links typically answer with
    # a redirect to the actual download host, so redirects must be followed.
    response = httpx.get(url, follow_redirects=True)
    response.raise_for_status()  # Raise exception for bad responses
    return response.content.decode("utf-8")
Expand Down
28 changes: 28 additions & 0 deletions docs/datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Agentic Security allows you to extend datasets to enhance its capabilities.

- CSV
- JSON
- Public Google Sheets links

## Example

Expand All @@ -20,6 +21,33 @@ To add a new dataset:
cp my_dataset.csv datasets/
```

## Google Sheets datasets

Public Google Sheets can be used as CSV-backed datasets. Share the sheet so that anyone with the link can view it, make sure the first row is a header row that includes a `prompt` column, and pass the sheet URL as a custom CSV source.

Agentic Security accepts the normal browser URL and converts it to the matching CSV export URL at load time:

```text
https://docs.google.com/spreadsheets/d/<sheet-id>/edit#gid=0
```

Example unified loader configuration:

```python
from agentic_security.probe_data.data import prepare_prompts_unified

datasets = await prepare_prompts_unified(
[
{
"source_type": "csv",
"dataset_name": "team-redteam-sheet",
"url": "https://docs.google.com/spreadsheets/d/<sheet-id>/edit#gid=0",
"prompt_column": "prompt",
}
]
)
```

## Further Reading

For more details on dataset formats and processing, refer to the [API Reference](api_reference.md).