Skip to content

Commit 9a2ae14

Browse files
committed
added stac enhancements, r2r support, pmtiles
1 parent b4e7f61 commit 9a2ae14

15 files changed

Lines changed: 1322 additions & 81 deletions

.github/workflows/ci.yml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Continuous integration: lint, type-check and test on pushes and pull
# requests targeting main.
name: CI
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  build-test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install dependencies
        # No `|| true` on the dev requirements: the steps below need the
        # tools they provide, so a failed install should stop the job here
        # instead of surfacing later as a "command not found" error.
        run: |
          pip install --upgrade pip
          pip install -e oceanstream
          pip install -r requirements-dev.txt
      - name: Lint (ruff)
        run: ruff check .
      - name: Type check (mypy)
        run: mypy oceanstream
      - name: Tests with coverage
        run: pytest --cov=oceanstream --cov-report=xml --cov-report=term
      - name: Upload coverage artifact
        uses: actions/upload-artifact@v4
        with:
          name: coverage-xml
          path: coverage.xml

.github/workflows/publish.yml

Whitespace-only changes.

CONTRIBUTING.md

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,44 @@ Thank you for your interest in contributing!
55
## Getting started
66
- Use Python 3.12+
77
- Create a virtual environment and install dev deps:
8-
- `pip install -r requirements.txt -r requirements-dev.txt`
9-
- `pip install -e oceanstream`
10-
- Run tests: `pytest -q`
8+
```bash
9+
python -m venv venv
10+
source venv/bin/activate # On Windows: venv\Scripts\activate
11+
pip install -r requirements.txt -r requirements-dev.txt
12+
pip install -e oceanstream
13+
```
14+
- **Always activate the project venv before running any commands:**
15+
```bash
16+
source venv/bin/activate # On Windows: venv\Scripts\activate
17+
```
18+
19+
## Running tests
20+
**IMPORTANT:** Always use the project venv when running tests or performing other tasks.
21+
22+
```bash
23+
# Activate venv first
24+
source venv/bin/activate
25+
26+
# Run all tests using make (recommended)
27+
make test
28+
29+
# Or run tests directly with pytest
30+
./venv/bin/python -m pytest oceanstream/tests/
31+
32+
# Run with verbose output
33+
./venv/bin/python -m pytest oceanstream/tests/ -v
34+
35+
# Run specific test file
36+
./venv/bin/python -m pytest oceanstream/tests/unit/test_csv_reader.py
37+
38+
# Run with coverage
39+
./venv/bin/python -m pytest oceanstream/tests/ --cov=oceanstream
40+
41+
# Show reasons for skipped tests
42+
./venv/bin/python -m pytest oceanstream/tests/ -rs
43+
```
44+
45+
## Other development commands
1146
- Lint/type-check: `ruff check .` and `mypy oceanstream`
1247
- Optional: install pre-commit hooks: `pre-commit install`
1348

oceanstream/cli.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,15 +54,14 @@ def process_callback(
5454
help="Convert CSV files into standardized GeoParquet datasets (and optionally PMTiles).",
5555
)
5656
def convert_command(
57-
input_dir: Path = typer.Option(
57+
input_source: Path = typer.Option(
5858
Path("raw_data"),
5959
exists=True,
60-
file_okay=False,
61-
help="Directory containing input CSV files (default: ./raw_data).",
60+
help="Path to a CSV file or directory containing CSV files (default: ./raw_data).",
6261
),
6362
output_dir: Path = typer.Option(
6463
Path("out/geoparquet"),
65-
help="Output directory for the partitioned GeoParquet dataset (ignored with --list-columns/--print-schema).",
64+
help="Base output directory for the partitioned GeoParquet dataset (campaign-based subdirectories will be created).",
6665
),
6766
upload: bool = typer.Option(False, help="Upload processed dataset to cloud storage (future)."),
6867
verbose: bool = typer.Option(False, "-v", help="Emit detailed progress information."),
@@ -79,6 +78,12 @@ def convert_command(
7978
pmtiles_time_gap: int = typer.Option(60, help="Time gap in minutes to split track segments for PMTiles."),
8079
pmtiles_include_measurements: bool = typer.Option(True, help="Include oceanographic measurements in PMTiles."),
8180
pmtiles_measurement_columns: list[str] = typer.Option(None, help="Specific measurement columns to include (defaults to auto-selected important ones)."),
81+
campaign_id: str = typer.Option(None, help="Campaign/cruise identifier (REQUIRED - provide if not auto-detected from filenames/metadata)."),
82+
platform_id: str = typer.Option(None, help="Platform identifier (overrides auto-detection from filenames)."),
83+
attribution: str = typer.Option(None, help="Data attribution/citation (overrides provider/file metadata)."),
84+
creation_date: str = typer.Option(None, help="Data creation date in ISO 8601 format (overrides provider/file metadata)."),
85+
source_dataset: str = typer.Option(None, help="Source dataset DOI (overrides provider/file metadata)."),
86+
source_repository: str = typer.Option(None, help="Source repository DOI (overrides provider/file metadata)."),
8287
) -> None:
8388
global _provider_obj
8489
provider_obj = _provider_obj
@@ -93,7 +98,7 @@ def convert_command(
9398
try:
9499
geotrack.convert(
95100
provider=provider_obj,
96-
input_dir=input_dir,
101+
input_source=input_source,
97102
output_dir=output_dir,
98103
verbose=verbose,
99104
list_columns=list_columns,
@@ -110,6 +115,12 @@ def convert_command(
110115
pmtiles_time_gap=pmtiles_time_gap,
111116
pmtiles_include_measurements=pmtiles_include_measurements,
112117
pmtiles_measurement_columns=pmtiles_measurement_columns,
118+
campaign_id=campaign_id,
119+
platform_id=platform_id,
120+
attribution=attribution,
121+
creation_date=creation_date,
122+
source_dataset=source_dataset,
123+
source_repository=source_repository,
113124
)
114125
except FileNotFoundError as e:
115126
typer.echo(f"[geotrack] ERROR: {e}")

oceanstream/geotrack/csv_reader.py

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,108 @@
11
import os
2-
from typing import List
2+
from typing import List, Dict, Tuple, Optional
3+
from pathlib import Path
34

45
import pandas as pd
56

67

8+
def is_geocsv(file_path: str | Path) -> bool:
9+
"""
10+
Detect if a file is GeoCSV format by checking for metadata headers.
11+
12+
GeoCSV files start with lines beginning with '#' containing metadata.
13+
14+
Args:
15+
file_path: Path to file to check
16+
17+
Returns:
18+
True if file appears to be GeoCSV format
19+
"""
20+
try:
21+
with open(file_path, 'r', encoding='utf-8') as f:
22+
first_line = f.readline().strip()
23+
# GeoCSV files start with # metadata headers
24+
return first_line.startswith('#')
25+
except Exception:
26+
return False
27+
28+
29+
def parse_geocsv_metadata(lines: List[str]) -> Dict[str, str]:
    """
    Parse GeoCSV metadata headers into a dictionary.

    GeoCSV metadata lines start with '#' followed by key: value pairs.
    Parsing stops at the first line that does not start with '#'.
    Header lines without a ':' or with an empty key are ignored. Values
    keep any further ':' characters (e.g. URLs). When a key repeats, the
    last value wins.

    Args:
        lines: List of lines from the file (including metadata headers)

    Returns:
        Dictionary of metadata key-value pairs
    """
    metadata: Dict[str, str] = {}

    for raw_line in lines:
        # A UTF-8 byte-order mark is not whitespace, so strip() alone would
        # leave it in front of the '#' and end the header prematurely.
        line = raw_line.lstrip('\ufeff').strip()
        if not line.startswith('#'):
            break  # End of metadata section

        # Split on the first ':' only; the rest belongs to the value.
        key, separator, value = line[1:].partition(':')
        key = key.strip()
        if separator and key:
            metadata[key] = value.strip()

    return metadata
57+
58+
59+
def read_geocsv(file_path: str | Path) -> Tuple[pd.DataFrame, Dict[str, str]]:
    """
    Read a GeoCSV file, parsing metadata headers and data.

    GeoCSV format specifications:
    - Metadata lines start with '#' and contain key: value pairs
    - Common metadata keys: dataset, cruise_id, field_unit, field_type,
      field_standard_name, source_repository, source_event, source_dataset
    - Data follows after metadata headers

    The file is decoded as ``utf-8-sig`` so a leading byte-order mark does
    not hide the first '#' line. Only the header is scanned in Python; the
    table itself is read once by pandas.

    Args:
        file_path: Path to GeoCSV file

    Returns:
        Tuple of (DataFrame with data, dict with metadata). If the file
        contains no line after the metadata header, the DataFrame is empty.
    """
    file_path = Path(file_path)

    # Collect the leading '#' lines; stop at the first non-'#' line, which
    # is the column header row of the data section.
    header_lines: List[str] = []
    has_data_section = False
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            if not line.strip().startswith('#'):
                has_data_section = True
                break
            header_lines.append(line)

    # Parse metadata from header lines
    metadata = parse_geocsv_metadata(header_lines)

    if not has_data_section:
        # Nothing but metadata (or an empty file): skipping zero rows here
        # would make pandas parse the '#' lines as the table.
        return pd.DataFrame(), metadata

    # Read CSV data (pandas will handle column headers automatically)
    df = pd.read_csv(
        file_path,
        skiprows=len(header_lines),
        encoding='utf-8-sig',
        on_bad_lines='skip',
        low_memory=False,
    )

    # Basic cleanup: normalise textual null markers and blank cells to NA
    df = df.replace(to_replace=["nan", "NaN", "NULL", "None"], value=pd.NA)
    df = df.replace(r"^\s*$", pd.NA, regex=True)

    return df, metadata
104+
105+
7106
def read_csv_files(raw_data_folder: str) -> pd.DataFrame:
8107
csv_files = [f for f in os.listdir(raw_data_folder) if f.endswith('.csv')]
9108
data_frames: List[pd.DataFrame] = []

0 commit comments

Comments
 (0)