OceanStreamIO
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 31 additions & 0 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎.github/workflows/publish.yml‎ b/‎.github/workflows/publish.yml‎
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 38 additions & 3 deletions b/‎CONTRIBUTING.md‎
Lines changed: 38 additions & 3 deletions
diff --git a/‎oceanstream/cli.py‎
Lines changed: 16 additions & 5 deletions b/‎oceanstream/cli.py‎
Lines changed: 16 additions & 5 deletions
diff --git a/‎oceanstream/geotrack/csv_reader.py‎
Lines changed: 100 additions & 1 deletion b/‎oceanstream/geotrack/csv_reader.py‎
Lines changed: 100 additions & 1 deletion
@@ -0,0 +1,31 @@
+# Continuous integration workflow: installs the package, then lints,
+# type-checks and tests it, and publishes the coverage report as an artifact.
+name: CI
+# Runs for pushes to main and for pull requests that target main.
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  build-test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      # NOTE(review): the `|| true` below lets the job continue even when
+      # installing requirements-dev.txt fails. The ruff, mypy and pytest steps
+      # presumably depend on those dev requirements - confirm the fallback is
+      # intentional.
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e oceanstream
+          pip install -r requirements-dev.txt || true
+      - name: Lint (ruff)
+        run: ruff check .
+      - name: Type check (mypy)
+        run: mypy oceanstream
+      # The XML coverage report requested here is the file uploaded by the
+      # next step (coverage.xml); the terminal report goes to the job log.
+      - name: Tests with coverage
+        run: pytest --cov=oceanstream --cov-report=xml --cov-report=term
+      - name: Upload coverage artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-xml
+          path: coverage.xml
@@ -5,9 +5,44 @@ Thank you for your interest in contributing!
 ## Getting started
 - Use Python 3.12+
 - Create a virtual environment and install dev deps:
-  - `pip install -r requirements.txt -r requirements-dev.txt`
-  - `pip install -e oceanstream`
-- Run tests: `pytest -q`
+  ```bash
+  python -m venv venv
+  source venv/bin/activate  # On Windows: venv\Scripts\activate
+  pip install -r requirements.txt -r requirements-dev.txt
+  pip install -e oceanstream
+  ```
+- **Always activate the project venv before running any commands:**
+  ```bash
+  source venv/bin/activate  # On Windows: venv\Scripts\activate
+  ```
+
+## Running tests
+**IMPORTANT:** Always use the project venv when running tests or performing other tasks.
+
+```bash
+# Activate venv first
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Run all tests using make (recommended)
+make test
+
+# Or run tests directly with pytest
+./venv/bin/python -m pytest oceanstream/tests/
+
+# Run with verbose output
+./venv/bin/python -m pytest oceanstream/tests/ -v
+
+# Run specific test file
+./venv/bin/python -m pytest oceanstream/tests/unit/test_csv_reader.py
+
+# Run with coverage
+./venv/bin/python -m pytest oceanstream/tests/ --cov=oceanstream
+
+# Show reasons for skipped tests
+./venv/bin/python -m pytest oceanstream/tests/ -rs
+```
+
+## Other development commands
 - Lint/type-check: `ruff check .` and `mypy oceanstream`
 - Optional: install pre-commit hooks: `pre-commit install`
 
 
@@ -54,15 +54,14 @@ def process_callback(
         help="Convert CSV files into standardized GeoParquet datasets (and optionally PMTiles).",
     )
     def convert_command(
-        input_dir: Path = typer.Option(
+        input_source: Path = typer.Option(
             Path("raw_data"),
             exists=True,
-            file_okay=False,
-            help="Directory containing input CSV files (default: ./raw_data).",
+            help="Path to a CSV file or directory containing CSV files (default: ./raw_data).",
         ),
         output_dir: Path = typer.Option(
             Path("out/geoparquet"),
-            help="Output directory for the partitioned GeoParquet dataset (ignored with --list-columns/--print-schema).",
+            help="Base output directory for the partitioned GeoParquet dataset (campaign-based subdirectories will be created).",
         ),
         upload: bool = typer.Option(False, help="Upload processed dataset to cloud storage (future)."),
         verbose: bool = typer.Option(False, "-v", help="Emit detailed progress information."),
@@ -79,6 +78,12 @@ def convert_command(
         pmtiles_time_gap: int = typer.Option(60, help="Time gap in minutes to split track segments for PMTiles."),
         pmtiles_include_measurements: bool = typer.Option(True, help="Include oceanographic measurements in PMTiles."),
         pmtiles_measurement_columns: list[str] = typer.Option(None, help="Specific measurement columns to include (defaults to auto-selected important ones)."),
+        campaign_id: str = typer.Option(None, help="Campaign/cruise identifier (REQUIRED - provide if not auto-detected from filenames/metadata)."),
+        platform_id: str = typer.Option(None, help="Platform identifier (overrides auto-detection from filenames)."),
+        attribution: str = typer.Option(None, help="Data attribution/citation (overrides provider/file metadata)."),
+        creation_date: str = typer.Option(None, help="Data creation date in ISO 8601 format (overrides provider/file metadata)."),
+        source_dataset: str = typer.Option(None, help="Source dataset DOI (overrides provider/file metadata)."),
+        source_repository: str = typer.Option(None, help="Source repository DOI (overrides provider/file metadata)."),
     ) -> None:
         global _provider_obj
         provider_obj = _provider_obj
@@ -93,7 +98,7 @@ def convert_command(
         try:
             geotrack.convert(
                 provider=provider_obj,
-                input_dir=input_dir,
+                input_source=input_source,
                 output_dir=output_dir,
                 verbose=verbose,
                 list_columns=list_columns,
@@ -110,6 +115,12 @@ def convert_command(
                 pmtiles_time_gap=pmtiles_time_gap,
                 pmtiles_include_measurements=pmtiles_include_measurements,
                 pmtiles_measurement_columns=pmtiles_measurement_columns,
+                campaign_id=campaign_id,
+                platform_id=platform_id,
+                attribution=attribution,
+                creation_date=creation_date,
+                source_dataset=source_dataset,
+                source_repository=source_repository,
             )
         except FileNotFoundError as e:
             typer.echo(f"[geotrack] ERROR: {e}")
 
@@ -1,9 +1,108 @@
 import os
-from typing import List
+from typing import List, Dict, Tuple, Optional
+from pathlib import Path
 
 import pandas as pd
 
 
+def is_geocsv(file_path: str | Path) -> bool:
+    """
+    Detect if a file is GeoCSV format by checking for metadata headers.
+
+    Only the first line of the file is inspected: the file is treated as
+    GeoCSV when that line, with surrounding whitespace removed, begins
+    with '#'. The file is decoded as 'utf-8-sig' so that a leading UTF-8
+    byte-order mark does not hide the '#'.
+
+    Args:
+        file_path: Path to file to check
+
+    Returns:
+        True if file appears to be GeoCSV format. False otherwise, including
+        when the file cannot be opened or its first line is not valid UTF-8.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8-sig') as f:
+            first_line = f.readline()
+    except (OSError, ValueError):
+        # Best-effort probe: a missing or unreadable file (OSError) or one
+        # that is not UTF-8 text (UnicodeDecodeError is a ValueError) is
+        # simply "not GeoCSV" rather than an error for the caller.
+        return False
+
+    # GeoCSV files start with # metadata headers
+    return first_line.strip().startswith('#')
+
+
+def parse_geocsv_metadata(lines: List[str]) -> Dict[str, str]:
+    """
+    Collect the leading '#' header lines of a GeoCSV file into a mapping.
+
+    Each line is stripped of surrounding whitespace. Scanning stops at the
+    first line that does not begin with '#'. For every header line the text
+    after '#' is split at its first ':' into a key and a value, both
+    stripped; header lines without a ':' are ignored. When a key appears
+    more than once, the last occurrence wins.
+
+    Args:
+        lines: List of lines from the file (including metadata headers)
+
+    Returns:
+        Dictionary of metadata key-value pairs
+    """
+    parsed: Dict[str, str] = {}
+
+    for raw in lines:
+        text = raw.strip()
+        if text[:1] != '#':
+            # The first non-header line closes the metadata section.
+            break
+
+        # Drop the '#' marker, then split on the first ':' only so that
+        # values may themselves contain colons (e.g. URLs, timestamps).
+        name, separator, remainder = text[1:].strip().partition(':')
+        if separator:
+            parsed[name.strip()] = remainder.strip()
+
+    return parsed
+
+
+def read_geocsv(file_path: str | Path) -> Tuple[pd.DataFrame, Dict[str, str]]:
+    """
+    Read a GeoCSV file, parsing metadata headers and data.
+
+    GeoCSV format specifications:
+    - Metadata lines start with '#' and contain key: value pairs
+    - Common metadata keys: dataset, cruise_id, field_unit, field_type,
+      field_standard_name, source_repository, source_event, source_dataset
+    - Data follows after metadata headers
+
+    The file is decoded as 'utf-8-sig' so a leading UTF-8 byte-order mark
+    does not hide the first '#'. Only the header is scanned in Python; the
+    data section is read by pandas. Malformed data rows are skipped, and
+    the strings "nan", "NaN", "NULL", "None" as well as empty or
+    whitespace-only strings are replaced with pd.NA.
+
+    Args:
+        file_path: Path to GeoCSV file
+
+    Returns:
+        Tuple of (DataFrame with data, dict with metadata). When the file
+        contains no data section (it is empty, or holds only '#' lines and
+        blank lines) the DataFrame is empty.
+    """
+    file_path = Path(file_path)
+
+    # Scan only the header: stop at the first line that is neither blank
+    # nor a '#' line. That line holds the CSV column names.
+    header_lines: List[str] = []
+    data_start_line: Optional[int] = None
+    with open(file_path, 'r', encoding='utf-8-sig') as f:
+        for index, line in enumerate(f):
+            stripped = line.strip()
+            if stripped and not stripped.startswith('#'):
+                data_start_line = index
+                break
+            header_lines.append(line)
+
+    # Parse metadata from header lines
+    metadata = parse_geocsv_metadata(header_lines)
+
+    if data_start_line is None:
+        # No column header row exists; handing the file to pandas would
+        # misread the '#' lines as CSV content.
+        return pd.DataFrame(), metadata
+
+    # Read CSV data (pandas will handle column headers automatically)
+    df = pd.read_csv(
+        file_path,
+        skiprows=data_start_line,
+        encoding='utf-8-sig',
+        on_bad_lines='skip',
+        low_memory=False,
+    )
+
+    # Basic cleanup
+    df = df.replace(to_replace=["nan", "NaN", "NULL", "None"], value=pd.NA)
+    df = df.replace(r"^\s*$", pd.NA, regex=True)
+
+    return df, metadata
+
+
 def read_csv_files(raw_data_folder: str) -> pd.DataFrame:
     csv_files = [f for f in os.listdir(raw_data_folder) if f.endswith('.csv')]
     data_frames: List[pd.DataFrame] = []