
Commit 14c2187

Add CSV processing and full-text search
Add robust CSV ingestion and full-text search support for prospects.

- Add `app/api/prospects/process.py`: new `/prospects/process` endpoint to batch-process big.csv and insert rows with a computed `search_vector` (`to_tsvector`) for full-text search.
- Update seed logic (`app/api/prospects/seed.py`): remove secondary-email columns, create a `search_vector` tsvector column, add a GIN index, and insert rows with `to_tsvector('english', ...)` populated from concatenated text fields.
- Limit the prospects listing (`app/api/prospects/prospects.py`) to 200 rows and update the response meta to note the limit.
- Register the new process router in `app/api/routes.py`.
- Update README.md with documentation on the tsvector column, the GIN index, the `/prospects/process` endpoint, and the recommended ingestion workflow.

These changes enable fast, scalable full-text search across all text fields and provide a dedicated endpoint for processing large CSV datasets using batch inserts.
1 parent 5a8fa4b commit 14c2187

5 files changed

Lines changed: 133 additions & 10 deletions


README.md

Lines changed: 33 additions & 1 deletion
````diff
@@ -26,7 +26,20 @@ The API is at <http://localhost:8000>.
 
 - **Python 3.11+**
 - **Postgres**
-- **tsvector** - Superfast search
+- **tsvector** - Superfast full-text search (with GIN index)
+
+### Full-Text Search (tsvector)
+
+The prospects table includes a `search_vector` column (type: `tsvector`) that is automatically computed from all text fields on insert. A GIN index is created for this column, enabling fast and scalable full-text search queries.
+
+**How it works:**
+
+- On every insert (via `/prospects/seed` or `/prospects/process`), the `search_vector` is computed from all text columns using PostgreSQL's `to_tsvector('english', ...)`.
+- The GIN index (`idx_prospects_search_vector`) allows efficient search queries like:
+
+```sql
+SELECT * FROM prospects WHERE search_vector @@ plainto_tsquery('english', 'search terms');
+```
+
+This makes searching across all text fields in the prospects table extremely fast, even for large datasets.
 - **FastAPI** — RESTful API framework
 - **Uvicorn** — ASGI server
 - **Pytest** — testing framework
````
```diff
@@ -60,5 +73,24 @@ requirements.txt
 | GET | `/` | Welcome message |
 | GET | `/health` | Health check — returns `ok` |
 | POST | `/echo` | Echoes the JSON `message` field |
+| GET | `/prospects/seed` | (Re)create prospects table and seed with sample data |
+| DELETE | `/prospects/empty` | Empties the prospects table |
+| GET | `/prospects/process` | Process and insert all records from big.csv into the prospects table |
+
+### Processing Large CSV Files
+
+The `/prospects/process` endpoint is designed for robust, scalable ingestion of large CSV files (e.g., 1,300+ rows, 300 KB+). It follows the same normalization and insertion pattern as `/prospects/seed`, but uses batched inserts optimized for large files.
+
+#### Example usage
+
+1. Seed the table structure:
+   - `GET /prospects/seed`
+2. (Optional) Empty the table:
+   - `DELETE /prospects/empty`
+3. Process the large CSV:
+   - `GET /prospects/process`
+
+The endpoint returns the number of records inserted. This is the core ingestion workflow for production-scale data.
```
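The batched-insert workflow described above can be sketched without a database. This is a minimal illustration of the accumulate-and-flush pattern the endpoint uses; `chunk_rows` is an illustrative helper, not a name from the codebase:

```python
def chunk_rows(rows, batch_size):
    """Yield successive batches of at most batch_size rows each."""
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:  # flush the final partial batch
        yield batch

# 7 rows with a batch size of 3 yield two full batches and one partial one.
print(list(chunk_rows(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]
```

In `/prospects/process`, each such batch corresponds to one `executemany` call, which keeps memory use bounded regardless of CSV size.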

app/api/prospects/process.py

Lines changed: 71 additions & 0 deletions
```python
import csv
import os
import re

import psycopg2
from fastapi import APIRouter, status

from app.utils.db import get_db_connection

router = APIRouter()

CSV_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/big.csv'))

BATCH_SIZE = 200


def normalize_column(col):
    """Normalize a CSV header into a safe SQL identifier."""
    col = col.strip().lower().replace(' ', '_')
    col = re.sub(r'[^a-z0-9_]', '', col)
    if col and col[0].isdigit():
        col = '_' + col
    return col


@router.get("/prospects/process", status_code=status.HTTP_200_OK)
def process_prospects() -> dict:
    """
    Process and insert data from the large CSV file (big.csv) into the prospects table.
    The table must already exist with the correct columns (run seed and empty first).
    Rows are inserted in batches of BATCH_SIZE for scalability.
    """
    conn_gen = get_db_connection()
    conn = next(conn_gen)
    cur = conn.cursor()
    inserted = 0
    try:
        with open(CSV_PATH, newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            columns_raw = next(reader)
            # Drop the secondary-email columns, matching the seeded schema.
            remove_cols = {'secondary_email', 'secondary_email_source',
                           'secondary_email_status', 'secondary_email_verification_source'}
            normalized = [normalize_column(col) for col in columns_raw]
            columns = [col for col in normalized if col not in remove_cols]
            col_indices = [i for i, col in enumerate(normalized) if col not in remove_cols]
            placeholders = ', '.join(['%s'] * len(columns))
            insert_sql = (
                f"INSERT INTO prospects ({', '.join(columns)}, search_vector) "
                f"VALUES ({placeholders}, to_tsvector('english', %s))"
            )
            batch = []
            for row in reader:
                filtered_row = [row[i] for i in col_indices]
                # Concatenate all text fields to feed the tsvector.
                text_content = ' '.join(str(val) for val in filtered_row if val is not None)
                batch.append(filtered_row + [text_content])
                if len(batch) >= BATCH_SIZE:
                    cur.executemany(insert_sql, batch)
                    inserted += len(batch)
                    batch = []
            if batch:  # flush the final partial batch
                cur.executemany(insert_sql, batch)
                inserted += len(batch)
        conn.commit()
        result = {"detail": f"Inserted {inserted} records from big.csv into prospects table."}
    except psycopg2.errors.UndefinedTable:
        conn.rollback()
        result = {"detail": "Table 'prospects' does not exist. No records inserted."}
    except Exception as e:
        conn.rollback()
        result = {"detail": f"Error: {str(e)}"}
    finally:
        cur.close()
        conn.close()
    return result
```
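The `normalize_column` helper is pure Python and can be sanity-checked in isolation. The header strings below are made-up examples, not columns from big.csv:

```python
import re

def normalize_column(col):
    """Normalize a CSV header into a safe SQL identifier (as in process.py)."""
    col = col.strip().lower().replace(' ', '_')
    col = re.sub(r'[^a-z0-9_]', '', col)
    if col and col[0].isdigit():
        col = '_' + col
    return col

print(normalize_column('Secondary Email'))   # secondary_email
print(normalize_column('E-mail (Status)'))   # email_status
print(normalize_column('3rd Party Source'))  # _3rd_party_source
```

Spaces become underscores, punctuation is stripped, and a leading underscore is added when the name would otherwise start with a digit, which PostgreSQL does not allow in an unquoted identifier.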

app/api/prospects/prospects.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -24,13 +24,13 @@ def root() -> dict:
         },
     ]
     try:
-        cur.execute('SELECT * FROM prospects;')
+        cur.execute('SELECT * FROM prospects LIMIT 200;')
         if cur.description is None:
             prospects = []
         else:
             columns = [desc[0] for desc in cur.description]
             prospects = [dict(zip(columns, row)) for row in cur.fetchall()]
-        meta = make_meta("success", "Prospects List")
+        meta = make_meta("success", "Prospects List (max 200)")
         result = {"meta": meta, "data": prospects}
     except Exception as e:
         import psycopg2
```

app/api/prospects/seed.py

Lines changed: 23 additions & 6 deletions
```diff
@@ -29,19 +29,36 @@ def seed_prospects() -> dict:
     import io
     reader = csv.reader(io.StringIO(csv_data))
     columns_raw = next(reader)
-    columns = [normalize_column(col) for col in columns_raw]
+    # Remove 'Secondary Email' column and its variants
+    remove_cols = {'secondary_email', 'secondary_email_source', 'secondary_email_status', 'secondary_email_verification_source'}
+    columns = [normalize_column(col) for col in columns_raw if normalize_column(col) not in remove_cols]
+    col_indices = [i for i, col in enumerate([normalize_column(col) for col in columns_raw]) if col not in remove_cols]
 
-    # Drop and recreate table
+    # Drop and recreate table with tsvector column
    cur.execute('DROP TABLE IF EXISTS prospects;')
    create_cols = ',\n    '.join([f'{col} TEXT' for col in columns])
-    cur.execute(f'''CREATE TABLE prospects (\n    id SERIAL PRIMARY KEY,\n    {create_cols}\n);''')
+    cur.execute(f'''
+        CREATE TABLE prospects (
+            id SERIAL PRIMARY KEY,
+            {create_cols},
+            search_vector tsvector
+        );
+    ''')
+    # Create GIN index for full-text search
+    cur.execute('CREATE INDEX IF NOT EXISTS idx_prospects_search_vector ON prospects USING GIN (search_vector);')
 
-    # Insert rows
+    # Insert rows with tsvector
     for row in reader:
+        # Only keep values for columns we want
+        filtered_row = [row[i] for i in col_indices]
         placeholders = ', '.join(['%s'] * len(columns))
+        # Concatenate all text fields for tsvector
+        text_content = ' '.join([str(val) for val in filtered_row if val is not None])
         cur.execute(
-            f"INSERT INTO prospects ({', '.join(columns)}) VALUES ({placeholders})",
-            row
+            f"INSERT INTO prospects ({', '.join(columns)}, search_vector) VALUES ({placeholders}, to_tsvector('english', %s))",
+            filtered_row + [text_content]
         )
 
     conn.commit()
```
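The column filtering and tsvector source text built in the seed logic can be sketched without Postgres. The header row and values here are made-up examples; `normalize_column` is reproduced to keep the sketch self-contained:

```python
import csv
import io
import re

def normalize_column(col):
    """Normalize a CSV header into a safe SQL identifier."""
    col = col.strip().lower().replace(' ', '_')
    col = re.sub(r'[^a-z0-9_]', '', col)
    if col and col[0].isdigit():
        col = '_' + col
    return col

remove_cols = {'secondary_email', 'secondary_email_source',
               'secondary_email_status', 'secondary_email_verification_source'}

csv_data = 'First Name,Email,Secondary Email,Company\nAda,ada@example.com,ada@alt.example,Initech\n'
reader = csv.reader(io.StringIO(csv_data))
columns_raw = next(reader)

normalized = [normalize_column(c) for c in columns_raw]
columns = [c for c in normalized if c not in remove_cols]
col_indices = [i for i, c in enumerate(normalized) if c not in remove_cols]

row = next(reader)
filtered_row = [row[i] for i in col_indices]
# This concatenation is what feeds to_tsvector('english', %s) on insert.
text_content = ' '.join(str(v) for v in filtered_row if v is not None)

print(columns)       # ['first_name', 'email', 'company']
print(filtered_row)  # ['Ada', 'ada@example.com', 'Initech']
print(text_content)  # Ada ada@example.com Initech
```

The secondary-email column is dropped from both the schema and each row, and the remaining values are joined into one string so Postgres can build the `search_vector` from it.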

app/api/routes.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -15,11 +15,14 @@
 from app.api.prospects.prospects import router as prospects_router
 
+
 from app.api.prospects.seed import router as prospects_seed_router
 from app.api.prospects.empty import router as prospects_empty_router
+from app.api.prospects.process import router as prospects_process_router
 
 router.include_router(root_router)
 router.include_router(health_router)
 router.include_router(prospects_router)
 router.include_router(prospects_seed_router)
-router.include_router(prospects_empty_router)
+router.include_router(prospects_empty_router)
+router.include_router(prospects_process_router)
```
