diff --git a/examples/LLM_Workflows/neo4j_graph_rag/README.md b/examples/LLM_Workflows/neo4j_graph_rag/README.md index 848d8d504..3d81404ac 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/README.md +++ b/examples/LLM_Workflows/neo4j_graph_rag/README.md @@ -202,4 +202,4 @@ neo4j_graph_rag/ │ └── rag_dag.png └── data/ └── README.md Dataset download and conversion instructions -``` \ No newline at end of file +``` diff --git a/examples/LLM_Workflows/neo4j_graph_rag/data/README.md b/examples/LLM_Workflows/neo4j_graph_rag/data/README.md index 4b99bb6cd..1cb8f338e 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/data/README.md +++ b/examples/LLM_Workflows/neo4j_graph_rag/data/README.md @@ -54,4 +54,4 @@ with open("tmdb_5000_credits.json", "w") as f: json.dump(credits.to_dict(orient="records"), f) ``` -Run this script once from inside the `data/` folder, then proceed with `python run.py --mode ingest`. \ No newline at end of file +Run this script once from inside the `data/` folder, then proceed with `python run.py --mode ingest`. diff --git a/examples/LLM_Workflows/neo4j_graph_rag/data/data_refine.py b/examples/LLM_Workflows/neo4j_graph_rag/data/data_refine.py index d028de3f8..4dd48cfc3 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/data/data_refine.py +++ b/examples/LLM_Workflows/neo4j_graph_rag/data/data_refine.py @@ -16,13 +16,15 @@ # under the License. -import pandas as pd, json - +import json + +import pandas as pd + movies = pd.read_csv("examples/LLM_Workflows/neo4j_graph_rag/data/tmdb_5000_movies.csv") credits = pd.read_csv("examples/LLM_Workflows/neo4j_graph_rag/data/tmdb_5000_credits.csv") - + with open("examples/LLM_Workflows/neo4j_graph_rag/data/tmdb_5000_movies.json", "w") as f: json.dump(movies.to_dict(orient="records"), f) - + with open("examples/LLM_Workflows/neo4j_graph_rag/data/tmdb_5000_credits.json", "w") as f: - json.dump(credits.to_dict(orient="records"), f) \ No newline at end of file + json.dump(credits.to_dict(orient="records"), f) diff --git a/examples/LLM_Workflows/neo4j_graph_rag/docker-compose.yml b/examples/LLM_Workflows/neo4j_graph_rag/docker-compose.yml index b41df8671..9c9a2b513 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/docker-compose.yml +++ b/examples/LLM_Workflows/neo4j_graph_rag/docker-compose.yml @@ -39,4 +39,4 @@ services: volumes: neo4j_data: - neo4j_logs: \ No newline at end of file + neo4j_logs: diff --git a/examples/LLM_Workflows/neo4j_graph_rag/embed_module.py b/examples/LLM_Workflows/neo4j_graph_rag/embed_module.py index 2dd5ecd3d..5bd444e73 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/embed_module.py +++ b/examples/LLM_Workflows/neo4j_graph_rag/embed_module.py @@ -96,7 +96,7 @@ def movie_embeddings( model=EMBEDDING_MODEL, input=[item["text"] for item in batch], ) - for item, emb_obj in zip(batch, response.data): + for item, emb_obj in zip(batch, response.data, strict=False): results.append({"id": item["id"], "embedding": emb_obj.embedding}) logger.info("Embedded batch %d-%d of %d", i, min(i + BATCH_SIZE, total), total) @@ -165,4 +165,4 @@ def embedding_summary( "dimensions": EMBEDDING_DIMENSIONS, } logger.info("Embedding complete: %s", summary) - return summary \ No newline at end of file + return summary diff --git a/examples/LLM_Workflows/neo4j_graph_rag/generation_module.py b/examples/LLM_Workflows/neo4j_graph_rag/generation_module.py index eee8b078d..a7a2e02c3 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/generation_module.py +++ b/examples/LLM_Workflows/neo4j_graph_rag/generation_module.py @@ -90,4 +90,4 @@ def answer(prompt_messages: list[dict], openai_api_key: str) -> str: ) result = response.choices[0].message.content logger.info("Generated answer (%d chars)", len(result)) - return result \ No newline at end of file + return result diff --git a/examples/LLM_Workflows/neo4j_graph_rag/graph_schema.py b/examples/LLM_Workflows/neo4j_graph_rag/graph_schema.py index a50d82692..22792e6b7 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/graph_schema.py +++ b/examples/LLM_Workflows/neo4j_graph_rag/graph_schema.py @@ -105,4 +105,4 @@ def schema_to_prompt() -> str: prop_str = f" with properties: {', '.join(props)}" if props else "" lines.append(f" (:{src})-[:{rel}]->(:{dest}){prop_str}") - return "\n".join(lines) \ No newline at end of file + return "\n".join(lines) diff --git a/examples/LLM_Workflows/neo4j_graph_rag/ingest_module.py b/examples/LLM_Workflows/neo4j_graph_rag/ingest_module.py index ac9de7686..c5326b460 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/ingest_module.py +++ b/examples/LLM_Workflows/neo4j_graph_rag/ingest_module.py @@ -323,4 +323,4 @@ def ingestion_summary( "person_edges": write_person_nodes_and_edges, } logger.info("Ingestion complete: %s", summary) - return summary \ No newline at end of file + return summary diff --git a/examples/LLM_Workflows/neo4j_graph_rag/requirements.txt b/examples/LLM_Workflows/neo4j_graph_rag/requirements.txt index e1e08da02..a2cfc9387 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/requirements.txt +++ b/examples/LLM_Workflows/neo4j_graph_rag/requirements.txt @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. -sf-hamilton>=1.73.0 neo4j>=5.18.0 openai>=1.30.0 pandas>=2.0.0 python-dotenv>=1.0.0 +sf-hamilton>=1.73.0 tqdm>=4.66.0 diff --git a/examples/LLM_Workflows/neo4j_graph_rag/retrieval_module.py b/examples/LLM_Workflows/neo4j_graph_rag/retrieval_module.py index 3c045d343..c9a6b536b 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/retrieval_module.py +++ b/examples/LLM_Workflows/neo4j_graph_rag/retrieval_module.py @@ -34,10 +34,9 @@ import logging import openai -from neo4j import Driver - from embed_module import EMBEDDING_MODEL, VECTOR_INDEX_NAME from graph_schema import schema_to_prompt +from neo4j import Driver logger = logging.getLogger(__name__) @@ -127,6 +126,7 @@ # 1. Classify query intent # --------------------------------------------------------------------------- + def query_intent(user_query: str, openai_api_key: str) -> str: """ Classify the user query into one of four retrieval strategies: @@ -175,6 +175,7 @@ def query_intent(user_query: str, openai_api_key: str) -> str: # 2. Entity extraction # --------------------------------------------------------------------------- + def entity_extraction( user_query: str, openai_api_key: str, @@ -248,6 +249,7 @@ def entity_extraction( # 3. Entity resolution — look up canonical forms in Neo4j # --------------------------------------------------------------------------- + def _resolve_persons(names: list[str], session) -> dict[str, str]: """Fuzzy-match person names against the graph, return {input: canonical}.""" resolved = {} @@ -404,24 +406,16 @@ def entity_resolution( with neo4j_driver.session() as session: if entity_extraction.get("persons"): - resolved["persons"] = _resolve_persons( - entity_extraction["persons"], session - ) + resolved["persons"] = _resolve_persons(entity_extraction["persons"], session) if entity_extraction.get("movies"): - resolved["movies"] = _resolve_movies( - entity_extraction["movies"], session - ) + resolved["movies"] = _resolve_movies(entity_extraction["movies"], session) if entity_extraction.get("genres"): - resolved["genres"] = _resolve_genres( - entity_extraction["genres"], session - ) + resolved["genres"] = _resolve_genres(entity_extraction["genres"], session) if entity_extraction.get("companies"): - resolved["companies"] = _resolve_companies( - entity_extraction["companies"], session - ) + resolved["companies"] = _resolve_companies(entity_extraction["companies"], session) # Pass through numeric/date filters unchanged for key in ("year_after", "year_before", "rating_above", "rating_below"): @@ -436,6 +430,7 @@ def entity_resolution( # 4. Vector path # --------------------------------------------------------------------------- + def query_embedding( user_query: str, openai_api_key: str, @@ -499,6 +494,7 @@ def vector_results( # 5. Cypher generation using resolved entities # --------------------------------------------------------------------------- + def _build_entity_context(resolved: dict) -> str: """ Build a plain-English summary of resolved entities for the Cypher @@ -511,32 +507,32 @@ def _build_entity_context(resolved: dict) -> str: persons = resolved.get("persons", {}) if persons: - for original, canonical in persons.items(): + for _original, canonical in persons.items(): lines.append(f' Person: "{canonical}"') movies = resolved.get("movies", {}) if movies: - for original, canonical in movies.items(): + for _original, canonical in movies.items(): lines.append(f' Movie title: "{canonical}"') genres = resolved.get("genres", {}) if genres: - for original, canonical in genres.items(): + for _original, canonical in genres.items(): lines.append(f' Genre: "{canonical}"') companies = resolved.get("companies", {}) if companies: - for original, canonical in companies.items(): + for _original, canonical in companies.items(): lines.append(f' ProductionCompany: "{canonical}"') if "year_after" in resolved: - lines.append(f' Date filter: m.release_date > \'{resolved["year_after"]}-01-01\'') + lines.append(f" Date filter: m.release_date > '{resolved['year_after']}-01-01'") if "year_before" in resolved: - lines.append(f' Date filter: m.release_date < \'{resolved["year_before"]}-12-31\'') + lines.append(f" Date filter: m.release_date < '{resolved['year_before']}-12-31'") if "rating_above" in resolved: - lines.append(f' Rating filter: m.vote_average > {resolved["rating_above"]}') + lines.append(f" Rating filter: m.vote_average > {resolved['rating_above']}") if "rating_below" in resolved: - lines.append(f' Rating filter: m.vote_average < {resolved["rating_below"]}') + lines.append(f" Rating filter: m.vote_average < {resolved['rating_below']}") return "\n".join(lines) @@ -656,6 +652,7 @@ def cypher_results( # 6. Enrich vector results with graph traversal # --------------------------------------------------------------------------- + def _enrich_movie(movie_id: int, driver: Driver) -> dict | None: """Pull directors, cast, genres, companies for a movie node.""" cypher = """ @@ -684,6 +681,7 @@ def _enrich_movie(movie_id: int, driver: Driver) -> dict | None: # 7. Merge results # --------------------------------------------------------------------------- + def merged_results( vector_results: list[dict], cypher_results: list[dict], @@ -722,6 +720,7 @@ def merged_results( # 8. Format context # --------------------------------------------------------------------------- + def retrieved_context(merged_results: list[dict], query_intent: str) -> str: """ Format merged results into plain-text context for the generation DAG. @@ -734,18 +733,18 @@ def retrieved_context(merged_results: list[dict], query_intent: str) -> str: return "No relevant information found in the knowledge graph for this query." FIELD_LABELS = { - "movie": "Movie", - "director": "Director", - "actor": "Actor", - "genre": "Genre", - "company": "Production company", - "film_count": "Films", - "movie_count": "Count", + "movie": "Movie", + "director": "Director", + "actor": "Actor", + "genre": "Genre", + "company": "Production company", + "film_count": "Films", + "movie_count": "Count", "action_movie_count": "Action movies", - "avg_rating": "Avg rating", - "average_rating": "Avg rating", - "vote_average": "Rating", - "release_date": "Released", + "avg_rating": "Avg rating", + "average_rating": "Avg rating", + "vote_average": "Rating", + "release_date": "Released", } lines = [] @@ -753,7 +752,7 @@ def retrieved_context(merged_results: list[dict], query_intent: str) -> str: for row in merged_results: i += 1 - source = row.get("_source", "unknown") + row.get("_source", "unknown") if "directors" in row: # Enriched movie record from vector path @@ -786,4 +785,4 @@ def retrieved_context(merged_results: list[dict], query_intent: str) -> str: context = "\n".join(lines) logger.info("Formatted context: %d chars from %d results", len(context), len(merged_results)) - return context \ No newline at end of file + return context diff --git a/examples/LLM_Workflows/neo4j_graph_rag/run.py b/examples/LLM_Workflows/neo4j_graph_rag/run.py index ca995bd14..2a0e124e4 100644 --- a/examples/LLM_Workflows/neo4j_graph_rag/run.py +++ b/examples/LLM_Workflows/neo4j_graph_rag/run.py @@ -40,15 +40,15 @@ import os from pathlib import Path -from dotenv import load_dotenv -from hamilton import driver -from neo4j import GraphDatabase - import embed_module import generation_module import ingest_module import retrieval_module +from dotenv import load_dotenv from graph_schema import CONSTRAINTS +from neo4j import GraphDatabase + +from hamilton import driver load_dotenv() logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") @@ -124,9 +124,11 @@ def run_ingest(visualise: bool = False): ) s = result["ingestion_summary"] logger.info( - "Ingestion complete — movies: %d, genre edges: %d, " - "company edges: %d, person edges: %d", - s["movies"], s["genre_edges"], s["company_edges"], s["person_edges"], + "Ingestion complete — movies: %d, genre edges: %d, company edges: %d, person edges: %d", + s["movies"], + s["genre_edges"], + s["company_edges"], + s["person_edges"], ) drv.close() @@ -158,7 +160,9 @@ def run_embed(visualise: bool = False): s = result["embedding_summary"] logger.info( "Embedding complete — %d embeddings written, index: %s, model: %s", - s["embeddings_written"], s["vector_index"], s["model"], + s["embeddings_written"], + s["vector_index"], + s["model"], ) drv.close() @@ -172,11 +176,7 @@ def run_query(question: str, visualise: bool = False): drv = make_neo4j_driver() openai_api_key = get_env("OPENAI_API_KEY") - rag_driver = ( - driver.Builder() - .with_modules(retrieval_module, generation_module) - .build() - ) + rag_driver = driver.Builder().with_modules(retrieval_module, generation_module).build() if visualise: rag_driver.display_all_functions("rag_dag.png") @@ -239,4 +239,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()