1- # Copyright (c) 2024 Microsoft Corporation.
1+ # Copyright (C) 2026 Microsoft
22# Licensed under the MIT License
33
44"""A module containing run_workflow method definition."""
55
66import logging
7+ from collections import Counter
8+ from typing import Any
79
8- import pandas as pd
10+ from graphrag_storage . tables . table import Table
911
1012from graphrag .config .models .graph_rag_config import GraphRagConfig
11- from graphrag .data_model .data_reader import DataReader
13+ from graphrag .data_model .row_transformers import (
14+ transform_entity_row ,
15+ transform_relationship_row ,
16+ )
1217from graphrag .index .operations .finalize_entities import finalize_entities
13- from graphrag .index .operations .finalize_relationships import finalize_relationships
18+ from graphrag .index .operations .finalize_relationships import (
19+ finalize_relationships ,
20+ )
1421from graphrag .index .operations .snapshot_graphml import snapshot_graphml
1522from graphrag .index .typing .context import PipelineRunContext
1623from graphrag .index .typing .workflow import WorkflowFunctionOutput
@@ -24,41 +31,95 @@ async def run_workflow(
2431) -> WorkflowFunctionOutput :
2532 """All the steps to create the base entity graph."""
2633 logger .info ("Workflow started: finalize_graph" )
27- reader = DataReader (context .output_table_provider )
28- entities = await reader .entities ()
29- relationships = await reader .relationships ()
3034
31- final_entities , final_relationships = finalize_graph (
32- entities ,
33- relationships ,
34- )
35-
36- await context .output_table_provider .write_dataframe ("entities" , final_entities )
37- await context .output_table_provider .write_dataframe (
38- "relationships" , final_relationships
39- )
35+ async with (
36+ context .output_table_provider .open (
37+ "entities" ,
38+ transformer = transform_entity_row ,
39+ ) as entities_table ,
40+ context .output_table_provider .open (
41+ "relationships" ,
42+ transformer = transform_relationship_row ,
43+ ) as relationships_table ,
44+ ):
45+ result = await finalize_graph (
46+ entities_table ,
47+ relationships_table ,
48+ )
4049
4150 if config .snapshots .graphml :
51+ rels = await context .output_table_provider .read_dataframe ("relationships" )
4252 await snapshot_graphml (
43- final_relationships ,
53+ rels ,
4454 name = "graph" ,
4555 storage = context .output_storage ,
4656 )
4757
4858 logger .info ("Workflow completed: finalize_graph" )
49- return WorkflowFunctionOutput (
50- result = {
51- "entities" : entities ,
52- "relationships" : relationships ,
53- }
54- )
55-
56-
57- def finalize_graph (
58- entities : pd .DataFrame ,
59- relationships : pd .DataFrame ,
60- ) -> tuple [pd .DataFrame , pd .DataFrame ]:
61- """All the steps to finalize the entity and relationship formats."""
62- final_entities = finalize_entities (entities , relationships )
63- final_relationships = finalize_relationships (relationships )
64- return (final_entities , final_relationships )
59+ return WorkflowFunctionOutput (result = result )
60+
61+
async def finalize_graph(
    entities_table: Table,
    relationships_table: Table,
) -> dict[str, list[dict[str, Any]]]:
    """Finalize the entity and relationship tables of the graph.

    A node-degree lookup is first derived by streaming the
    relationship rows. That lookup is then handed to
    ``finalize_entities`` together with the entity table and,
    afterwards, to ``finalize_relationships`` together with the
    relationship table.

    Args
    ----
        entities_table: Table
            Opened entity table, forwarded to ``finalize_entities``.
        relationships_table: Table
            Opened relationship table. It is iterated once to build
            the degree lookup and then forwarded to
            ``finalize_relationships``.

    Returns
    -------
        dict[str, list[dict[str, Any]]]
            The result of ``finalize_entities`` under the key
            ``"entities"`` and the result of
            ``finalize_relationships`` under ``"relationships"``.
            NOTE(review): presumably a handful of sample rows per
            table (the callees are not visible here) - confirm.
    """
    degrees = await _build_degree_map(relationships_table)

    # Values of a dict display are evaluated top to bottom, so the
    # entities are finalized before the relationships.
    return {
        "entities": await finalize_entities(entities_table, degrees),
        "relationships": await finalize_relationships(
            relationships_table, degrees
        ),
    }
96+
97+
async def _build_degree_map(
    relationships_table: Table,
) -> dict[str, int]:
    """Derive per-node degrees from a stream of relationship rows.

    Every row contributes the pair formed by its ``"source"`` and
    ``"target"`` values. The pair is put into sorted order so that
    ``(A, B)`` and ``(B, A)`` collapse into one undirected edge, and
    each distinct edge is counted only once. A row whose source and
    target are equal adds two to that node's degree.

    Args
    ----
        relationships_table: Table
            Async-iterable table yielding rows that can be indexed
            by ``"source"`` and ``"target"``.

    Returns
    -------
        dict[str, int]
            Node identifier mapped to the number of distinct
            undirected edges touching it.
    """
    # A dict doubles as an insertion-ordered set of normalized edges.
    distinct_edges: dict[tuple[str, str], None] = {}
    async for record in relationships_table:
        endpoints = sorted((record["source"], record["target"]))
        first, second = endpoints
        distinct_edges.setdefault((first, second), None)

    node_degrees: Counter[str] = Counter()
    for edge in distinct_edges:
        # Counting the tuple's elements bumps both endpoints by one.
        node_degrees.update(edge)
    return dict(node_degrees)
0 commit comments