# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import traceback
from pathlib import Path

import yaml
from fastapi import (
    APIRouter,
    Depends,
    HTTPException,
    status,
)
from graphrag.api.query import global_search, local_search
from graphrag.config.create_graphrag_config import create_graphrag_config

from graphrag_app.logger.load_logger import load_pipeline_logger
from graphrag_app.typing.models import (
    GraphRequest,
    GraphResponse,
)
from graphrag_app.typing.pipeline import PipelineJobState
from graphrag_app.utils.azure_clients import AzureClientManager
from graphrag_app.utils.common import (
    get_df,
    sanitize_name,
    subscription_key_check,
    validate_index_file_exist,
)
from graphrag_app.utils.pipeline import PipelineJob

query_route = APIRouter(
    prefix="/query",
    tags=["Query Operations"],
)
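
# Require a subscription key on every query request when running inside
# Kubernetes (KUBERNETES_SERVICE_HOST is only set in-cluster).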
if os.getenv("KUBERNETES_SERVICE_HOST"):
    query_route.dependencies.append(Depends(subscription_key_check))


@query_route.post(
    "/global",
    summary="Perform a global search across the knowledge graph index",
    description="The global query method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but it often gives good responses for questions that require an understanding of the dataset as a whole.",
    response_model=GraphResponse,
    responses={status.HTTP_200_OK: {"model": GraphResponse}},
)
async def global_query(request: GraphRequest):
    # this is a slightly modified version of the graphrag.query.cli.run_global_search method
    index_name = request.index_name
    sanitized_index_name = sanitize_name(index_name)

    if not _is_index_complete(sanitized_index_name):
        raise HTTPException(
            status_code=status.HTTP_425_TOO_EARLY,
            detail=f"{index_name} not ready for querying.",
        )
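
    # parquet artifacts written to blob storage by the graphrag indexing pipeline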
    COMMUNITY_REPORT_TABLE = "output/create_final_community_reports.parquet"
    COMMUNITIES_TABLE = "output/create_final_communities.parquet"
    ENTITIES_TABLE = "output/create_final_entities.parquet"
    NODES_TABLE = "output/create_final_nodes.parquet"

    # check for existence of each file the query relies on
    validate_index_file_exist(sanitized_index_name, COMMUNITY_REPORT_TABLE)
    validate_index_file_exist(sanitized_index_name, COMMUNITIES_TABLE)
    validate_index_file_exist(sanitized_index_name, ENTITIES_TABLE)
    validate_index_file_exist(sanitized_index_name, NODES_TABLE)

    if isinstance(request.community_level, int):
        COMMUNITY_LEVEL = request.community_level
    else:
        # current investigations show that community level 1 is the most useful
        # default for global search
        COMMUNITY_LEVEL = 1

    try:
        # construct storage paths and read the parquet files into DataFrames
        community_report_table_path = (
            f"abfs://{sanitized_index_name}/{COMMUNITY_REPORT_TABLE}"
        )
        communities_table_path = f"abfs://{sanitized_index_name}/{COMMUNITIES_TABLE}"
        entities_table_path = f"abfs://{sanitized_index_name}/{ENTITIES_TABLE}"
        nodes_table_path = f"abfs://{sanitized_index_name}/{NODES_TABLE}"

        # load parquet tables associated with the index
        nodes_df = get_df(nodes_table_path)
        community_reports_df = get_df(community_report_table_path)
        communities_df = get_df(communities_table_path)
        entities_df = get_df(entities_table_path)

        # load custom pipeline settings
        ROOT_DIR = Path(__file__).resolve().parent.parent.parent
        with (ROOT_DIR / "scripts/settings.yaml").open("r") as f:
            data = yaml.safe_load(f)
        # layer the custom settings on top of graphrag's default configuration
        parameters = create_graphrag_config(data, ".")

        # perform async search
        result = await global_search(
            config=parameters,
            nodes=nodes_df,
            entities=entities_df,
            communities=communities_df,
            community_reports=community_reports_df,
            community_level=COMMUNITY_LEVEL,
            dynamic_community_selection=False,
            response_type="Multiple Paragraphs",
            query=request.query,
        )
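        # graphrag search APIs return a (response, context_data) tuple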
        return GraphResponse(result=result[0], context_data=result[1])
    except Exception as e:
        logger = load_pipeline_logger()
        logger.error(
            message="Could not perform global search.",
            cause=e,
            stack=traceback.format_exc(),
        )
        raise HTTPException(status_code=500, detail=None)


@query_route.post(
    "/local",
    summary="Perform a local search across the knowledge graph index.",
    description="The local query method generates answers by combining relevant data from the AI-extracted knowledge graph with text chunks from the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).",
    response_model=GraphResponse,
    responses={status.HTTP_200_OK: {"model": GraphResponse}},
)
async def local_query(request: GraphRequest):
    index_name = request.index_name
    sanitized_index_name = sanitize_name(index_name)

    if not _is_index_complete(sanitized_index_name):
        raise HTTPException(
            status_code=status.HTTP_425_TOO_EARLY,
            detail=f"{index_name} not ready for querying.",
        )

    azure_client_manager = AzureClientManager()
    blob_service_client = azure_client_manager.get_blob_service_client()

    COMMUNITY_REPORT_TABLE = "output/create_final_community_reports.parquet"
    COVARIATES_TABLE = "output/create_final_covariates.parquet"
    ENTITIES_TABLE = "output/create_final_entities.parquet"
    NODES_TABLE = "output/create_final_nodes.parquet"
    RELATIONSHIPS_TABLE = "output/create_final_relationships.parquet"
    TEXT_UNITS_TABLE = "output/create_final_text_units.parquet"

    if isinstance(request.community_level, int):
        COMMUNITY_LEVEL = request.community_level
    else:
        # current investigations show that community level 2 is the most useful
        # default for local search
        COMMUNITY_LEVEL = 2

    # check for existence of files the query relies on to validate the index is complete
    validate_index_file_exist(sanitized_index_name, COMMUNITY_REPORT_TABLE)
    validate_index_file_exist(sanitized_index_name, ENTITIES_TABLE)
    validate_index_file_exist(sanitized_index_name, NODES_TABLE)
    validate_index_file_exist(sanitized_index_name, RELATIONSHIPS_TABLE)
    validate_index_file_exist(sanitized_index_name, TEXT_UNITS_TABLE)

    community_report_table_path = (
        f"abfs://{sanitized_index_name}/{COMMUNITY_REPORT_TABLE}"
    )
    covariates_table_path = f"abfs://{sanitized_index_name}/{COVARIATES_TABLE}"
    entities_table_path = f"abfs://{sanitized_index_name}/{ENTITIES_TABLE}"
    nodes_table_path = f"abfs://{sanitized_index_name}/{NODES_TABLE}"
    relationships_table_path = f"abfs://{sanitized_index_name}/{RELATIONSHIPS_TABLE}"
    text_units_table_path = f"abfs://{sanitized_index_name}/{TEXT_UNITS_TABLE}"

    nodes_df = get_df(nodes_table_path)
    community_reports_df = get_df(community_report_table_path)
    entities_df = get_df(entities_table_path)
    relationships_df = get_df(relationships_table_path)
    text_units_df = get_df(text_units_table_path)

    # if present, load the index's covariates dataframe
    index_container_client = blob_service_client.get_container_client(
        sanitized_index_name
    )
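    # covariates (claims) are only emitted when claim extraction is enabled
    # during indexing, so treat this table as optional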
    covariates_df = None
    if index_container_client.get_blob_client(COVARIATES_TABLE).exists():
        covariates_df = get_df(covariates_table_path)

    # load custom pipeline settings
    ROOT_DIR = Path(__file__).resolve().parent.parent.parent
    with (ROOT_DIR / "scripts/settings.yaml").open("r") as f:
        data = yaml.safe_load(f)
    # layer the custom settings on top of graphrag's default configuration
    parameters = create_graphrag_config(data, ".")
    # point the vector store at this index's embedding collection
    parameters.embeddings.vector_store["collection_name"] = sanitized_index_name

    # perform async search
    result = await local_search(
        config=parameters,
        nodes=nodes_df,
        entities=entities_df,
        community_reports=community_reports_df,
        text_units=text_units_df,
        relationships=relationships_df,
        covariates=covariates_df,
        community_level=COMMUNITY_LEVEL,
        response_type="Multiple Paragraphs",
        query=request.query,
    )

    return GraphResponse(result=result[0], context_data=result[1])


def _is_index_complete(index_name: str) -> bool:
    """
    Check if an index is ready for querying.

    An index is ready for use only if it exists in the jobs table in cosmos db
    and the indexing build job has finished (i.e. reached 100 percent).
    Otherwise it is not ready.

    Args:
    -----
    index_name (str)
        Name of the index to check.

    Returns: bool
        True if the index is ready for use, False otherwise.
    """
    if PipelineJob.item_exist(index_name):
        pipeline_job = PipelineJob.load_item(index_name)
        if PipelineJobState(pipeline_job.status) == PipelineJobState.COMPLETE:
            return True
    return False
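

# Example request (a hypothetical client sketch; API_URL and the index name are
# placeholders, and a Kubernetes deployment would also require a valid
# subscription-key header):
#
#   import requests
#
#   response = requests.post(
#       f"{API_URL}/query/global",
#       json={"index_name": "myindex", "query": "What themes span the dataset?"},
#   )
#   print(response.json()["result"])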