Skip to content

Commit dd5e30e

Browse files
fix byod flow and update integrated vectorization to work with byod flow
1 parent dad4e6e commit dd5e30e

6 files changed

Lines changed: 122 additions & 16 deletions

File tree

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import logging
2+
import azure.functions as func
3+
import json
4+
5+
# Blueprint that carries the combine_pages_and_chunknos HTTP route defined
# below; it is imported and registered on the app in function_app.py.
bp_combine_pages_and_chunknos = func.Blueprint()
6+
7+
def _skill_error_response(message: str, status_code: int) -> func.HttpResponse:
    """Return a request-level failure in the WebApiSkill ``values`` envelope."""
    payload = {
        "values": [
            {
                "recordId": "error",
                "data": {},
                "errors": [{"message": message}],
                "warnings": [],
            }
        ]
    }
    return func.HttpResponse(
        body=json.dumps(payload),
        mimetype="application/json",
        status_code=status_code,
    )


def _combine_record(value: dict) -> dict:
    """Build the response entry for one input record of the skill request."""
    record_id = value.get("recordId")
    # `or` instead of a .get() default: an explicit JSON null must also fall
    # back to an empty container instead of raising further down.
    data = value.get("data") or {}
    pages = data.get("pages") or []
    chunk_nos = data.get("chunk_nos") or []

    warnings = None
    if len(pages) != len(chunk_nos):
        # zip() stops at the shorter input; report that instead of silently
        # dropping the unmatched items.
        warnings = [
            {
                "message": (
                    f"pages ({len(pages)}) and chunk_nos ({len(chunk_nos)}) "
                    "differ in length; unmatched trailing items were dropped."
                )
            }
        ]

    pages_with_chunks = [
        {"page_text": page, "chunk_no": chunk}
        for page, chunk in zip(pages, chunk_nos)
    ]
    return {
        "recordId": record_id,
        "data": {"pages_with_chunks": pages_with_chunks},
        "errors": None,
        "warnings": warnings,
    }


# NOTE(review): AuthLevel.ANONYMOUS means this endpoint needs no function key.
# The skillset calls it without one, so tightening this requires passing a key
# (or using managed identity) from the WebApiSkill definition as well.
@bp_combine_pages_and_chunknos.route(
    route="combine_pages_and_chunknos",
    methods=["POST"],
    auth_level=func.AuthLevel.ANONYMOUS,
)
def combine_pages_and_chunknos(req: func.HttpRequest) -> func.HttpResponse:
    """
    Combine the "pages" and "chunk_nos" arrays of each record into one array.

    This function is designed to be called by an Azure Cognitive Search
    WebApiSkill. It expects a JSON payload of the form
    ``{"values": [{"recordId": ..., "data": {"pages": [...], "chunk_nos": [...]}}]}``
    and returns, per record, ``data.pages_with_chunks``: a list of
    ``{"page_text": <page>, "chunk_no": <chunk_no>}`` objects paired by position.

    Behavior:
    - Missing or null ``values``, ``data``, ``pages`` or ``chunk_nos`` are
      treated as empty.
    - If the two arrays differ in length, pairing stops at the shorter one and
      the record carries a warning describing the mismatch.
    - A malformed record yields an ``errors`` entry for that record only; the
      remaining records in the batch are still processed.
    - A body that is not valid JSON, or not a JSON object, returns HTTP 400.
    - Any other unexpected failure returns HTTP 500.
    """
    logging.info("Combine pages and chunk numbers function processed a request.")

    try:
        req_body = req.get_json()
    except ValueError as e:
        logging.warning("Request body is not valid JSON: %s", e)
        return _skill_error_response(f"Invalid JSON payload: {e}", 400)

    if not isinstance(req_body, dict):
        logging.warning("Request body is not a JSON object.")
        return _skill_error_response("Request body must be a JSON object.", 400)

    try:
        # Bodies contain document text: keep them out of INFO-level logs.
        logging.debug("Request body: %s", req_body)
        values = req_body.get("values") or []

        response_values = []
        for value in values:
            try:
                response_values.append(_combine_record(value))
            except (AttributeError, TypeError) as e:
                # Isolate the failure so one bad record does not fail the batch.
                logging.exception("Failed to combine pages and chunk numbers for a record.")
                response_values.append(
                    {
                        "recordId": (
                            value.get("recordId") if isinstance(value, dict) else None
                        ),
                        "data": {},
                        "errors": [{"message": str(e)}],
                        "warnings": None,
                    }
                )

        # Return the response in the format expected by the WebApiSkill
        logging.info("Returning %d record(s).", len(response_values))
        logging.debug("Response values: %s", response_values)
        return func.HttpResponse(
            body=json.dumps({"values": response_values}),
            mimetype="application/json",
            status_code=200,
        )

    except Exception as e:
        # Top-level boundary: log with traceback and report a server error.
        logging.exception("Error in combine_pages_and_chunknos function: %s", e)
        return _skill_error_response(str(e), 500)

code/backend/batch/function_app.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from batch_push_results import bp_batch_push_results
66
from batch_start_processing import bp_batch_start_processing
77
from get_conversation_response import bp_get_conversation_response
8+
from combine_pages_chunknos import bp_combine_pages_and_chunknos
89
from azure.monitor.opentelemetry import configure_azure_monitor
910

1011
logging.captureWarnings(True)
@@ -20,3 +21,4 @@
2021
app.register_functions(bp_batch_push_results)
2122
app.register_functions(bp_batch_start_processing)
2223
app.register_functions(bp_get_conversation_response)
24+
app.register_functions(bp_combine_pages_and_chunknos)

code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import logging
2-
from azure.search.documents.indexes.models import SearchIndexer, FieldMapping
2+
from azure.search.documents.indexes.models import SearchIndexer, FieldMapping, FieldMappingFunction
33
from azure.search.documents.indexes import SearchIndexerClient
44
from ..helpers.env_helper import EnvHelper
55
from ..helpers.azure_credential_utils import get_azure_credential
@@ -35,6 +35,13 @@ def create_or_update_indexer(self, indexer_name: str, skillset_name: str):
3535
}
3636
},
3737
field_mappings=[
38+
FieldMapping(
39+
source_field_name="metadata_storage_path",
40+
target_field_name="id",
41+
mapping_function=FieldMappingFunction(
42+
name="base64Encode", parameters={"useHttpServerUtilityUrlTokenEncode": False}
43+
)
44+
),
3845
FieldMapping(
3946
source_field_name="metadata_storage_path",
4047
target_field_name="source",

code/backend/batch/utilities/integrated_vectorization/azure_search_skillset.py

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
AzureOpenAIEmbeddingSkill,
77
OcrSkill,
88
MergeSkill,
9+
ShaperSkill,
10+
WebApiSkill,
911
SearchIndexerIndexProjections,
1012
SearchIndexerIndexProjectionSelector,
1113
SearchIndexerIndexProjectionsParameters,
@@ -82,12 +84,30 @@ def create_skillset(self):
8284
inputs=[
8385
InputFieldMappingEntry(name="text", source="/document/merged_content"),
8486
],
85-
outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
87+
outputs=[
88+
OutputFieldMappingEntry(name="textItems", target_name="pages"),
89+
OutputFieldMappingEntry(name="ordinalPositions", target_name="chunk_nos"),
90+
],
91+
)
92+
93+
# Custom WebApi skill to combine pages and chunk numbers into a single structure
94+
combine_pages_and_chunk_nos_skill = WebApiSkill(
95+
description="Combine pages and chunk numbers together",
96+
context="/document",
97+
uri=f"{self.env_helper.BACKEND_URL}/api/combine_pages_and_chunknos",
98+
http_method="POST",
99+
inputs=[
100+
InputFieldMappingEntry(name="pages", source="/document/pages"),
101+
InputFieldMappingEntry(name="chunk_nos", source="/document/chunk_nos"),
102+
],
103+
outputs=[
104+
OutputFieldMappingEntry(name="pages_with_chunks", target_name="pages_with_chunks")
105+
]
86106
)
87107

88108
embedding_skill = AzureOpenAIEmbeddingSkill(
89109
description="Skill to generate embeddings via Azure OpenAI",
90-
context="/document/pages/*",
110+
context="/document/pages_with_chunks/*",
91111
resource_uri=self.env_helper.AZURE_OPENAI_ENDPOINT,
92112
deployment_id=self.env_helper.AZURE_OPENAI_EMBEDDING_MODEL,
93113
api_key=(
@@ -96,31 +116,49 @@ def create_skillset(self):
96116
else None
97117
),
98118
inputs=[
99-
InputFieldMappingEntry(name="text", source="/document/pages/*"),
119+
InputFieldMappingEntry(name="text", source="/document/pages_with_chunks/*/page_text"),
100120
],
101121
outputs=[
102122
OutputFieldMappingEntry(name="embedding", target_name="content_vector")
103123
],
104124
)
105125

126+
metadata_shaper = ShaperSkill(
127+
description="Structure metadata fields into a complex object",
128+
context="/document/pages_with_chunks/*",
129+
inputs=[
130+
InputFieldMappingEntry(name="id", source="/document/id"),
131+
InputFieldMappingEntry(name="source", source="/document/metadata_storage_path"),
132+
InputFieldMappingEntry(name="title", source="/document/title"),
133+
InputFieldMappingEntry(name="chunk", source="/document/pages_with_chunks/*/chunk_no"),
134+
],
135+
outputs=[
136+
OutputFieldMappingEntry(name="output", target_name="metadata_object")
137+
]
138+
)
139+
106140
index_projections = SearchIndexerIndexProjections(
107141
selectors=[
108142
SearchIndexerIndexProjectionSelector(
109143
target_index_name=self.env_helper.AZURE_SEARCH_INDEX,
110144
parent_key_field_name="id",
111-
source_context="/document/pages/*",
145+
source_context="/document/pages_with_chunks/*",
112146
mappings=[
113147
InputFieldMappingEntry(
114-
name="content", source="/document/pages/*"
148+
name="content", source="/document/pages_with_chunks/*/page_text"
115149
),
116150
InputFieldMappingEntry(
117151
name="content_vector",
118-
source="/document/pages/*/content_vector",
152+
source="/document/pages_with_chunks/*/content_vector",
119153
),
120154
InputFieldMappingEntry(name="title", source="/document/title"),
121155
InputFieldMappingEntry(
122156
name="source", source="/document/metadata_storage_path"
123157
),
158+
InputFieldMappingEntry(
159+
name="metadata",
160+
source="/document/pages_with_chunks/*/metadata_object",
161+
)
124162
],
125163
),
126164
],
@@ -132,7 +170,7 @@ def create_skillset(self):
132170
skillset = SearchIndexerSkillset(
133171
name=skillset_name,
134172
description="Skillset to chunk documents and generating embeddings",
135-
skills=[ocr_skill, merge_skill, split_skill, embedding_skill],
173+
skills=[ocr_skill, merge_skill, split_skill, combine_pages_and_chunk_nos_skill, embedding_skill, metadata_shaper],
136174
index_projections=index_projections,
137175
)
138176

code/create_app.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,17 @@ def get_citations(citation_list):
5454
else citation["url"]
5555
)
5656
title = citation["title"]
57-
url = get_markdown_url(metadata["source"], title, container_sas)
57+
source = metadata["source"]
58+
if "_SAS_TOKEN_PLACEHOLDER_" not in source:
59+
source += "_SAS_TOKEN_PLACEHOLDER_"
60+
url = get_markdown_url(source, title, container_sas)
5861
citations_dict["citations"].append(
5962
{
6063
"content": url + "\n\n\n" + citation["content"],
6164
"id": metadata["id"],
6265
"chunk_id": (
6366
re.findall(r"\d+", metadata["chunk_id"])[-1]
64-
if metadata["chunk_id"] is not None
67+
if metadata.get("chunk_id") is not None
6568
else metadata["chunk"]
6669
),
6770
"title": title,
@@ -209,11 +212,6 @@ def conversation_with_data(conversation: Request, env_helper: EnvHelper):
209212
env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN
210213
],
211214
"title_field": env_helper.AZURE_SEARCH_TITLE_COLUMN or None,
212-
"source_field": env_helper.AZURE_SEARCH_SOURCE_COLUMN
213-
or None,
214-
"text_field": env_helper.AZURE_SEARCH_TEXT_COLUMN or None,
215-
"layoutText_field": env_helper.AZURE_SEARCH_LAYOUT_TEXT_COLUMN
216-
or None,
217215
"url_field": env_helper.AZURE_SEARCH_FIELDS_METADATA
218216
or None,
219217
"filepath_field": (

infra/main.bicep

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1016,6 +1016,7 @@ module function './app/function.bicep' = if (hostingModel == 'code') {
10161016
AZURE_OPENAI_SYSTEM_MESSAGE: azureOpenAISystemMessage
10171017
DATABASE_TYPE: databaseType
10181018
APP_ENV: appEnvironment
1019+
BACKEND_URL: backendUrl
10191020
},
10201021
// Conditionally add database-specific settings
10211022
databaseType == 'CosmosDB'
@@ -1086,6 +1087,7 @@ module function_docker './app/function.bicep' = if (hostingModel == 'container')
10861087
AZURE_OPENAI_SYSTEM_MESSAGE: azureOpenAISystemMessage
10871088
DATABASE_TYPE: databaseType
10881089
APP_ENV: appEnvironment
1090+
BACKEND_URL: backendUrl
10891091
},
10901092
// Conditionally add database-specific settings
10911093
databaseType == 'CosmosDB'
@@ -1363,7 +1365,7 @@ var azureContentSafetyInfo = string({
13631365
endpoint: contentsafety.outputs.endpoint
13641366
})
13651367

1366-
var backendUrl = 'https://${functionName}.azurewebsites.net'
1368+
var backendUrl = hostingModel == 'container' ? 'https://${functionName}-docker.azurewebsites.net' : 'https://${functionName}.azurewebsites.net'
13671369

13681370
output APPLICATIONINSIGHTS_CONNECTION_STRING string = monitoring.outputs.applicationInsightsConnectionString
13691371
output AZURE_APP_SERVICE_HOSTING_MODEL string = hostingModel

0 commit comments

Comments
 (0)