66 AzureOpenAIEmbeddingSkill ,
77 OcrSkill ,
88 MergeSkill ,
9+ ShaperSkill ,
10+ WebApiSkill ,
911 SearchIndexerIndexProjections ,
1012 SearchIndexerIndexProjectionSelector ,
1113 SearchIndexerIndexProjectionsParameters ,
@@ -83,12 +85,30 @@ def create_skillset(self):
8385 inputs = [
8486 InputFieldMappingEntry (name = "text" , source = "/document/merged_content" ),
8587 ],
86- outputs = [OutputFieldMappingEntry (name = "textItems" , target_name = "pages" )],
88+ outputs = [
89+ OutputFieldMappingEntry (name = "textItems" , target_name = "pages" ),
90+ OutputFieldMappingEntry (name = "ordinalPositions" , target_name = "chunk_nos" ),
91+ ],
92+ )
93+
94+ # Custom WebApi skill to combine pages and chunk numbers into a single structure
95+ combine_pages_and_chunk_nos_skill = WebApiSkill (
96+ description = "Combine pages and chunk numbers together" ,
97+ context = "/document" ,
98+ uri = f"{ self .env_helper .BACKEND_URL } /api/combine_pages_and_chunknos" ,
99+ http_method = "POST" ,
100+ inputs = [
101+ InputFieldMappingEntry (name = "pages" , source = "/document/pages" ),
102+ InputFieldMappingEntry (name = "chunk_nos" , source = "/document/chunk_nos" ),
103+ ],
104+ outputs = [
105+ OutputFieldMappingEntry (name = "pages_with_chunks" , target_name = "pages_with_chunks" )
106+ ]
87107 )
88108
89109 embedding_skill = AzureOpenAIEmbeddingSkill (
90110 description = "Skill to generate embeddings via Azure OpenAI" ,
91- context = "/document/pages /*" ,
111+ context = "/document/pages_with_chunks /*" ,
92112 resource_uri = self .env_helper .AZURE_OPENAI_ENDPOINT ,
93113 deployment_id = self .env_helper .AZURE_OPENAI_EMBEDDING_MODEL ,
94114 api_key = (
@@ -104,31 +124,49 @@ def create_skillset(self):
104124 )
105125 ),
106126 inputs = [
107- InputFieldMappingEntry (name = "text" , source = "/document/pages/* " ),
127+ InputFieldMappingEntry (name = "text" , source = "/document/pages_with_chunks/*/page_text " ),
108128 ],
109129 outputs = [
110130 OutputFieldMappingEntry (name = "embedding" , target_name = "content_vector" )
111131 ],
112132 )
113133
134+ metadata_shaper = ShaperSkill (
135+ description = "Structure metadata fields into a complex object" ,
136+ context = "/document/pages_with_chunks/*" ,
137+ inputs = [
138+ InputFieldMappingEntry (name = "id" , source = "/document/id" ),
139+ InputFieldMappingEntry (name = "source" , source = "/document/metadata_storage_path" ),
140+ InputFieldMappingEntry (name = "title" , source = "/document/title" ),
141+ InputFieldMappingEntry (name = "chunk" , source = "/document/pages_with_chunks/*/chunk_no" ),
142+ ],
143+ outputs = [
144+ OutputFieldMappingEntry (name = "output" , target_name = "metadata_object" )
145+ ]
146+ )
147+
114148 index_projections = SearchIndexerIndexProjections (
115149 selectors = [
116150 SearchIndexerIndexProjectionSelector (
117151 target_index_name = self .env_helper .AZURE_SEARCH_INDEX ,
118152 parent_key_field_name = "id" ,
119- source_context = "/document/pages /*" ,
153+ source_context = "/document/pages_with_chunks /*" ,
120154 mappings = [
121155 InputFieldMappingEntry (
122- name = "content" , source = "/document/pages/* "
156+ name = "content" , source = "/document/pages_with_chunks/*/page_text "
123157 ),
124158 InputFieldMappingEntry (
125159 name = "content_vector" ,
126- source = "/document/pages /*/content_vector" ,
160+ source = "/document/pages_with_chunks /*/content_vector" ,
127161 ),
128162 InputFieldMappingEntry (name = "title" , source = "/document/title" ),
129163 InputFieldMappingEntry (
130164 name = "source" , source = "/document/metadata_storage_path"
131165 ),
166+ InputFieldMappingEntry (
167+ name = "metadata" ,
168+ source = "/document/pages_with_chunks/*/metadata_object" ,
169+ )
132170 ],
133171 ),
134172 ],
@@ -140,7 +178,7 @@ def create_skillset(self):
140178 skillset = SearchIndexerSkillset (
141179 name = skillset_name ,
142180 description = "Skillset to chunk documents and generating embeddings" ,
143- skills = [ocr_skill , merge_skill , split_skill , embedding_skill ],
181+ skills = [ocr_skill , merge_skill , split_skill , combine_pages_and_chunk_nos_skill , embedding_skill , metadata_shaper ],
144182 index_projections = index_projections ,
145183 )
146184
0 commit comments