Skip to content

Commit 5546e5a

Browse files
authored
Add 372 fix 489 (#528)
* fix video indexer auth * fixed video indexer support * fixed video indexer support * fix video indexer support * fixed video indexer support * added support for .xlsm * fixed all scope issue * added support for xml, yaml, and log #### **JSON** - Uses `RecursiveJsonSplitter`: - `max_chunk_size=600` - `convert_lists=True` - Produces JSON strings that retain original structure. - See `process_json_file`. #### **XML** - Uses `RecursiveCharacterTextSplitter` with XML-aware separators. - **Structure-preserving chunking**: - Separators prioritized: `\n\n` → `\n` → `>` (end of XML tags) → space → character - Splits at logical boundaries to maintain tag integrity - **Chunked by 4000 characters** with 200-character overlap for context preservation. - **Goal**: Preserve XML structure while providing manageable chunks for LLM processing. - See `process_xml`. #### **YAML / YML** - Processed using regex word splitting (similar to TXT). - **Chunked by 400 words**. - Maintains YAML structure through simple word-based splitting. - See `process_yaml`. #### **LOG** - Processed using line-based chunking to maintain log record integrity. - **Never splits mid-line** to preserve complete log entries. - **Line-Level Chunking**: 1. Split file by lines using `splitlines(keepends=True)` to preserve line endings. 2. Accumulate complete lines until reaching target word count ≈1000 words. 3. When adding next line would exceed target AND chunk already has content: - Finalize current chunk - Start new chunk with current line 4. If single line exceeds target, it gets its own chunk to prevent infinite loops. 5. Emit chunks with complete log records. - **Goal**: Provide substantial log context (1000 words) while ensuring no log entry is split across chunks. - See `process_log`. 
* updated yaml to use recursivesplitter * removed chunk overlap for yaml and xml * added support for older .doc files and .docm * added keyword and abstract for each doc in citation * added multi-modal input support * added ai vision analysis
1 parent b2aa14a commit 5546e5a

31 files changed

Lines changed: 4238 additions & 296 deletions

application/single_app/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@
8888
EXECUTOR_TYPE = 'thread'
8989
EXECUTOR_MAX_WORKERS = 30
9090
SESSION_TYPE = 'filesystem'
91-
VERSION = "0.229.063"
91+
VERSION = "0.229.098"
9292

9393

9494
SECRET_KEY = os.getenv('SECRET_KEY', 'dev-secret-key-change-in-production')
@@ -121,9 +121,9 @@
121121
CLIENTS_LOCK = threading.Lock()
122122

123123
ALLOWED_EXTENSIONS = {
124-
'txt', 'pdf', 'docx', 'xlsx', 'xls', 'csv', 'pptx', 'html', 'jpg', 'jpeg', 'png', 'bmp', 'tiff', 'tif', 'heif', 'md', 'json',
124+
'txt', 'pdf', 'doc', 'docm', 'docx', 'xlsx', 'xls', 'xlsm','csv', 'pptx', 'html', 'jpg', 'jpeg', 'png', 'bmp', 'tiff', 'tif', 'heif', 'md', 'json',
125125
'mp4', 'mov', 'avi', 'mkv', 'flv', 'mxf', 'gxf', 'ts', 'ps', '3gp', '3gpp', 'mpg', 'wmv', 'asf', 'm4a', 'm4v', 'isma', 'ismv',
126-
'dvr-ms', 'wav'
126+
'dvr-ms', 'wav', 'xml', 'yaml', 'yml', 'log'
127127
}
128128
ALLOWED_EXTENSIONS_IMG = {'png', 'jpg', 'jpeg'}
129129
MAX_CONTENT_LENGTH = 5000 * 1024 * 1024 # 5000 MB AKA 5 GB

application/single_app/functions_authentication.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -245,15 +245,34 @@ def get_valid_access_token_for_plugins(scopes=None):
245245

246246
def get_video_indexer_account_token(settings, video_id=None):
247247
"""
248-
For ARM-based VideoIndexer accounts:
248+
Get Video Indexer access token using managed identity authentication.
249+
250+
This function authenticates with Azure Video Indexer using the App Service's
251+
managed identity. The managed identity must have Contributor role on the
252+
Video Indexer resource.
253+
254+
Authentication flow:
255+
1. Acquire ARM access token using DefaultAzureCredential (managed identity)
256+
2. Call ARM generateAccessToken API to get Video Indexer access token
257+
3. Use Video Indexer access token for all API operations
258+
"""
259+
from functions_debug import debug_print
260+
261+
debug_print(f"[VIDEO INDEXER AUTH] Starting token acquisition using managed identity for video_id: {video_id}")
262+
debug_print(f"[VIDEO INDEXER AUTH] Azure environment: {AZURE_ENVIRONMENT}")
263+
264+
return get_video_indexer_managed_identity_token(settings, video_id)
265+
266+
def get_video_indexer_managed_identity_token(settings, video_id=None):
267+
"""
268+
For ARM-based VideoIndexer accounts using managed identity:
249269
1) Acquire an ARM token with DefaultAzureCredential
250270
2) POST to the ARM generateAccessToken endpoint
251271
3) Return the account-level accessToken
252272
"""
253273
from functions_debug import debug_print
254274

255-
debug_print(f"[VIDEO INDEXER AUTH] Starting token acquisition for video_id: {video_id}")
256-
debug_print(f"[VIDEO INDEXER AUTH] Azure environment: {AZURE_ENVIRONMENT}")
275+
debug_print(f"[VIDEO INDEXER AUTH] Using managed identity authentication")
257276

258277
# 1) ARM token
259278
if AZURE_ENVIRONMENT == "usgovernment":

application/single_app/functions_content.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def extract_table_file(file_path, file_ext):
172172
try:
173173
if file_ext == '.csv':
174174
df = pandas.read_csv(file_path)
175-
elif file_ext in ['.xls', '.xlsx']:
175+
elif file_ext in ['.xls', '.xlsx', '.xlsm']:
176176
df = pandas.read_excel(file_path)
177177
else:
178178
raise ValueError("Unsupported file extension for table extraction.")

application/single_app/functions_documents.py

Lines changed: 1162 additions & 86 deletions
Large diffs are not rendered by default.

application/single_app/functions_search.py

Lines changed: 32 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -46,18 +46,22 @@ def hybrid_search(query, user_id, document_id=None, top_n=12, doc_scope="all", a
4646
select=["id", "chunk_text", "chunk_id", "file_name", "user_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"]
4747
)
4848

49-
group_results = search_client_group.search(
50-
search_text=query,
51-
vector_queries=[vector_query],
52-
filter=(
53-
f"(group_id eq '{active_group_id}' or shared_group_ids/any(g: g eq '{active_group_id},approved')) and document_id eq '{document_id}'"
54-
),
55-
query_type="semantic",
56-
semantic_configuration_name="nexus-group-index-semantic-configuration",
57-
query_caption="extractive",
58-
query_answer="extractive",
59-
select=["id", "chunk_text", "chunk_id", "file_name", "group_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"]
60-
)
49+
# Only search group index if active_group_id is provided
50+
if active_group_id:
51+
group_results = search_client_group.search(
52+
search_text=query,
53+
vector_queries=[vector_query],
54+
filter=(
55+
f"(group_id eq '{active_group_id}' or shared_group_ids/any(g: g eq '{active_group_id},approved')) and document_id eq '{document_id}'"
56+
),
57+
query_type="semantic",
58+
semantic_configuration_name="nexus-group-index-semantic-configuration",
59+
query_caption="extractive",
60+
query_answer="extractive",
61+
select=["id", "chunk_text", "chunk_id", "file_name", "group_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"]
62+
)
63+
else:
64+
group_results = []
6165

6266
# Get visible public workspace IDs from user settings
6367
visible_public_workspace_ids = get_user_visible_public_workspace_ids_from_settings(user_id)
@@ -97,18 +101,22 @@ def hybrid_search(query, user_id, document_id=None, top_n=12, doc_scope="all", a
97101
select=["id", "chunk_text", "chunk_id", "file_name", "user_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"]
98102
)
99103

100-
group_results = search_client_group.search(
101-
search_text=query,
102-
vector_queries=[vector_query],
103-
filter=(
104-
f"(group_id eq '{active_group_id}' or shared_group_ids/any(g: g eq '{active_group_id},approved'))"
105-
),
106-
query_type="semantic",
107-
semantic_configuration_name="nexus-group-index-semantic-configuration",
108-
query_caption="extractive",
109-
query_answer="extractive",
110-
select=["id", "chunk_text", "chunk_id", "file_name", "group_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"]
111-
)
104+
# Only search group index if active_group_id is provided
105+
if active_group_id:
106+
group_results = search_client_group.search(
107+
search_text=query,
108+
vector_queries=[vector_query],
109+
filter=(
110+
f"(group_id eq '{active_group_id}' or shared_group_ids/any(g: g eq '{active_group_id},approved'))"
111+
),
112+
query_type="semantic",
113+
semantic_configuration_name="nexus-group-index-semantic-configuration",
114+
query_caption="extractive",
115+
query_answer="extractive",
116+
select=["id", "chunk_text", "chunk_id", "file_name", "group_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"]
117+
)
118+
else:
119+
group_results = []
112120

113121
# Get visible public workspace IDs from user settings
114122
visible_public_workspace_ids = get_user_visible_public_workspace_ids_from_settings(user_id)

application/single_app/functions_settings.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,10 @@ def get_settings():
134134
'number_of_historical_messages_to_summarize': 10,
135135
'enable_summarize_content_history_beyond_conversation_history_limit': False,
136136

137+
# Multi-Modal Vision Analysis
138+
'enable_multimodal_vision': False,
139+
'multimodal_vision_model': '',
140+
137141
# Document Classification
138142
'enable_document_classification': False,
139143
'document_classification_categories': [
@@ -215,11 +219,10 @@ def get_settings():
215219
'video_indexer_endpoint': video_indexer_endpoint,
216220
'video_indexer_location': '',
217221
'video_indexer_account_id': '',
218-
'video_indexer_api_key': '',
219222
'video_indexer_resource_group': '',
220223
'video_indexer_subscription_id': '',
221224
'video_indexer_account_name': '',
222-
'video_indexer_arm_api_version': '2021-11-10-preview',
225+
'video_indexer_arm_api_version': '2024-01-01',
223226
'video_index_timeout': 600,
224227

225228
# Audio file settings with Azure speech service

0 commit comments

Comments
 (0)