From 4485c1e0618585dd858fde075cba77e996e4c481 Mon Sep 17 00:00:00 2001 From: Bryon Lewis Date: Wed, 13 May 2026 08:21:04 -0400 Subject: [PATCH 1/5] move metadata testing scripts --- scripts/metadata/.gitignore | 4 ++++ scripts/{ => metadata}/JSONLAddMetadata.py | 0 scripts/{ => metadata}/generateMetadata.py | 4 ++-- scripts/{ => metadata}/generateNDJSON.py | 0 4 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 scripts/metadata/.gitignore rename scripts/{ => metadata}/JSONLAddMetadata.py (100%) rename scripts/{ => metadata}/generateMetadata.py (98%) rename scripts/{ => metadata}/generateNDJSON.py (100%) diff --git a/scripts/metadata/.gitignore b/scripts/metadata/.gitignore new file mode 100644 index 00000000..0a5316a2 --- /dev/null +++ b/scripts/metadata/.gitignore @@ -0,0 +1,4 @@ +*.csv +*.json +*.ndjson +*.mp4 \ No newline at end of file diff --git a/scripts/JSONLAddMetadata.py b/scripts/metadata/JSONLAddMetadata.py similarity index 100% rename from scripts/JSONLAddMetadata.py rename to scripts/metadata/JSONLAddMetadata.py diff --git a/scripts/generateMetadata.py b/scripts/metadata/generateMetadata.py similarity index 98% rename from scripts/generateMetadata.py rename to scripts/metadata/generateMetadata.py index 506a6e66..7c981504 100644 --- a/scripts/generateMetadata.py +++ b/scripts/metadata/generateMetadata.py @@ -15,8 +15,8 @@ apiURL = "127.0.0.1" # url of the server port = 8010 # set to your local port being used -rootFolderId = '68b832711394856d5124a243' # root folderId to push data to -limit = 1000 # only want to process X videos +rootFolderId = '6a046a841f9dc1415085ab7d' # root folderId to push data to +limit = 100 # only want to process X videos # Below is a global variable which shouldn't be edited totalFolders = 0 # use to maintain a total count of items added diff --git a/scripts/generateNDJSON.py b/scripts/metadata/generateNDJSON.py similarity index 100% rename from scripts/generateNDJSON.py rename to scripts/metadata/generateNDJSON.py From 5fe6a331381d8dd9ffd9b92ab943721ad375bb92 Mon Sep 17 00:00:00 2001 From: Bryon Lewis Date: Wed, 13 May 2026 10:41:59 -0400 Subject: [PATCH 2/5] metadata key processing updates --- server/dive_server/views_metadata.py | 307 +++++++++++------- server/dive_utils/metadata/models.py | 94 ++++-- server/dive_utils/metadata/numeric.py | 291 +++++++++++++++++ server/pyproject.toml | 2 + ...est_metadata_bulk_import_json_roundtrip.py | 107 ++++++ server/tests/test_numeric_metadata.py | 151 +++++++++ server/tox.ini | 2 + server/uv.lock | 49 ++- 8 files changed, 851 insertions(+), 152 deletions(-) create mode 100644 server/dive_utils/metadata/numeric.py create mode 100644 server/tests/test_metadata_bulk_import_json_roundtrip.py create mode 100644 server/tests/test_numeric_metadata.py diff --git a/server/dive_server/views_metadata.py b/server/dive_server/views_metadata.py index 092c7214..d7755ddd 100644 --- a/server/dive_server/views_metadata.py +++ b/server/dive_server/views_metadata.py @@ -35,10 +35,38 @@ ndjsonRegex, ) from dive_utils.metadata.models import DIVE_Metadata, DIVE_MetadataKeys +from dive_utils.metadata.numeric import ( + categorical_values_for_schema, + coerce_export_empty_strings, + finalize_metadata_keys_numerical_range, + is_nonfinite_numeric_placeholder, + merge_finite_numeric_range_dict, + sanitize_metadata_keys_doc_for_api, + sanitize_value_tree_for_girder_json, + safe_min_max, +) from dive_utils.types import DiveDatasetList, DIVEMetadataSlicerCLITaskParams from . import crud_dataset +# Metadata may store keys that duplicate injected export columns; strip by lowercase name so CSV has one Filename, etc. +_EXPORT_INJECTED_FIELD_LOWERS = frozenset({'divedataset', 'filename', 'dive_url'}) + + +def _metadata_fieldnames_for_export(metadata_items): + """Sorted metadata keys excluding fields we emit as fixed leading columns.""" + keys = {key for item in metadata_items for key in item.get('metadata', {}).keys()} + return sorted(k for k in keys if k.lower() not in _EXPORT_INJECTED_FIELD_LOWERS) + + +def _strip_injected_metadata_keys_copy(meta: dict): + """Remove duplicate injected keys before JSON export (canonical DIVEDataset / Filename / DIVE_URL follow).""" + out = dict(meta) + for k in list(out.keys()): + if k.lower() in _EXPORT_INJECTED_FIELD_LOWERS: + del out[k] + return out + def python_to_javascript_type(py_type): type_mapping = { @@ -55,6 +83,66 @@ def python_to_javascript_type(py_type): return type_mapping.get(py_type, "unknown") +def _accumulate_flat_metadata_key_stats(metadataKeys, flat_dict): + """ + Merge one flat metadata dict into in-progress metadataKeys stats (same rules + as JSON import / recursive dataset creation: finite numerics, no NaN in string sets). + """ + for key, raw in flat_dict.items(): + if key not in metadataKeys and raw is not None: + metadataKeys[key] = { + 'type': python_to_javascript_type(type(raw)), + 'set': set(), + 'count': 0, + } + if raw is None: + continue + typ = metadataKeys[key]['type'] + if typ == 'string': + if is_nonfinite_numeric_placeholder(raw): + continue + metadataKeys[key]['set'].add(raw) + metadataKeys[key]['count'] += 1 + elif typ == 'array': + for el in raw: + if python_to_javascript_type(type(el)) != 'string': + continue + if is_nonfinite_numeric_placeholder(el): + continue + metadataKeys[key]['set'].add(el) + metadataKeys[key]['count'] += 1 + elif typ == 'number': + merged = merge_finite_numeric_range_dict(metadataKeys[key].get('range'), raw) + if merged is None: + continue + metadataKeys[key]['range'] = merged + + +def _finalize_metadata_keys_categories(metadataKeys, categorical_limit): + """Assign categorical vs search vs numerical; JSON-safe sets and numerical ranges.""" + for key in metadataKeys.keys(): + bucket = metadataKeys[key] + bucket['unique'] = len(bucket['set']) + t = bucket['type'] + if t in ('string', 'array') and ( + bucket['unique'] < categorical_limit + or (bucket['count'] <= len(bucket['set']) and len(bucket['set']) < categorical_limit) + ): + bucket['category'] = 'categorical' + bucket['set'] = categorical_values_for_schema(list(bucket['set'])) + elif t == 'string': + bucket['category'] = 'search' + del bucket['set'] + elif t == 'number': + bucket['category'] = 'numerical' + del bucket['set'] + rng = bucket.get('range') + if isinstance(rng, dict): + finalize_metadata_keys_numerical_range(rng) + else: + del bucket['set'] + + # Keys we do not attach imported {value, description} schema text to (matchers / system fields). _METADATA_IMPORT_DESCRIPTION_EXCLUDE_KEYS = frozenset( { @@ -103,6 +191,9 @@ def normalize_metadata_row_for_storage(row): - Per row: ``"DIVEMetadataKeyDescriptions": {"MyKey": "...", "Other": {"description": "..."}}`` (this object is not stored on dataset rows). """ + if isinstance(row, dict): + # JSON may contain NaN/Infinity; pandas CSV rows use float nan — keep trees Girder-JSON-safe. + sanitize_value_tree_for_girder_json(row, minmax_keys_to_zero=False) if not isinstance(row, dict): return row, {} out = {} @@ -197,9 +288,19 @@ def find_folder_by_path(folder, sibling_path, user): return current_folder +def _loads_metadata_import_json(s: str): + """ + Parse JSON for metadata bulk / folder import. + + Python's json.loads accepts NaN/Infinity by default; map those to None so we + never propagate non-finite floats like sparse CSV cells do. + """ + return json.loads(s, parse_constant=lambda _c: None) + + def load_ndjson_string(ndjson_string): # Split the string into lines and parse each line as JSON - return [json.loads(line) for line in ndjson_string.splitlines()] + return [_loads_metadata_import_json(line) for line in ndjson_string.splitlines()] def load_metadata_json(search_folder, type='ndjson'): @@ -223,7 +324,7 @@ def load_metadata_json(search_folder, type='ndjson'): file_string = b"".join(list(file_generator)).decode() json_data = {} if type == 'json': - json_data = json.loads(file_string) + json_data = _loads_metadata_import_json(file_string) elif type == 'ndjson': json_data = load_ndjson_string(file_string) # Now we determine data types from the array of data @@ -233,6 +334,16 @@ def load_metadata_json(search_folder, type='ndjson'): return json_data, file['name'] +_BULK_IMPORT_SKIP_KEYS = frozenset( + {'divedataset', 'filename', 'dive_path', 'divemetadatakeydescriptions'} +) + + +def _bulk_import_row_is_matcher_key(key): + """True for columns used only to locate a row (never stored via updateKey).""" + return isinstance(key, str) and key.lower() in _BULK_IMPORT_SKIP_KEYS + + def bulk_metadata_update_process(user, rootFolder, updates, replace=False): results = [] # Get or create MetadataKeys for the root @@ -259,11 +370,12 @@ def bulk_metadata_update_process(user, rootFolder, updates, replace=False): new_keys[key].add(value) # Infer types/categories for new keys for key, values in new_keys.items(): - if isinstance(next(iter(values)), str): - category = 'categorical' if len(values) < categoricalLimit else 'search' - elif isinstance(next(iter(values)), (int, float)): + # CSV/partial rows often yield NaN for empty cells; min/max must stay JSON-finite. + mm = safe_min_max(values) + if mm is not None: category = 'numerical' - + elif any(isinstance(v, str) for v in values): + category = 'categorical' if len(values) < categoricalLimit else 'search' else: category = 'search' info = { @@ -271,12 +383,9 @@ def bulk_metadata_update_process(user, rootFolder, updates, replace=False): "count": len(values), } if category == 'categorical': - info['set'] = list(values) + info['set'] = categorical_values_for_schema(values) elif category == 'numerical': - info['range'] = { - "min": min(values), - "max": max(values), - } + info['range'] = {'min': mm[0], 'max': mm[1]} DIVE_MetadataKeys().addKey(rootFolder, user, key, info, unlocked=False) for entry in normalized_updates: reason = None @@ -298,20 +407,31 @@ def bulk_metadata_update_process(user, rootFolder, updates, replace=False): if not dive_metadata: reason = f"No dataset found with id {dataset_id}" elif video_name: - dive_metadata_items = DIVE_Metadata().find( - {"filename": video_name, 'root': str(rootFolder["_id"])} + # List matches once: the old code only set dive_metadata inside count() > 1, so a + # unique Filename never matched and every row looked like "not_found". + filename_matches = list( + DIVE_Metadata().find( + {"filename": video_name, 'root': str(rootFolder["_id"])} + ) ) - if dive_metadata_items.count() > 1: - # do we have to match the resouce with the DIVE Path + if len(filename_matches) == 0: + dive_metadata = None + elif len(filename_matches) == 1: + dive_metadata = filename_matches[0] + else: dive_path = entry.get('DIVE_Path', False) if not dive_path: - reason = f"Multiple datasets found with videoName {video_name}, need DIVE_Path to disambiguate" + reason = ( + f"Multiple datasets found with videoName {video_name}, need DIVE_Path to disambiguate" + ) else: - for item in dive_metadata_items: + for item in filename_matches: if item.get('DIVE_Path', False) == dive_path: dive_metadata = item break - if not dive_metadata: + if not dive_metadata: + reason = f"No dataset matched videoName {video_name} with DIVE_Path {dive_path}" + if not dive_metadata and not reason: reason = f"No dataset found with videoName or DIVE_Name {video_name}" else: raise RestException('Metadata Updates need either DIVEDataset or Filename', code=400) @@ -325,6 +445,8 @@ def bulk_metadata_update_process(user, rootFolder, updates, replace=False): if replace: DIVE_Metadata().removeCustomKeys(dataset, rootId, user) for key, value in entry.items(): + if _bulk_import_row_is_matcher_key(key): + continue # Set the value for this key on the dataset try: DIVE_Metadata().updateKey( @@ -609,6 +731,9 @@ def process_metadata( item[f'ffprobe_{ffMetadataKey}'] = ffmetadata.get( ffMetadataKey, False ) + sanitize_value_tree_for_girder_json( + item, minmax_keys_to_zero=False + ) DIVE_Metadata().createMetadata( datasetFolder, folder, user, item ) @@ -627,45 +752,9 @@ def process_metadata( else: errorLog.append(f"Could not find any results for Video file {item[matcher]}") - for key in item.keys(): - if key not in metadataKeys.keys() and item[key] is not None: - datatype = python_to_javascript_type(type(item[key])) - metadataKeys[key] = {"type": datatype, "set": set(), "count": 0} - if item[key] is None: - continue # we skip null values for processing - if metadataKeys[key]['type'] == 'string': - metadataKeys[key]['set'].add(item[key]) - metadataKeys[key]['count'] += 1 - if metadataKeys[key]['type'] == 'array': - for arrayitem in item[key]: - if python_to_javascript_type(type(arrayitem)) == 'string': - metadataKeys[key]['set'].add(arrayitem) - metadataKeys[key]['count'] += 1 - if metadataKeys[key]['type'] == 'number': - if 'range' not in metadataKeys[key].keys(): - metadataKeys[key]['range'] = {"min": item[key], "max": item[key]} - metadataKeys[key]['range'] = { - "min": min(item[key], metadataKeys[key]["range"]["min"]), - "max": max(item[key], metadataKeys[key]["range"]["max"]), - } + _accumulate_flat_metadata_key_stats(metadataKeys, item) # now we need to determine what is categorical vs what is a search field - for key in metadataKeys.keys(): - item = metadataKeys[key] - metadataKeys[key]["unique"] = len(item["set"]) - if item["type"] in ['string', 'array'] and ( - item["unique"] < categoricalLimit - or (item["count"] <= len(item["set"]) and len(item["set"]) < categoricalLimit) - ): - metadataKeys[key]["category"] = "categorical" - metadataKeys[key]['set'] = list(metadataKeys[key]['set']) - elif item["type"] == 'string': - metadataKeys[key]["category"] = "search" - del metadataKeys[key]['set'] - elif item["type"] == 'number': - metadataKeys[key]["category"] = "numerical" - del metadataKeys[key]['set'] - else: - del metadataKeys[key]['set'] + _finalize_metadata_keys_categories(metadataKeys, categoricalLimit) _apply_imported_descriptions_to_metadata_keys(metadataKeys, key_import_descriptions) DIVE_MetadataKeys().createMetadataKeys(folder, user, metadataKeys) # add metadata to root folder for @@ -783,47 +872,12 @@ def create_metadata_folder( data[f'ffprobe_{ffMetadataKey}'] = float(raw_value) except (ValueError, TypeError): data[f'ffprobe_{ffMetadataKey}'] = str(raw_value) + sanitize_value_tree_for_girder_json(data, minmax_keys_to_zero=False) DIVE_Metadata().createMetadata(item, base_folder, user, data) - for key in data.keys(): - if key not in metadataKeys.keys() and data[key] is not None: - datatype = python_to_javascript_type(type(data[key])) - metadataKeys[key] = {"type": datatype, "set": set(), "count": 0} - if data[key] is None: - continue # we skip null values for processing - if metadataKeys[key]['type'] == 'string': - metadataKeys[key]['set'].add(data[key]) - metadataKeys[key]['count'] += 1 - if metadataKeys[key]['type'] == 'array': - for arrayitem in data[key]: - if python_to_javascript_type(type(arrayitem)) == 'string': - metadataKeys[key]['set'].add(arrayitem) - metadataKeys[key]['count'] += 1 - if metadataKeys[key]['type'] == 'number': - if 'range' not in metadataKeys[key].keys(): - metadataKeys[key]['range'] = {"min": data[key], "max": data[key]} - metadataKeys[key]['range'] = { - "min": min(data[key], metadataKeys[key]["range"]["min"]), - "max": max(data[key], metadataKeys[key]["range"]["max"]), - } + _accumulate_flat_metadata_key_stats(metadataKeys, data) # now we need to determine what is categorical vs what is a search field - for key in metadataKeys.keys(): - item = metadataKeys[key] - metadataKeys[key]["unique"] = len(item["set"]) - if item["type"] in ['string', 'array'] and ( - item["unique"] < categoricalLimit - or (item["count"] <= len(item["set"]) and len(item["set"]) < categoricalLimit) - ): - metadataKeys[key]["category"] = "categorical" - metadataKeys[key]['set'] = list(metadataKeys[key]['set']) - elif item["type"] == 'string': - metadataKeys[key]["category"] = "search" - del metadataKeys[key]['set'] - elif item["type"] == 'number': - metadataKeys[key]["category"] = "numerical" - del metadataKeys[key]['set'] - else: - del metadataKeys[key]['set'] + _finalize_metadata_keys_categories(metadataKeys, categoricalLimit) DIVE_MetadataKeys().createMetadataKeys(base_folder, user, metadataKeys) # add metadata to root folder for base_folder['meta'][DIVEMetadataMarker] = True @@ -862,17 +916,10 @@ def get_metadata_keys( query=query, user=user, ) - keys = metadata_key['metadataKeys'] - for key in keys: - item = keys[key] - if item.get('category', False): - if item['category'] == 'numerical': - if ( - item['range'] - and item['range']['min'] == float('inf') - or item['range']['max'] == float('-inf') - ): - item['range'] = {'min': 0, 'max': 0} + # NaN/inf in numerical ranges or categorical sets (common after partial CSV imports) + # breaks json.dumps(allow_nan=False). + if sanitize_metadata_keys_doc_for_api(metadata_key): + DIVE_MetadataKeys().save(metadata_key) if metadata_key.get('unlocked', False) is False: metadata_key['unlocked'] = [] DIVE_MetadataKeys().initialize_updated_data(folder, None) @@ -907,9 +954,14 @@ def filter_folder(self, folder, filters, limit, offset, sort): ) if metadata_items is not None: pages = math.ceil(metadata_items.count() / limit) + page_list = list(metadata_items) + # Dataset rows can contain pandas NaN in metadata (e.g. sparse CSV); Girder JSON forbids NaN. + for doc in page_list: + if sanitize_value_tree_for_girder_json(doc, minmax_keys_to_zero=False): + DIVE_Metadata().save(doc) structured_results = { 'totalPages': pages, - 'pageResults': list(metadata_items), + 'pageResults': page_list, 'count': total_items, 'filtered': metadata_items.count(), } @@ -1069,7 +1121,7 @@ def get_metadata_filter(self, folder, keys=None): for key in results.keys(): results[key] = sorted(results[key]) - + sanitize_value_tree_for_girder_json(results, minmax_keys_to_zero=False) return results @access.user @@ -1211,7 +1263,8 @@ def add_metadata_key( if category == 'categorical' and values_arr and len(values_arr) > 0: info['set'] = list(set(values_arr)) if category == 'numerical': - info['range'] = {'min': float('inf'), 'max': float('-inf')} + # Do not use inf sentinels — Girder JSON responses use allow_nan=False. + info['range'] = {'min': 0.0, 'max': 0.0} if info.get('set', None) is None: info['set'] = [] desc_text = (description or '').strip() if description is not None else '' @@ -1515,18 +1568,14 @@ def export_metadata(self, folder, filters, format, baseURL): if format == 'csv': output = io.StringIO(newline='') - # Infer CSV headers from all keys used across items - headers = sorted( - {key for item in metadata_items for key in item.get('metadata', {}).keys()} - ) - headers.insert(0, 'DIVEDataset') # Add DIVEDataset as first column - headers.insert(1, 'Filename') # Add filename as second column - headers.insert(2, 'DIVE_URL') # Add DIVE_URL as third column + # Infer CSV headers from metadata keys, excluding columns we inject once (avoids duplicate Filename). + meta_headers = _metadata_fieldnames_for_export(metadata_items) + headers = ['DIVEDataset', 'Filename', 'DIVE_URL', *meta_headers] writer = csv.DictWriter(output, fieldnames=headers, extrasaction='ignore') writer.writeheader() for item in metadata_items: - row = {key: item.get('metadata', {}).get(key, '') for key in headers} + row = {key: item.get('metadata', {}).get(key, '') for key in meta_headers} row['DIVEDataset'] = str(item['DIVEDataset']) row['Filename'] = item.get('filename', '') # generate the url for the dive metadata it should be of the format like @@ -1539,6 +1588,8 @@ def export_metadata(self, folder, filters, format, baseURL): else: tempBaseURL = getApiUrl().replace('/api/v1', '') row['DIVE_URL'] = f"{tempBaseURL}/#/viewer/{item['DIVEDataset']}?diveMetadataRootId={item['root']}" + sanitize_value_tree_for_girder_json(row, minmax_keys_to_zero=False) + coerce_export_empty_strings(row) writer.writerow(row) csv_output = output.getvalue() @@ -1552,7 +1603,7 @@ def export_metadata(self, folder, filters, format, baseURL): else: # JSON export_data = [] for item in metadata_items: - export_item = item.get('metadata', {}).copy() + export_item = _strip_injected_metadata_keys_copy(item.get('metadata', {})) export_item['DIVEDataset'] = str(item['DIVEDataset']) export_item['Filename'] = item.get('filename', '') export_data.append(export_item) @@ -1566,6 +1617,8 @@ def export_metadata(self, folder, filters, format, baseURL): else: tempBaseURL = getApiUrl().replace('/api/v1', '') export_item['DIVE_URL'] = f"{tempBaseURL}/#/viewer/{item['DIVEDataset']}?diveMetadataRootId={item['root']}" + sanitize_value_tree_for_girder_json(export_item, minmax_keys_to_zero=False) + coerce_export_empty_strings(export_item) setContentDisposition(filename, mime='application/json') setRawResponse() @@ -1584,12 +1637,12 @@ def bulk_metadata_process_file(self, user, rootFolder, updates, replace=False): json.dumps(previous_data).encode('utf-8') results = bulk_metadata_update_process(user, rootFolder, updates, replace) - # get errors in the results - errors = [] - for item in results: - if 'error' in item.keys(): - errors.append(item['error']) - if len(errors) > 0: + # Skip the heavy history snapshot if any row failed (not just not_found: partial_success + # and per-key errors use the "errors" list, which the old code ignored). + failed = any( + item.get('status') in ('not_found', 'error', 'partial_success') for item in results + ) + if failed: return results # Check and find DIVEMetadataMarker folder or create it if it doesn't exist metadata_folder = Folder().findOne( @@ -1651,6 +1704,8 @@ def bulk_metadata_process_file(self, user, rootFolder, updates, replace=False): ) def bulk_update_metadata_file(self, rootFolder, replace): user = self.getCurrentUser() + if rootFolder['meta'].get(DIVEMetadataMarker, False) is False: + raise RestException('Folder is not a DIVE Metadata folder', code=404) # We Need to find any JSON file in the folder and be able to proces it unprocessed_items = Folder().childItems( rootFolder, @@ -1675,9 +1730,9 @@ def bulk_update_metadata_file(self, rootFolder, replace): file_generator = File().download(file, headers=False)() file_string = b"".join(list(file_generator)).decode() if file['name'].endswith('.json'): # standard json file - updates = json.loads(file_string) + updates = _loads_metadata_import_json(file_string) elif file['name'].endswith('.ndjson'): # new line delimited json - updates = [json.loads(line) for line in file_string.splitlines()] + updates = [_loads_metadata_import_json(line) for line in file_string.splitlines()] elif file['name'].endswith('.csv'): # use pandas for automatic type conversion df = pd.read_csv(io.StringIO(file_string)) updates = df.to_dict(orient='records') @@ -1715,6 +1770,8 @@ def bulk_update_metadata_file(self, rootFolder, replace): ) def bulk_update_metadata(self, rootFolder, updates, replace=False): user = self.getCurrentUser() + if rootFolder['meta'].get(DIVEMetadataMarker, False) is False: + raise RestException('Folder is not a DIVE Metadata folder', code=404) # We want to get the Data before processing results = self.bulk_metadata_process_file(user, rootFolder, updates, replace) diff --git a/server/dive_utils/metadata/models.py b/server/dive_utils/metadata/models.py index 3b3e9ce9..b114d254 100644 --- a/server/dive_utils/metadata/models.py +++ b/server/dive_utils/metadata/models.py @@ -1,4 +1,5 @@ import datetime +import math from dateutil import parser from girder import events @@ -6,6 +7,12 @@ from girder.exceptions import ValidationException from girder.models.model_base import Model +from dive_utils.metadata.numeric import ( + categorical_values_for_schema, + is_nonfinite_numeric_placeholder, + merge_numeric_sample_into_range_dict, +) + class DIVE_Metadata(Model): # This is NOT an access controlled model; it is expected that all endpoints @@ -71,15 +78,22 @@ def validate(self, doc): return doc def updateKey(self, folder, root, owner, key, value, categoricalLimit=50, force=False): - existing = self.findOne({'DIVEDataset': str(folder['_id']), 'root': root}) + # root is the DIVE metadata *collection* folder id (string); folder is the dataset Girder folder. + root_id = str(root) + existing = self.findOne({'DIVEDataset': str(folder['_id']), 'root': root_id}) if not existing: - raise Exception(f'Note MetadataKeys with folderId: {folder["_id"]} found') - query = {'root': root} + raise Exception( + f'No DIVE_Metadata row for datasetId={folder["_id"]} and metadataRoot={root_id}' + ) + query = {'root': root_id} metadataKeys = DIVE_MetadataKeys().findOne( query=query, ) if not metadataKeys: - raise Exception(f'Could not find the root metadataKeys with folderId: {folder["_id"]}') + raise Exception( + f'No DIVE_MetadataKeys document for metadataRoot={root_id} ' + f'(request was for datasetId={folder["_id"]})' + ) if ( key not in metadataKeys['unlocked'] and metadataKeys['owner'] != str(owner['_id']) ) and force is False: @@ -106,16 +120,35 @@ def updateKey(self, folder, root, owner, key, value, categoricalLimit=50, force= raise Exception( f'Key: {key} is not in the metadata only keys: {editable_keys} can be updated' ) - if metadataKeys['metadataKeys'][key]['category'] == 'numerical': - existing['metadata'][key] = float(value) + cat = metadataKeys['metadataKeys'][key]['category'] + finite_num = None + non_num_stored = None + if cat == 'numerical': + try: + fv = float(value) + except (TypeError, ValueError): + fv = float('nan') + # Sparse CSV cells become NaN — never persist (breaks Girder JSON on filter/metadata APIs). + if math.isfinite(fv): + existing['metadata'][key] = fv + finite_num = fv + else: + existing['metadata'][key] = None else: - existing['metadata'][key] = value + non_num_stored = value + if is_nonfinite_numeric_placeholder(non_num_stored): + non_num_stored = None + existing['metadata'][key] = non_num_stored self.save(existing) - # now we need to update the metadataKey - DIVE_MetadataKeys().updateKeyValue(existing['root'], owner, key, value, categoricalLimit) + # now we need to update the metadataKey aggregate (skip non-finite numericals) + if cat == 'numerical' and finite_num is not None: + DIVE_MetadataKeys().updateKeyValue(existing['root'], owner, key, finite_num, categoricalLimit) + elif cat != 'numerical' and non_num_stored is not None: + DIVE_MetadataKeys().updateKeyValue(existing['root'], owner, key, non_num_stored, categoricalLimit) def deleteKey(self, folder, root, owner, key): - existing = self.findOne({'DIVEDataset': str(folder['_id']), 'root': root}) + root_id = str(root) + existing = self.findOne({'DIVEDataset': str(folder['_id']), 'root': root_id}) if not existing: return query = {'root': existing['root']} @@ -124,22 +157,33 @@ def deleteKey(self, folder, root, owner, key): owner=str(owner['_id']), ) if not metadataKeys: - raise Exception(f'Could not find the root metadataKeys with folderId: {folder["_id"]}') + raise Exception( + f'No DIVE_MetadataKeys document for metadataRoot={existing["root"]} ' + f'(deleteKey context datasetId={folder["_id"]})' + ) if existing['metadata'].get(key, None) is not None: del existing['metadata'][key] self.save(existing) def removeCustomKeys(self, folder, root, owner): - existing = self.findOne({'DIVEDataset': str(folder['_id'])}) + # Must scope by root: the same dataset id must not be paired with another metadata collection's row. + root_id = str(root) + existing = self.findOne({'DIVEDataset': str(folder['_id']), 'root': root_id}) if not existing: - raise Exception(f'Note MetadataKeys with folderId: {folder["_id"]} not found') + raise Exception( + f'No DIVE_Metadata row for datasetId={folder["_id"]} and metadataRoot={root_id} ' + f'(cannot clear custom keys for replace import)' + ) query = {'root': existing['root']} metadataKeys = DIVE_MetadataKeys().findOne( query=query, owner=str(owner['_id']), ) if not metadataKeys: - raise Exception(f'Could not find the root metadataKeys with folderId: {folder["_id"]}') + raise Exception( + f'No DIVE_MetadataKeys document for metadataRoot={query["root"]} and owner={owner["_id"]} ' + f'(removeCustomKeys context datasetId={folder["_id"]})' + ) keys_to_remove = [ key for key in existing['metadata'].keys() @@ -388,17 +432,17 @@ def updateKeyValue(self, folderId, owner, key, value, categoricalLimit): keyData = existing['metadataKeys'][key] category = keyData['category'] if category == 'categorical': - keyDataSet = set(keyData['set']) - if len(keyDataSet) + 1 < categoricalLimit: - keyDataSet.add(value) - else: - keyData['category'] = 'search' - del keyData['set'] - keyData['set'] = list(keyDataSet) + keyDataSet = set(categorical_values_for_schema(keyData.get('set', []))) + skip_add = is_nonfinite_numeric_placeholder(value) + if not skip_add: + if len(keyDataSet) + 1 < categoricalLimit: + keyDataSet.add(value) + else: + keyData['category'] = 'search' + keyData.pop('set', None) + if keyData.get('category') == 'categorical': + keyData['set'] = list(keyDataSet) if category == 'numerical' and keyData.get('range', False): - range = keyData['range'] - range['min'] = min(float(value), float(range['min'])) - range['max'] = max(float(value), float(range['max'])) - keyData['range'] = range + merge_numeric_sample_into_range_dict(keyData['range'], value) existing['metadataKeys'][key] = keyData self.save(existing) diff --git a/server/dive_utils/metadata/numeric.py b/server/dive_utils/metadata/numeric.py new file mode 100644 index 00000000..190b8b94 --- /dev/null +++ b/server/dive_utils/metadata/numeric.py @@ -0,0 +1,291 @@ +""" +Helpers for metadata key ranges and tabular import numbers. + +Pandas often yields NaN for empty CSV cells; JSON imports can also surface NaN or +Infinity because Python's json.loads accepts those literals by default. min/max +and stored values must stay finite for Girder's JSON (allow_nan=False). +""" + +from __future__ import annotations + +import math +import numbers +from typing import Any, Iterable + + +def as_json_finite_float(x: Any, default: float = 0.0) -> float: + """Coerce to float; return default if missing, non-numeric, nan, or inf.""" + if isinstance(x, bool): + return default + try: + fx = float(x) + except (TypeError, ValueError): + return default + return fx if math.isfinite(fx) else default + + +def as_finite_float_or_none(x: Any) -> float | None: + """Finite float or None (reject bool / nan / inf).""" + if isinstance(x, bool): + return None + try: + fx = float(x) + except (TypeError, ValueError): + return None + return fx if math.isfinite(fx) else None + + +def is_nonfinite_numeric_placeholder(v: Any) -> bool: + """ + True for NaN/inf reals (bool excluded). Used when merging values into string + categorical aggregates so CSV NaN columns do not pollute sets. + """ + if isinstance(v, bool): + return False + if isinstance(v, numbers.Real): + return not math.isfinite(float(v)) + return False + + +def merge_finite_numeric_range_dict( + current_range: dict[str, Any] | None, sample: Any +) -> dict[str, float] | None: + """ + Merge one numeric sample into a metadataKeys-style {'min','max'} dict. + + Endpoints must already be finite or missing; non-finite samples are skipped + (returns None). If an endpoint is missing/non-finite, range is re-seeded from + the sample only (safe for initial key inference from clean rows). + """ + fv = as_finite_float_or_none(sample) + if fv is None: + return None + if not isinstance(current_range, dict): + return {'min': fv, 'max': fv} + lo = as_finite_float_or_none(current_range.get('min')) + hi = as_finite_float_or_none(current_range.get('max')) + if lo is None or hi is None: + return {'min': fv, 'max': fv} + return {'min': min(fv, lo), 'max': max(fv, hi)} + + +def merge_numeric_sample_into_range_dict(rng: dict[str, Any], sample: Any) -> bool: + """ + Lenient in-place merge for persisted metadataKeys ranges (may contain legacy + non-finite endpoints). Returns False if sample is not a finite float. + """ + fv = as_finite_float_or_none(sample) + if fv is None: + return False + lo = as_json_finite_float(rng.get('min'), fv) + hi = as_json_finite_float(rng.get('max'), fv) + rng['min'] = min(fv, lo) + rng['max'] = max(fv, hi) + if rng['min'] > rng['max']: + rng['min'], rng['max'] = rng['max'], rng['min'] + return True + + +def finalize_metadata_keys_numerical_range(r: dict[str, Any]) -> None: + """Normalize min/max to finite JSON-safe floats and fix inversion (mutates r).""" + lo = as_json_finite_float(r.get('min'), 0.0) + hi = as_json_finite_float(r.get('max'), 0.0) + if lo > hi: + lo, hi = hi, lo + r['min'], r['max'] = lo, hi + + +def finite_numeric_samples(values: Iterable[Any]) -> list[float]: + """Collect finite floats from a mixed-value iterable (e.g. CSV column values).""" + out: list[float] = [] + for v in values: + if isinstance(v, bool): + continue + if isinstance(v, numbers.Real): + fx = float(v) + if math.isfinite(fx): + out.append(fx) + return out + + +def safe_min_max(values: Iterable[Any]) -> tuple[float, float] | None: + """Return (min, max) over finite numeric samples, or None if empty.""" + nums = finite_numeric_samples(values) + if not nums: + return None + return min(nums), max(nums) + + +def categorical_values_for_schema(values: Iterable[Any]) -> list[Any]: + """Drop non-finite numeric placeholders (CSV NaN) so metadata key sets stay JSON-safe.""" + out: list[Any] = [] + for v in values: + if isinstance(v, numbers.Real) and not isinstance(v, bool): + if not math.isfinite(float(v)): + continue + out.append(v) + return out + + +def _float_or_nan(x: Any) -> float: + try: + return float(x) + except (TypeError, ValueError): + return float('nan') + + +def repair_numerical_ranges_in_metadata_keys_doc(metadata_key_doc: dict) -> bool: + """ + Normalize every numerical key's range to JSON-safe finite floats. + + Mutates metadata_key_doc in place. Returns True if anything was changed + (caller may persist with Model.save). + """ + keys = metadata_key_doc.get('metadataKeys') or {} + changed = False + for _name, item in keys.items(): + if not isinstance(item, dict) or item.get('category') != 'numerical': + continue + r = item.get('range') + if not isinstance(r, dict): + item['range'] = {'min': 0.0, 'max': 0.0} + changed = True + continue + raw_lo = _float_or_nan(r.get('min')) + raw_hi = _float_or_nan(r.get('max')) + lo = as_json_finite_float(raw_lo, 0.0) + hi = as_json_finite_float(raw_hi, 0.0) + if lo > hi: + lo, hi = hi, lo + # Always rewrite when originals were non-finite, inverted, or differ from normalized pair + if ( + (not math.isfinite(raw_lo)) + or (not math.isfinite(raw_hi)) + or raw_lo > raw_hi + or lo != raw_lo + or hi != raw_hi + ): + r['min'] = lo + r['max'] = hi + changed = True + return changed + + +def sanitize_categorical_sets_in_metadata_keys_doc(metadata_key_doc: dict) -> bool: + """Remove NaN/inf from categorical value sets (mutates doc).""" + keys = metadata_key_doc.get('metadataKeys') or {} + changed = False + for _name, item in keys.items(): + if not isinstance(item, dict) or item.get('category') != 'categorical': + continue + s = item.get('set') + if not isinstance(s, list): + continue + cleaned = categorical_values_for_schema(s) + if len(cleaned) != len(s): + item['set'] = cleaned + changed = True + return changed + + +def deep_strip_nonfinite_numbers(metadata_key_doc: dict) -> bool: + """ + Catch-all for DIVE_MetadataKeys documents: non-finite reals in dict values (min/max -> 0), + non-finite list elements removed. Returns whether any mutation occurred. + """ + return sanitize_value_tree_for_girder_json(metadata_key_doc, minmax_keys_to_zero=True) + + +def sanitize_value_tree_for_girder_json(root: Any, *, minmax_keys_to_zero: bool = False) -> bool: + """ + Mutate dict/list trees in place so json.dumps(..., allow_nan=False) succeeds. + + - Dict values that are non-finite reals become None, or 0.0 for keys 'min'/'max' when + minmax_keys_to_zero=True (metadata key range endpoints). + - Non-finite reals are removed from lists (e.g. categorical value sets, filter value lists). + """ + changed = False + + def walk(o: Any) -> None: + nonlocal changed + if isinstance(o, dict): + for k, v in list(o.items()): + if isinstance(v, (dict, list)): + walk(v) + elif isinstance(v, numbers.Real) and not isinstance(v, bool): + fx = float(v) + if not math.isfinite(fx): + o[k] = 0.0 if (minmax_keys_to_zero and k in ('min', 'max')) else None + changed = True + elif isinstance(o, list): + i = 0 + while i < len(o): + v = o[i] + if isinstance(v, (dict, list)): + walk(v) + i += 1 + elif isinstance(v, numbers.Real) and not isinstance(v, bool): + if not math.isfinite(float(v)): + del o[i] + changed = True + continue + i += 1 + + walk(root) + return changed + + +def coerce_export_empty_strings(root: Any) -> None: + """ + For tabular/JSON export: use empty string instead of null/NaN for missing or invalid + categorical/search/string-friendly cells (re-import friendly). + """ + if isinstance(root, dict): + for k, v in list(root.items()): + if isinstance(v, dict): + coerce_export_empty_strings(v) + elif isinstance(v, list): + for i, el in enumerate(v): + if isinstance(el, (dict, list)): + coerce_export_empty_strings(el) + elif el is None: + root[k][i] = '' + elif isinstance(el, numbers.Real) and not isinstance(el, bool) and not math.isfinite(float(el)): + root[k][i] = '' + elif isinstance(el, str) and el.strip().lower() == 'nan': + root[k][i] = '' + elif v is None: + root[k] = '' + elif isinstance(v, numbers.Real) and not isinstance(v, bool) and not math.isfinite(float(v)): + root[k] = '' + elif isinstance(v, str) and v.strip().lower() == 'nan': + root[k] = '' + elif isinstance(root, list): + i = 0 + while i < len(root): + el = root[i] + if isinstance(el, dict): + coerce_export_empty_strings(el) + i += 1 + elif isinstance(el, list): + coerce_export_empty_strings(el) + i += 1 + elif el is None: + root[i] = '' + i += 1 + elif isinstance(el, numbers.Real) and not isinstance(el, bool) and not math.isfinite(float(el)): + root[i] = '' + i += 1 + elif isinstance(el, str) and el.strip().lower() == 'nan': + root[i] = '' + i += 1 + else: + i += 1 + + +def sanitize_metadata_keys_doc_for_api(metadata_key_doc: dict) -> bool: + """Repair all known JSON hazards in a DIVE_MetadataKeys document; returns whether to save.""" + c1 = repair_numerical_ranges_in_metadata_keys_doc(metadata_key_doc) + c2 = sanitize_categorical_sets_in_metadata_keys_doc(metadata_key_doc) + c3 = deep_strip_nonfinite_numbers(metadata_key_doc) + return c1 or c2 or c3 diff --git a/server/pyproject.toml b/server/pyproject.toml index a1f1bf8d..1848646a 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -68,6 +68,8 @@ dependencies = [ [dependency-groups] dev = [ "tox>=3.25.0", + "pytest>=7.0", + "pytest-ordering", ] [project.optional-dependencies] diff --git a/server/tests/test_metadata_bulk_import_json_roundtrip.py b/server/tests/test_metadata_bulk_import_json_roundtrip.py new file mode 100644 index 00000000..4fb7312a --- /dev/null +++ b/server/tests/test_metadata_bulk_import_json_roundtrip.py @@ -0,0 +1,107 @@ +""" +Integration-style tests: bulk CSV / JSON import shapes must serialize with +``json.dumps(..., allow_nan=False)`` the same way ``metadata_keys`` and +``filter_folder`` responses do after sanitization. +""" + +from __future__ import annotations + +import io +import json + +import pandas as pd +import pytest + +pytest.importorskip('girder') + +from dive_server.views_metadata import ( # noqa: E402 + _loads_metadata_import_json, + normalize_metadata_row_for_storage, +) +from dive_utils.metadata.numeric import ( # noqa: E402 + sanitize_metadata_keys_doc_for_api, + sanitize_value_tree_for_girder_json, +) + + +def _assert_strict_json(obj): + encoded = json.dumps(obj, allow_nan=False) + assert 'NaN' not in encoded and 'Infinity' not in encoded + json.loads(encoded) + + +@pytest.mark.integration +def test_csv_sparse_numeric_cells_round_trip_strict_json(): + """Pandas leaves NaN for empty cells; normalize must yield strict-JSON dicts.""" + csv_text = ( + 'DIVEDataset,Filename,Score,Label,EmptyNum\n' + 'ds1,video_a.mp4,1.5,hello,\n' + 'ds1,video_b.mp4,,world,\n' + ) + df = pd.read_csv(io.StringIO(csv_text)) + records = df.to_dict(orient='records') + assert len(records) == 2 + for row in records: + norm, _desc = normalize_metadata_row_for_storage(row) + _assert_strict_json(norm) + + +@pytest.mark.integration +def test_json_nan_infinity_literals_round_trip_strict_json(): + """Loose JSON NaN/Infinity become None at parse time; row still strict-JSON after normalize.""" + payload = ( + '[{"DIVEDataset": "x", "Filename": "a.mp4", "v": NaN, "nested": {"x": Infinity}},' + '{"DIVEDataset": "y", "k": [1, -Infinity, 2]}]' + ) + data = _loads_metadata_import_json(payload) + for row in data: + norm, _desc = normalize_metadata_row_for_storage(row) + _assert_strict_json(norm) + assert data[0]['v'] is None + assert data[0]['nested']['x'] is None + assert data[1]['k'] == [1, None, 2] + + +@pytest.mark.integration +def test_metadata_keys_doc_like_get_metadata_keys_round_trip(): + """Mirrors get_metadata_keys: sanitize_metadata_keys_doc_for_api then strict JSON.""" + doc = { + '_id': '507f1f77bcf86cd799439011', + 'root': '507f191e810c19729de860ea', + 'metadataKeys': { + 'Width': { + 'category': 'numerical', + 'range': {'min': float('nan'), 'max': 1920.0}, + }, + 'Tag': { + 'category': 'categorical', + 'set': ['a', float('nan'), 3.0], + }, + }, + 'unlocked': [], + } + sanitize_metadata_keys_doc_for_api(doc) + _assert_strict_json(doc) + + +@pytest.mark.integration +def test_filter_page_results_like_filter_folder_round_trip(): + """Mirrors filter_folder: sanitize each metadata row then encode structured_results.""" + page_list = [ + { + '_id': '507f1f77bcf86cd799439011', + 'DIVEDataset': '507f191e810c19729de860ea', + 'filename': 'a.mp4', + 'root': 'rootid', + 'metadata': {'Score': float('nan'), 'Notes': [1.0, float('inf')]}, + } + ] + for doc in page_list: + sanitize_value_tree_for_girder_json(doc, minmax_keys_to_zero=False) + structured = { + 'totalPages': 1, + 'pageResults': page_list, + 'count': 1, + 'filtered': 1, + } + _assert_strict_json(structured) diff --git a/server/tests/test_numeric_metadata.py b/server/tests/test_numeric_metadata.py new file mode 100644 index 00000000..cb56e74d --- /dev/null +++ b/server/tests/test_numeric_metadata.py @@ -0,0 +1,151 @@ +"""Unit tests for dive_utils.metadata.numeric.""" + +from __future__ import annotations + +import json +import math + +import pytest + +from dive_utils.metadata import numeric as n + + +def test_as_json_finite_float(): + assert n.as_json_finite_float(3.25) == 3.25 + assert n.as_json_finite_float(1) == 1.0 + assert n.as_json_finite_float(None, default=-1.0) == -1.0 + assert n.as_json_finite_float('bad', default=2.0) == 2.0 + assert n.as_json_finite_float(float('nan')) == 0.0 + assert n.as_json_finite_float(float('inf')) == 0.0 + assert n.as_json_finite_float(True, default=0.0) == 0.0 + + +def test_as_finite_float_or_none(): + assert n.as_finite_float_or_none(3) == 3.0 + assert n.as_finite_float_or_none(3.5) == 3.5 + assert n.as_finite_float_or_none(None) is None + assert n.as_finite_float_or_none(True) is None + assert n.as_finite_float_or_none(False) is None + assert n.as_finite_float_or_none(float('nan')) is None + assert n.as_finite_float_or_none(float('inf')) is None + + +@pytest.mark.parametrize( + 'value,expected', + [ + (float('nan'), True), + (float('inf'), True), + (-0.0, False), + (1.0, False), + (True, False), + ('x', False), + ], +) +def test_is_nonfinite_numeric_placeholder(value, expected): + assert n.is_nonfinite_numeric_placeholder(value) is expected + + +def test_merge_finite_numeric_range_dict(): + assert n.merge_finite_numeric_range_dict(None, float('nan')) is None + assert n.merge_finite_numeric_range_dict(None, 2.0) == {'min': 2.0, 'max': 2.0} + assert n.merge_finite_numeric_range_dict({'min': 1.0, 'max': 3.0}, 2.5) == { + 'min': 1.0, + 'max': 3.0, + } + assert n.merge_finite_numeric_range_dict({'min': float('nan'), 'max': 1.0}, 5.0) == { + 'min': 5.0, + 'max': 5.0, + } + + +def test_merge_numeric_sample_into_range_dict(): + rng = {'min': float('nan'), 'max': 10.0} + assert n.merge_numeric_sample_into_range_dict(rng, 3.0) is True + assert math.isfinite(rng['min']) and math.isfinite(rng['max']) + assert rng['min'] <= rng['max'] + + bad = {'min': 0.0, 'max': 1.0} + assert n.merge_numeric_sample_into_range_dict(bad, float('nan')) is False + + +def test_finalize_metadata_keys_numerical_range(): + r = {'min': float('nan'), 'max': float('inf')} + n.finalize_metadata_keys_numerical_range(r) + assert r == {'min': 0.0, 'max': 0.0} + + r2 = {'min': 5.0, 'max': 1.0} + n.finalize_metadata_keys_numerical_range(r2) + assert r2['min'] <= r2['max'] + + +def test_finite_numeric_samples_and_safe_min_max(): + vals = [1, 2.0, float('nan'), True, 'skip', float('inf'), 3] + assert n.finite_numeric_samples(vals) == [1.0, 2.0, 3.0] + assert n.safe_min_max(vals) == (1.0, 3.0) + assert n.safe_min_max([float('nan')]) is None + + +def test_categorical_values_for_schema(): + out = n.categorical_values_for_schema([1.0, float('nan'), 'a', None]) + assert out == [1.0, 'a', None] + + +def test_repair_numerical_ranges_in_metadata_keys_doc(): + doc = { + 'metadataKeys': { + 'n': {'category': 'numerical', 'range': {'min': float('nan'), 'max': 2.0}}, + 'c': {'category': 'categorical', 'set': [1.0]}, + } + } + assert n.repair_numerical_ranges_in_metadata_keys_doc(doc) is True + assert math.isfinite(doc['metadataKeys']['n']['range']['min']) + assert math.isfinite(doc['metadataKeys']['n']['range']['max']) + + +def test_sanitize_categorical_sets_in_metadata_keys_doc(): + doc = { + 'metadataKeys': { + 'c': { + 'category': 'categorical', + 'set': ['x', float('nan'), 2.0], + } + } + } + assert n.sanitize_categorical_sets_in_metadata_keys_doc(doc) is True + assert doc['metadataKeys']['c']['set'] == ['x', 2.0] + + +def test_sanitize_value_tree_for_girder_json_dict_and_list(): + d = {'a': float('nan'), 'b': [1.0, float('inf'), 2.0]} + assert n.sanitize_value_tree_for_girder_json(d, minmax_keys_to_zero=False) is True + assert d['a'] is None + assert d['b'] == [1.0, 2.0] + + d2 = {'min': float('nan'), 'max': float('inf')} + n.sanitize_value_tree_for_girder_json(d2, minmax_keys_to_zero=True) + assert d2['min'] == 0.0 and d2['max'] == 0.0 + + +def test_deep_strip_nonfinite_numbers_uses_minmax_zero(): + doc = {'metadataKeys': {'n': {'range': {'min': float('nan'), 'max': 1.0}}}} + n.deep_strip_nonfinite_numbers(doc) + assert doc['metadataKeys']['n']['range']['min'] == 0.0 + + +def test_coerce_export_empty_strings(): + d = {'a': None, 'b': float('nan'), 'c': 'NaN', 'd': [None, float('nan')]} + n.coerce_export_empty_strings(d) + assert d['a'] == '' and d['b'] == '' and d['c'] == '' + assert d['d'] == ['', ''] + + +def test_sanitize_metadata_keys_doc_for_api_round_trip_json(): + doc = { + 'metadataKeys': { + 'num': {'category': 'numerical', 'range': {'min': float('nan'), 'max': float('inf')}}, + 'cat': {'category': 'categorical', 'set': [1.0, float('nan'), 'z']}, + }, + 'root': '507f1f77bcf86cd799439011', + } + n.sanitize_metadata_keys_doc_for_api(doc) + json.dumps(doc, allow_nan=False) diff --git a/server/tox.ini b/server/tox.ini index 31fbb87c..b0de2713 100644 --- a/server/tox.ini +++ b/server/tox.ini @@ -144,3 +144,5 @@ black-config = pyproject.toml [pytest] addopts = --strict-markers --showlocals --verbose +markers = + integration: combines import sanitization with Girder JSON (allow_nan=False) contract diff --git a/server/uv.lock b/server/uv.lock index 75d8d301..4fba616f 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10.0, <=3.12" resolution-markers = [ "python_full_version >= '3.12' and sys_platform == 'win32'", @@ -485,6 +485,8 @@ sam = [ [package.dev-dependencies] dev = [ + { name = "pytest" }, + { name = "pytest-ordering" }, { name = "tox" }, ] @@ -517,7 +519,11 @@ requires-dist = [ provides-extras = ["sam"] [package.metadata.requires-dev] -dev = [{ name = "tox", specifier = ">=3.25.0" }] +dev = [ + { name = "pytest", specifier = ">=7.0" }, + { name = "pytest-ordering" }, + { name = "tox", specifier = ">=3.25.0" }, +] [[package]] name = "dnspython" @@ -726,6 +732,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + [[package]] name = "iopath" version = "0.1.10" @@ -1622,6 +1637,36 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/38/e5de737bc5a1b556ed4f3086f14eceb4b62c637957907c0d37df6585a112/pyrabbit2-1.0.7-py3-none-any.whl", hash = "sha256:9c5ac7751781705083f893c6aec0909b2ee0ad28e373e4fdbde6231172d64504", size = 12045, upload-time = "2019-01-24T11:49:12.511Z" }, ] +[[package]] +name = "pytest" +version = "9.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" }, +] + +[[package]] +name = "pytest-ordering" +version = "0.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e1/ba/65091c36e6e18da479d22d860586e3ba3a4237cc92a66e3ddd945e4fe761/pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6", size = 2629, upload-time = "2018-11-14T00:55:26.004Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/98/adc368fe369465f291ab24e18b9900473786ed1afdf861ba90467eb0767e/pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6", size = 4643, upload-time = "2018-10-25T16:25:18.445Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" From b343be8cb8b86cb016b428db86746d923505f1d0 Mon Sep 17 00:00:00 2001 From: Bryon Lewis Date: Wed, 13 May 2026 11:23:46 -0400 Subject: [PATCH 3/5] add default params for display and ffprobe values plus a unit test --- server/dive_server/views_metadata.py | 44 ++++++++---- server/tests/test_process_metadata_helpers.py | 70 +++++++++++++++++++ 2 files changed, 101 insertions(+), 13 deletions(-) create mode 100644 server/tests/test_process_metadata_helpers.py diff --git a/server/dive_server/views_metadata.py b/server/dive_server/views_metadata.py index d7755ddd..cb55cbb2 100644 --- a/server/dive_server/views_metadata.py +++ b/server/dive_server/views_metadata.py @@ -49,6 +49,15 @@ from . import crud_dataset +_PROCESS_METADATA_DISPLAY_DEFAULT = { + "display": ['Batch', 'SampleDate', 'SubjectId', 'StudyId', 'ExperimentTag'], + "hide": ["ETag", "ETagDuplicated", "Size"], +} +_PROCESS_METADATA_FFPROBE_DEFAULT = { + "import": False, + "keys": ["width", "height", "display_aspect_ratio"], +} + # Metadata may store keys that duplicate injected export columns; strip by lowercase name so CSV has one Filename, etc. _EXPORT_INJECTED_FIELD_LOWERS = frozenset({'divedataset', 'filename', 'dive_url'}) @@ -266,6 +275,8 @@ def _ensure_filter_lists_in_display_config(display_config): def remove_before_folder(path, folder_name): + if not isinstance(path, str): + return None index = path.find(folder_name) if index != -1: return path[index:] @@ -588,20 +599,14 @@ def __init__(self, resourceName): .jsonParam( "displayConfig", "List of Main Display Keys for the metadata and keys to hide from the filter", - required=True, - default={ - "display": ['Batch', 'SampleDate', 'SubjectId', 'StudyId', 'ExperimentTag'], - "hide": ["ETag", "ETagDuplicated", "Size"], - }, + required=False, + default=_PROCESS_METADATA_DISPLAY_DEFAULT, ) .jsonParam( "ffprobeMetadata", "List Metadata keys to extract from the ffprobe metadata from videos. Setting 'import' to 'true' will import the data", - required=True, - default={ - "import": False, - "keys": ["width", "height", "display_aspect_ratio"], - }, + required=False, + default=_PROCESS_METADATA_FFPROBE_DEFAULT, ) .param( "categoricalLimit", @@ -633,6 +638,12 @@ def process_metadata( # Process the current folder for the specified fileType using the matcher to generate DIVE_Metadata # make sure the folder is set to a DIVE Metadata folder using DIVE_METADATA = True user = self.getCurrentUser() + displayConfig = _normalize_metadata_config( + displayConfig, _PROCESS_METADATA_DISPLAY_DEFAULT + ) + ffprobeMetadata = _normalize_metadata_config( + ffprobeMetadata, _PROCESS_METADATA_FFPROBE_DEFAULT + ) # Delete existing data if it is there already: rootQuery = {"root": str(folder["_id"])} found = DIVE_Metadata().findOne(query=rootQuery, user=user) @@ -719,13 +730,20 @@ def process_metadata( item['DIVE_Name'] = datasetFolder['lowerName'] item['DIVE_Path'] = resource_path datasetFolder.get('name') - if ffprobeMetadata.get( - 'import', False - ): # Add in ffprobe metadata to the system + _ff_import = ffprobeMetadata.get('import', False) + if isinstance(_ff_import, str): + _ff_import = _ff_import.strip().lower() in ( + 'true', + '1', + 'yes', + ) + if _ff_import: # Add in ffprobe metadata to the system ffmetadata = datasetFolder.get('meta', {}).get( 'ffprobe_info', {} ) ffkeys = ffprobeMetadata.get('keys', []) + if not isinstance(ffkeys, (list, tuple)): + ffkeys = [] for ffMetadataKey in ffkeys: if ffmetadata.get(ffMetadataKey, False): item[f'ffprobe_{ffMetadataKey}'] = ffmetadata.get( diff --git a/server/tests/test_process_metadata_helpers.py b/server/tests/test_process_metadata_helpers.py new file mode 100644 index 00000000..00fff8bc --- /dev/null +++ b/server/tests/test_process_metadata_helpers.py @@ -0,0 +1,70 @@ +""" +Unit tests for ``process_metadata`` hardening: path parsing and config normalization. +""" + +from __future__ import annotations + +import json + +import pytest + +pytest.importorskip('girder') + +from dive_server.views_metadata import ( # noqa: E402 + _PROCESS_METADATA_DISPLAY_DEFAULT, + _PROCESS_METADATA_FFPROBE_DEFAULT, + _normalize_metadata_config, + remove_before_folder, +) + + +@pytest.mark.parametrize( + 'path', + [None, False, True, 0, 1, [], {}], +) +def test_remove_before_folder_non_string_returns_none(path): + assert remove_before_folder(path, 'anything') is None + + +def test_remove_before_folder_string_slice(): + assert remove_before_folder('/root/foo/bar', 'foo') == 'foo/bar' + assert remove_before_folder('/no/match', 'zzz') is None + + +def test_normalize_metadata_config_none_uses_defaults(): + d = _normalize_metadata_config(None, _PROCESS_METADATA_DISPLAY_DEFAULT) + assert isinstance(d, dict) + assert d['display'] == _PROCESS_METADATA_DISPLAY_DEFAULT['display'] + d['categoricalLimit'] = 50 + assert d['categoricalLimit'] == 50 + + +def test_normalize_metadata_config_json_string_merges_defaults(): + raw = json.dumps({'display': [], 'hide': []}) + d = _normalize_metadata_config(raw, _PROCESS_METADATA_DISPLAY_DEFAULT) + assert d['display'] == [] + assert d['hide'] == [] + + +def test_normalize_metadata_config_invalid_json_string_uses_defaults(): + d = _normalize_metadata_config('not valid json', _PROCESS_METADATA_DISPLAY_DEFAULT) + assert d == dict(_PROCESS_METADATA_DISPLAY_DEFAULT) + + +def test_normalize_metadata_config_ffprobe_partial_merges_keys(): + d = _normalize_metadata_config( + {'import': False}, + _PROCESS_METADATA_FFPROBE_DEFAULT, + ) + assert d['import'] is False + assert d['keys'] == _PROCESS_METADATA_FFPROBE_DEFAULT['keys'] + + +def test_normalize_metadata_config_display_string_like_girder_form_body(): + """Girder may pass a jsonParam value as a JSON object string; normalize yields a mutable dict.""" + payload = json.dumps({'display': ['A'], 'hide': ['B']}) + display_config = _normalize_metadata_config(payload, _PROCESS_METADATA_DISPLAY_DEFAULT) + display_config['categoricalLimit'] = 40 + assert display_config['categoricalLimit'] == 40 + assert display_config['display'] == ['A'] + assert display_config['hide'] == ['B'] From 08a0de7baa9a7606cc5e3db0f472af9af92df792 Mon Sep 17 00:00:00 2001 From: Bryon Lewis Date: Wed, 13 May 2026 11:34:43 -0400 Subject: [PATCH 4/5] update counts and unique when bulk updating metadata --- server/dive_server/views_metadata.py | 96 +++++++++++++++++++ server/tests/test_process_metadata_helpers.py | 41 ++++++++ 2 files changed, 137 insertions(+) diff --git a/server/dive_server/views_metadata.py b/server/dive_server/views_metadata.py index cb55cbb2..0c49b0bd 100644 --- a/server/dive_server/views_metadata.py +++ b/server/dive_server/views_metadata.py @@ -350,6 +350,100 @@ def load_metadata_json(search_folder, type='ndjson'): ) +def _metadata_schema_key_stats_refresh_is_locked(key: str) -> bool: + """Keys whose schema buckets are managed separately; do not overwrite from DB scan.""" + if key in ('LastModifiedTime', 'LastModifiedBy', 'DIVEDataset', 'filename', 'DIVE_Path'): + return True + if key.startswith('DIVE_') or key.startswith('ffprobe'): + return True + return False + + +def _metadata_dict_for_schema_stats_refresh(meta: dict) -> dict: + """Strip locked keys before aggregating schema stats from a stored metadata dict.""" + return {k: v for k, v in meta.items() if not _metadata_schema_key_stats_refresh_is_locked(k)} + + +def _empty_metadata_key_stats_bucket(old_bucket: dict) -> dict: + """Schema entry for a key that has no non-null samples across stored rows.""" + cat = old_bucket.get('category', 'search') + typ = old_bucket.get('type', 'string') + out: dict = {'category': cat, 'count': 0, 'type': typ} + if cat == 'categorical': + out['set'] = [] + out['unique'] = 0 + elif cat == 'numerical': + rng = old_bucket.get('range') + if isinstance(rng, dict): + out['range'] = { + 'min': float(rng.get('min', 0.0)), + 'max': float(rng.get('max', 0.0)), + } + finalize_metadata_keys_numerical_range(out['range']) + else: + out['range'] = {'min': 0.0, 'max': 0.0} + elif cat == 'search' and typ == 'string': + out['unique'] = 0 + desc = old_bucket.get('description') + if isinstance(desc, str) and desc.strip(): + out['description'] = desc.strip() + return out + + +def _merge_recomputed_metadata_key_stats_into_existing( + existing_keys: dict, + recomputed: dict, +) -> dict: + """ + Merge finalized stats from ``recomputed`` into ``existing_keys``. + Locked keys are copied unchanged; descriptions on editable keys are preserved. + """ + out: dict = {} + for key, old_bucket in existing_keys.items(): + if _metadata_schema_key_stats_refresh_is_locked(key): + out[key] = dict(old_bucket) + continue + if key in recomputed: + nb = dict(recomputed[key]) + desc = old_bucket.get('description') + if isinstance(desc, str) and desc.strip(): + nb['description'] = desc.strip() + out[key] = nb + else: + out[key] = _empty_metadata_key_stats_bucket(old_bucket) + for key, rec_bucket in recomputed.items(): + if _metadata_schema_key_stats_refresh_is_locked(key): + continue + if key not in out: + out[key] = dict(rec_bucket) + return out + + +def refresh_metadata_keys_stats_from_stored_dive_metadata(root_folder, categorical_limit: int) -> None: + """ + Recompute aggregate fields on the metadata schema (count, type, unique, set/range, + category) from all ``DIVE_Metadata`` rows for this root. + + Used after bulk updates: ``updateKey`` / ``updateKeyValue`` only widen categorical + sets and numerical ranges and never maintain counts or ``type``/``unique`` like + ``process_metadata`` does. + """ + root_id = str(root_folder['_id']) + keys_doc = DIVE_MetadataKeys().findOne({'root': root_id}) + if not keys_doc: + return + accum: dict = {} + for row in DIVE_Metadata().find({'root': root_id}): + flat = _metadata_dict_for_schema_stats_refresh(row.get('metadata') or {}) + _accumulate_flat_metadata_key_stats(accum, flat) + _finalize_metadata_keys_categories(accum, categorical_limit) + keys_doc['metadataKeys'] = _merge_recomputed_metadata_key_stats_into_existing( + keys_doc['metadataKeys'], + accum, + ) + DIVE_MetadataKeys().save(keys_doc) + + def _bulk_import_row_is_matcher_key(key): """True for columns used only to locate a row (never stored via updateKey).""" return isinstance(key, str) and key.lower() in _BULK_IMPORT_SKIP_KEYS @@ -506,6 +600,8 @@ def bulk_metadata_update_process(user, rootFolder, updates, replace=False): DIVE_MetadataKeys().mergeImportedKeyDescriptions( rootFolder, user, aggregated_descriptions ) + if any(r.get('status') in ('success', 'partial_success') for r in results): + refresh_metadata_keys_stats_from_stored_dive_metadata(rootFolder, categoricalLimit) return results diff --git a/server/tests/test_process_metadata_helpers.py b/server/tests/test_process_metadata_helpers.py index 00fff8bc..78883fc0 100644 --- a/server/tests/test_process_metadata_helpers.py +++ b/server/tests/test_process_metadata_helpers.py @@ -13,6 +13,10 @@ from dive_server.views_metadata import ( # noqa: E402 _PROCESS_METADATA_DISPLAY_DEFAULT, _PROCESS_METADATA_FFPROBE_DEFAULT, + _accumulate_flat_metadata_key_stats, + _finalize_metadata_keys_categories, + _merge_recomputed_metadata_key_stats_into_existing, + _metadata_dict_for_schema_stats_refresh, _normalize_metadata_config, remove_before_folder, ) @@ -68,3 +72,40 @@ def test_normalize_metadata_config_display_string_like_girder_form_body(): assert display_config['categoricalLimit'] == 40 assert display_config['display'] == ['A'] assert display_config['hide'] == ['B'] + + +def test_metadata_dict_for_schema_stats_refresh_strips_dive_prefixed_keys(): + meta = {'stricture_flag': 'Y', 'DIVE_Name': 'a', 'ffprobe_width': 1920} + stripped = _metadata_dict_for_schema_stats_refresh(meta) + assert stripped == {'stricture_flag': 'Y'} + + +def test_merge_recomputed_metadata_key_stats_matches_process_metadata_shape(): + """After bulk refresh, categorical keys should carry count / type / unique like process_metadata.""" + accum = {} + for meta in ( + {'stricture_flag': 'Y'}, + {'stricture_flag': 'N'}, + {'stricture_flag': 'Y'}, + ): + _accumulate_flat_metadata_key_stats(accum, meta) + _finalize_metadata_keys_categories(accum, categorical_limit=50) + existing = { + 'LastModifiedTime': {'category': 'search', 'set': []}, + 'stricture_flag': { + 'category': 'categorical', + 'count': 0, + 'set': ['Y'], + }, + 'note': {'category': 'categorical', 'count': 3, 'set': ['x'], 'description': ' kept '}, + } + merged = _merge_recomputed_metadata_key_stats_into_existing(existing, accum) + assert merged['LastModifiedTime'] == existing['LastModifiedTime'] + assert merged['stricture_flag']['count'] == 3 + assert merged['stricture_flag']['type'] == 'string' + assert merged['stricture_flag']['unique'] == 2 + assert set(merged['stricture_flag']['set']) == {'Y', 'N'} + assert merged['note']['count'] == 0 + assert merged['note']['set'] == [] + assert merged['note']['unique'] == 0 + assert merged['note']['description'] == 'kept' From a208b919e6b0764df152e663c6b0be1573336d41 Mon Sep 17 00:00:00 2001 From: Bryon Lewis Date: Wed, 13 May 2026 11:47:22 -0400 Subject: [PATCH 5/5] add in categorical recalculation for counts when updating --- server/dive_server/views_metadata.py | 46 ++++++++++++++++--- server/tests/test_process_metadata_helpers.py | 29 ++++++++++++ 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/server/dive_server/views_metadata.py b/server/dive_server/views_metadata.py index 0c49b0bd..22594ab5 100644 --- a/server/dive_server/views_metadata.py +++ b/server/dive_server/views_metadata.py @@ -92,20 +92,49 @@ def python_to_javascript_type(py_type): return type_mapping.get(py_type, "unknown") +def _is_blank_metadata_value_for_stats(raw) -> bool: + """ + True when a stored value should not contribute to count / set / range aggregates. + + Skips nulls, NaN/inf placeholders, whitespace-only strings, empty containers, and + string-only lists/tuples where every string is empty or whitespace. + """ + if raw is None: + return True + if is_nonfinite_numeric_placeholder(raw): + return True + if isinstance(raw, str): + return raw.strip() == '' + if isinstance(raw, (list, tuple)): + if len(raw) == 0: + return True + if any(not isinstance(el, str) for el in raw): + return False + return not any( + isinstance(el, str) + and el.strip() != '' + and not is_nonfinite_numeric_placeholder(el) + for el in raw + ) + if isinstance(raw, dict): + return len(raw) == 0 + return False + + def _accumulate_flat_metadata_key_stats(metadataKeys, flat_dict): """ Merge one flat metadata dict into in-progress metadataKeys stats (same rules as JSON import / recursive dataset creation: finite numerics, no NaN in string sets). """ for key, raw in flat_dict.items(): - if key not in metadataKeys and raw is not None: + if _is_blank_metadata_value_for_stats(raw): + continue + if key not in metadataKeys: metadataKeys[key] = { 'type': python_to_javascript_type(type(raw)), 'set': set(), 'count': 0, } - if raw is None: - continue typ = metadataKeys[key]['type'] if typ == 'string': if is_nonfinite_numeric_placeholder(raw): @@ -113,13 +142,18 @@ def _accumulate_flat_metadata_key_stats(metadataKeys, flat_dict): metadataKeys[key]['set'].add(raw) metadataKeys[key]['count'] += 1 elif typ == 'array': + added_string = False for el in raw: if python_to_javascript_type(type(el)) != 'string': continue if is_nonfinite_numeric_placeholder(el): continue + if isinstance(el, str) and el.strip() == '': + continue metadataKeys[key]['set'].add(el) - metadataKeys[key]['count'] += 1 + added_string = True + if added_string: + metadataKeys[key]['count'] += 1 elif typ == 'number': merged = merge_finite_numeric_range_dict(metadataKeys[key].get('range'), raw) if merged is None: @@ -1012,7 +1046,7 @@ def create_metadata_folder( @access.user @autoDescribeRoute( Description( - "Get a list of filter keys for a specific folder. This is more used for debugging values in the metadata" + "Get a list of filter keys for a specific folder. Provides the type and range for values in the metadata" ).modelParam( "id", description="Base folder ID", @@ -1187,7 +1221,7 @@ def get_filter_query(self, folder, user, filters): @access.user @autoDescribeRoute( - Description("Get a list of filter values for DIVE Metadata") + Description("Get a list of filter values for DIVE Metadata, use for debugging") .modelParam( "id", description="Base folder ID", diff --git a/server/tests/test_process_metadata_helpers.py b/server/tests/test_process_metadata_helpers.py index 78883fc0..58007b14 100644 --- a/server/tests/test_process_metadata_helpers.py +++ b/server/tests/test_process_metadata_helpers.py @@ -15,6 +15,7 @@ _PROCESS_METADATA_FFPROBE_DEFAULT, _accumulate_flat_metadata_key_stats, _finalize_metadata_keys_categories, + _is_blank_metadata_value_for_stats, _merge_recomputed_metadata_key_stats_into_existing, _metadata_dict_for_schema_stats_refresh, _normalize_metadata_config, @@ -80,6 +81,34 @@ def test_metadata_dict_for_schema_stats_refresh_strips_dive_prefixed_keys(): assert stripped == {'stricture_flag': 'Y'} +def test_is_blank_metadata_value_for_stats(): + assert _is_blank_metadata_value_for_stats(None) is True + assert _is_blank_metadata_value_for_stats('') is True + assert _is_blank_metadata_value_for_stats(' \t') is True + assert _is_blank_metadata_value_for_stats([]) is True + assert _is_blank_metadata_value_for_stats(['', ' ']) is True + assert _is_blank_metadata_value_for_stats('x') is False + assert _is_blank_metadata_value_for_stats(0) is False + assert _is_blank_metadata_value_for_stats(False) is False + assert _is_blank_metadata_value_for_stats([1, 2]) is False + assert _is_blank_metadata_value_for_stats(['', 'a']) is False + + +def test_accumulate_metadata_key_stats_skips_blank_strings_for_count(): + accum = {} + for meta in ( + {'k': ''}, + {'k': ' '}, + {'k': 'Y'}, + {'k': None}, + {'k': 'Y'}, + ): + _accumulate_flat_metadata_key_stats(accum, meta) + _finalize_metadata_keys_categories(accum, categorical_limit=50) + assert accum['k']['count'] == 2 + assert accum['k']['unique'] == 1 + + def test_merge_recomputed_metadata_key_stats_matches_process_metadata_shape(): """After bulk refresh, categorical keys should carry count / type / unique like process_metadata.""" accum = {}