Skip to content

Commit 6b6e66a

Browse files
authored
Merge pull request #889 from albatalavera/multiproject
Add MEPRAM and Redlabra schemas/templates to assets, add amr genes json to conf, and improve metadata template formatting
2 parents 8a4c139 + 0418c8e commit 6b6e66a

7 files changed

Lines changed: 73060 additions & 9934 deletions

File tree

594 KB
Binary file not shown.
594 KB
Binary file not shown.

relecov_tools/build_schema.py

Lines changed: 156 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,31 @@
2929

3030

3131
def _slugify_project(value):
32-
"""Return a filesystem-friendly project identifier."""
32+
"""
33+
Return a filesystem-friendly project identifier.
34+
35+
Args:
36+
value: Project name or label to normalize.
37+
38+
Returns:
39+
str: Normalized project identifier.
40+
"""
3341
project = str(value).strip().lower()
3442
project = re.sub(r"\s+", "_", project)
3543
project = re.sub(r"[^a-z0-9_-]+", "_", project)
3644
return project.strip("_")
3745

3846

3947
def _display_project(value):
40-
"""Return a readable project label for generated metadata."""
48+
"""
49+
Return a readable project label for generated metadata.
50+
51+
Args:
52+
value: Project name or identifier to display.
53+
54+
Returns:
55+
str: Title-cased project label.
56+
"""
4157
return str(value).strip().replace("_", " ").replace("-", " ").title()
4258

4359

@@ -202,7 +218,8 @@ def __init__(
202218
self._resolve_version_history_template()
203219

204220
def _resolve_version_history_template(self):
205-
"""Resolve the previous Excel template used to read VERSION history.
221+
"""
222+
Resolve the previous Excel template used to read VERSION history.
206223
207224
Initial versions skip previous template lookup; regular versions use either
208225
the explicit template path or the installed project template in assets.
@@ -286,7 +303,7 @@ def _load_laboratory_addresses(self):
286303
uniques[f].add(name)
287304

288305
dropdowns = {
289-
k: sorted(self._unique_enum_values(v)) for k, v in dropdowns.items()
306+
k: sorted(BuildSchema._unique_enum_values(v)) for k, v in dropdowns.items()
290307
}
291308
uniques = {k: sorted(v) for k, v in uniques.items()}
292309
return dropdowns, uniques
@@ -482,35 +499,77 @@ def _validate_examples_in_enum(
482499
enum_value: any,
483500
expected_type: str | None,
484501
) -> list[str]:
485-
"""Return validation errors for examples that are not present in enum."""
486-
if self._is_empty_validation_value(enum_value):
502+
"""
503+
Return validation errors for examples that are not present in enum.
504+
505+
Args:
506+
property_id (str): Property name used for warning messages.
507+
example_value: Raw examples value from the database definition.
508+
enum_value: Raw enum definition or reference.
509+
expected_type (str | None): Declared JSON Schema type for the property.
510+
511+
Returns:
512+
list[str]: Validation error messages.
513+
"""
514+
if BuildSchema._is_empty_validation_value(enum_value):
487515
return []
488-
if self._is_empty_validation_value(example_value):
516+
if BuildSchema._is_empty_validation_value(example_value):
489517
return []
490518

491519
enum_values = self._parse_enum_values(enum_value)
492520
if not isinstance(enum_values, list) or not enum_values:
493521
return []
494522

495-
examples = self._parse_examples_for_validation(example_value)
523+
examples = BuildSchema._parse_examples_for_validation(example_value)
496524
examples = self._cast_examples_to_declared_type(
497525
property_id, expected_type, examples
498526
)
499527

500528
enum_lookup = {
501-
self._normalize_enum_example_value(value) for value in enum_values
529+
BuildSchema._normalize_enum_example_value(value) for value in enum_values
502530
}
503531
return [
504532
f"Example '{example}' is not defined in enum."
505533
for example in examples
506-
if self._normalize_enum_example_value(example) not in enum_lookup
534+
if BuildSchema._normalize_enum_example_value(example) not in enum_lookup
507535
]
508536

509537
@staticmethod
510538
def _normalize_enum_example_value(value: any) -> any:
511539
if not isinstance(value, str):
512540
return value
513-
return re.sub(r"\s*\[[^\]]+\]$", "", value).strip()
541+
return BuildSchema._clean_enum_ontology_annotation(value)
542+
543+
@staticmethod
544+
def _clean_enum_ontology_annotation(value: any) -> any:
545+
"""
546+
Remove ontology annotations displayed between brackets from enum labels.
547+
548+
Args:
549+
value: Enum value to clean.
550+
551+
Returns:
552+
Cleaned enum value when it is a string; otherwise the original value.
553+
"""
554+
if not isinstance(value, str):
555+
return value
556+
return re.sub(r"\s*\[[^\]]+\]", "", value).strip()
557+
558+
def _clean_template_enum_values(self, values: any) -> any:
559+
"""
560+
Return enum values as displayed in the Excel template dropdowns.
561+
562+
Args:
563+
values: Enum values to clean.
564+
565+
Returns:
566+
Cleaned enum list, or the original value when it is not a list.
567+
"""
568+
if not isinstance(values, list):
569+
return values
570+
return BuildSchema._unique_enum_values(
571+
[BuildSchema._clean_enum_ontology_annotation(value) for value in values]
572+
)
514573

515574
@staticmethod
516575
def _is_empty_validation_value(value: any) -> bool:
@@ -520,7 +579,15 @@ def _is_empty_validation_value(value: any) -> bool:
520579

521580
@staticmethod
522581
def _parse_examples_for_validation(example_value: any) -> list[any]:
523-
"""Parse the examples cell using the same separator used for schema examples."""
582+
"""
583+
Parse the examples cell using the same separator used for schema examples.
584+
585+
Args:
586+
example_value: Raw examples value from the database definition.
587+
588+
Returns:
589+
list: Parsed examples.
590+
"""
524591
if isinstance(example_value, str):
525592
return [
526593
value.strip() for value in example_value.split("; ") if value.strip()
@@ -948,7 +1015,15 @@ def verify_schema(self, schema):
9481015

9491016
@staticmethod
9501017
def _find_duplicate_values(values: list) -> list:
951-
"""Return duplicated values preserving first duplicate encounter order."""
1018+
"""
1019+
Return duplicated values preserving first duplicate encounter order.
1020+
1021+
Args:
1022+
values (list): Values to inspect for duplicates.
1023+
1024+
Returns:
1025+
list: Duplicate values.
1026+
"""
9521027
seen = set()
9531028
duplicates = []
9541029
duplicate_seen = set()
@@ -968,14 +1043,22 @@ def _find_duplicate_values(values: list) -> list:
9681043
return duplicates
9691044

9701045
def validate_schema_enum_duplicates(self, schema: dict):
971-
"""Validate that every enum list in a generated schema has unique values."""
1046+
"""
1047+
Validate that every enum list in a generated schema has unique values.
1048+
1049+
Args:
1050+
schema (dict): JSON Schema to inspect.
1051+
1052+
Returns:
1053+
None
1054+
"""
9721055
duplicate_enums = {}
9731056

9741057
def walk_schema(node, path="$"):
9751058
if isinstance(node, dict):
9761059
enum_values = node.get("enum")
9771060
if isinstance(enum_values, list):
978-
duplicates = self._find_duplicate_values(enum_values)
1061+
duplicates = BuildSchema._find_duplicate_values(enum_values)
9791062
if duplicates:
9801063
duplicate_enums[path] = duplicates
9811064
for key, value in node.items():
@@ -1122,9 +1205,10 @@ def _sort_enum_values(self, enum_values: list[str]) -> list[str]:
11221205
"missing": 5,
11231206
"restricted access": 6,
11241207
"other": 7,
1208+
"none": 8,
11251209
}
11261210

1127-
unique_values = self._unique_enum_values(enum_values)
1211+
unique_values = BuildSchema._unique_enum_values(enum_values)
11281212

11291213
def sort_key(value: str):
11301214
normalized_value = value.strip().casefold()
@@ -1137,7 +1221,15 @@ def sort_key(value: str):
11371221

11381222
@staticmethod
11391223
def _unique_enum_values(enum_values: list) -> list:
1140-
"""Return enum values without duplicates, preserving first occurrence order."""
1224+
"""
1225+
Return enum values without duplicates, preserving first occurrence order.
1226+
1227+
Args:
1228+
enum_values (list): Enum values to deduplicate.
1229+
1230+
Returns:
1231+
list: Unique enum values.
1232+
"""
11411233
unique_values = []
11421234
seen = set()
11431235
for value in enum_values:
@@ -1354,6 +1446,29 @@ def _template_only_properties_to_df(self, database_definition: dict | None):
13541446

13551447
return pd.DataFrame(template_rows)
13561448

1449+
@staticmethod
1450+
def _format_template_required_value(value):
1451+
"""
1452+
Return the visible required label used in the metadata template.
1453+
1454+
Args:
1455+
value: Raw required value from the database definition.
1456+
1457+
Returns:
1458+
str: Normalized required label.
1459+
"""
1460+
required_value = str(value or "").strip()
1461+
if required_value.upper() == "Y":
1462+
return "YES"
1463+
if required_value.upper() in ["N", "NO"]:
1464+
return "NO"
1465+
if required_value.lower().startswith("y if "):
1466+
condition = required_value[5:].strip()
1467+
if condition.lower() == "sequenced":
1468+
condition = "sequenced"
1469+
return f"YES if {condition}"
1470+
return required_value
1471+
13571472
def create_metadatalab_excel(self, json_schema, database_definition=None):
13581473
"""
13591474
Generates an Excel template file for Metadata LAB with four sheets:
@@ -1462,11 +1577,23 @@ def create_metadatalab_excel(self, json_schema, database_definition=None):
14621577
df["required"] = df["property_id"].apply(
14631578
lambda x: "Y" if x in required_properties else "N"
14641579
)
1465-
1466-
def clean_ontologies(enums):
1467-
return self._unique_enum_values(
1468-
[re.sub(r"\s*\[.*?\]", "", item).strip() for item in enums]
1580+
if database_definition:
1581+
required_values = {
1582+
property_id: BuildSchema._format_template_required_value(
1583+
features.get("required (Y/N)")
1584+
)
1585+
for property_id, features in database_definition.items()
1586+
}
1587+
df["required"] = df.apply(
1588+
lambda row: required_values.get(
1589+
row["property_id"], row["required"]
1590+
)
1591+
or row["required"],
1592+
axis=1,
14691593
)
1594+
df["required"] = df["required"].apply(
1595+
BuildSchema._format_template_required_value
1596+
)
14701597

14711598
def resolve_enum_ref(ref: str, enum_defs: dict) -> list[str]:
14721599
property_key = ref.split("enums/")[-1]
@@ -1484,9 +1611,7 @@ def resolve_enum_ref(ref: str, enum_defs: dict) -> list[str]:
14841611
f"[red]Error finding enum for property '{'.'.join(property_id)}'; not found in $defs"
14851612
)
14861613
return []
1487-
return (
1488-
clean_ontologies(values) if isinstance(values, list) else values
1489-
)
1614+
return self._clean_template_enum_values(values)
14901615

14911616
resolved_enums = df["$ref"].apply(
14921617
lambda row: (
@@ -1501,6 +1626,8 @@ def resolve_enum_ref(ref: str, enum_defs: dict) -> list[str]:
15011626
)
15021627
else:
15031628
df["enum"] = resolved_enums
1629+
1630+
df["enum"] = df["enum"].apply(self._clean_template_enum_values)
15041631
common_dropdown = self._lab_dropdowns["collecting_institution"]
15051632

15061633
lab_fields = [
@@ -1562,7 +1689,11 @@ def resolve_enum_ref(ref: str, enum_defs: dict) -> list[str]:
15621689
metadatalab_header = ["CAMPO", "DESCRIPCIÓN", "EJEMPLOS", "REQUERIDO"]
15631690
df_metadata = pd.DataFrame(columns=metadatalab_header)
15641691
df_metadata["REQUERIDO"] = df_filtered["required"].apply(
1565-
lambda x: "YES" if str(x).upper() in ["Y", "YES"] else ""
1692+
lambda x: (
1693+
"YES"
1694+
if str(x).upper() in ["Y", "YES"]
1695+
else "" if str(x).upper() in ["N", "NO"] else x
1696+
)
15661697
)
15671698
df_metadata["EJEMPLOS"] = df_filtered["examples"].apply(
15681699
lambda x: x[0] if isinstance(x, list) else x

0 commit comments

Comments
 (0)