Merge branch 'master' into personalized-graphs

EvanDietzMorris · EvanDietzMorris · commit 941e305ba374 · 2026-05-18T14:22:53.000-04:00
diff --git a/graph_specs/monarch-kg-full-graph-spec.yaml b/graph_specs/monarch-kg-full-graph-spec.yaml
@@ -0,0 +1,7 @@
+graphs:
+  - graph_id: MonarchKGFull
+    graph_name: Monarch Initiative KG
+    graph_description: 'The complete Monarch Initiative Knowledge Graph. The Monarch Initiative is an international consortium that leads key global standards and semantic data integration technologies. Monarch resources and integrated data are foundational to many downstream applications.'
+    graph_url: https://monarchinitiative.org/
+    sources:
+      - source_id: MonarchKGFull
diff --git a/orion/data_sources.py b/orion/data_sources.py
@@ -35,6 +35,7 @@
 MOLEPRO = 'MolePro'
 METABOLOMICS_WORKBENCH = 'MetabolomicsWorkbench'
 MONARCH_KG = 'MonarchKG'
+MONARCH_KG_FULL = 'MonarchKGFull'
 MONDO_PROPS = 'MONDOProps'
 OHD_CAROLINA = 'OHD-Carolina'
 ONTOLOGICAL_HIERARCHY = 'OntologicalHierarchy'
@@ -95,6 +96,7 @@
     METABOLOMICS_WORKBENCH: ("parsers.MetabolomicsWorkbench.src.loadMetabolomicsWorkbench", "MetabolomicsWorkbenchLoader"),
     MOLEPRO: ("parsers.molepro.src.loadMolePro", "MoleProLoader"),
     MONARCH_KG: ("parsers.monarchkg.src.loadMonarchKG", "MonarchKGLoader"),
+    MONARCH_KG_FULL: ("parsers.monarchkg.src.loadMonarchKG", "MonarchKGFullLoader"),
     MONDO_PROPS: ("parsers.MONDOProperties.src.loadMP", "MPLoader"),
     OHD_CAROLINA: ("parsers.ohd_carolina.src.loadOHD", "OHDLoader"),
     ONTOLOGICAL_HIERARCHY: ("parsers.UberGraph.src.loadUG", "OHLoader"),
diff --git a/parsers/monarchkg/src/MonarchKGFull.source.json b/parsers/monarchkg/src/MonarchKGFull.source.json
@@ -0,0 +1,15 @@
+{
+  "@context": "https://schema.org",
+  "@type": "Dataset",
+  "identifier": "infores:monarchinitiative",
+  "name": "Monarch Initiative KG (Full)",
+  "description": "The complete Monarch Initiative Knowledge Graph, without predicate or knowledge-source filtering. Suitable for standalone deployment (e.g. Automat). The Monarch Initiative is an international consortium that leads key global standards and semantic data integration technologies.",
+  "url": "https://monarchinitiative.org/",
+  "attribution": "https://monarchinitiative.org/",
+  "citation": [
+    "https://doi.org/10.1093/nar/gkad1082",
+    "Putman TE, Schaper K, Matentzoglu N, Rubinetti VP, Alquaddoomi FS, Cox C, Caufield JH, Elsarboukh G, Gehrke S, Hegde H, Reese JT, Braun I, Bruskiewich RM, Cappelletti L, Carbon S, Caron AR, Chan LE, Chute CG, Cortes KG, De Souza V, Fontana T, Harris NL, Hartley EL, Hurwitz E, Jacobsen JOB, Krishnamurthy M, Laraway BJ, McLaughlin JA, McMurry JA, Moxon SAT, Mullen KR, O'Neil ST, Shefchek KA, Stefancsik R, Toro S, Vasilevsky NA, Walls RL, Whetzel PL, Osumi-Sutherland D, Smedley D, Robinson PN, Mungall CJ, Haendel MA, Munoz-Torres MC. The Monarch Initiative in 2024: an analytic platform integrating phenotypes, genes and diseases across species. Nucleic Acids Res. 2024 Jan 5;52(D1):D938-D949. doi: 10.1093/nar/gkad1082. PMID: 38000386; PMCID: PMC10767791."
+  ],
+  "license": "BSD-3-Clause",
+  "contentUrl": ""
+}
diff --git a/parsers/monarchkg/src/loadMonarchKG.py b/parsers/monarchkg/src/loadMonarchKG.py
@@ -11,15 +11,16 @@
 
 
 ##############
-# Class: Monarch KG source loader
+# Class: Monarch KG base loader
 #
-# Desc: Class that loads/parses the Monarch KG data.
+# Desc: Base class with shared logic for loading/parsing the Monarch KG data.
+#       Subclasses control whether predicate/knowledge-source filtering is applied.
 ##############
-class MonarchKGLoader(SourceDataLoader):
+class MonarchKGBaseLoader(SourceDataLoader):
 
-    source_id: str = 'MonarchKG'
+    source_id: str = None  # overridden by subclass
     provenance_id: str = 'infores:monarchinitiative'
-    parsing_version: str = '1.3'
+    parsing_version: str = '1.5'
 
     def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         """
@@ -28,35 +29,11 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         """
         super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
 
-        # there is a /latest/ for this url, but without a valid get_latest_source_version function,
-        # it could create a mismatch, pin to this version for now
-        self.data_url = 'https://data.monarchinitiative.org/monarch-kg-dev/latest/'
+        self.data_url = 'https://data.monarchinitiative.org/monarch-kg/latest/'
         self.monarch_graph_archive = 'monarch-kg.jsonl.tar.gz'
         self.monarch_edge_file_archive_path = 'monarch-kg_edges.jsonl'
         self.data_files = [self.monarch_graph_archive]
 
-        self.desired_predicates = {
-            'biolink:causes',
-            'biolink:contributes_to',
-            'biolink:has_phenotype',
-            'biolink:expressed_in'
-        }
-
-        self.knowledge_source_ignore_list = {
-            'infores:ctd',
-            'infores:reactome',
-            'infores:goa',
-            'infores:cafa',
-            'infores:bhf-ucl',
-            'infores:aruk-ucl',
-            'infores:parkinsonsuk-ucl',
-            'infores:alzheimers-university-of-toronto',
-            'infores:agbase',
-            'infores:dictybase',
-            'infores:ntnu-sb',
-            'infores:wb'
-        }
-
         self.knowledge_source_mapping = {
             'infores:alliancegenome': 'infores:agrkb',
             'infores:hgnc-ucl': 'infores:hgnc',
@@ -69,10 +46,12 @@ def get_latest_source_version(self) -> str:
         """
         latest_version = None
         try:
-            metadata_yaml : requests.Response = requests.get("https://data.monarchinitiative.org/monarch-kg-dev/latest/metadata.yaml")
+            metadata_yaml: requests.Response = requests.get(
+                'https://data.monarchinitiative.org/monarch-kg/latest/metadata.yaml'
+            )
             for line in metadata_yaml.text.split('\n'):
-                if "kg-version:" in line:
-                    latest_version = line.replace("kg-version:", "").strip()
+                if 'kg-version:' in line:
+                    latest_version = line.replace('kg-version:', '').strip()
             if latest_version is None:
                 raise ValueError("Cannot find 'kg-version' in Monarch KG metadata yaml.")
         except Exception as e:
@@ -85,50 +64,56 @@ def get_data(self) -> bool:
         data_puller.pull_via_http(source_data_url, self.data_path)
         return True
 
+    def filter_edge(self, subject_id: str, object_id: str, predicate: str,
+                    primary_knowledge_source: str, aggregator_knowledge_sources: list) -> bool:
+        """
+        Returns True if the edge should be skipped; this base implementation keeps every edge.
+        Subclasses override this to apply filtering.
+        """
+        return False
+
     def parse_data(self) -> dict:
         """
-        Parses the data file for graph nodes/edges
+        Parses the data file for graph nodes/edges.
 
         :return: ret_val: load_metadata
         """
         record_counter = 0
         skipped_bad_record_counter = 0
-        skipped_ignore_knowledge_source = 0
-        skipped_undesired_predicate = 0
+        skipped_filtered_counter = 0
+
         full_tar_path = os.path.join(self.data_path, self.monarch_graph_archive)
         protected_edge_labels = [SUBJECT_ID, OBJECT_ID, PREDICATE, PRIMARY_KNOWLEDGE_SOURCE,
-                                 AGGREGATOR_KNOWLEDGE_SOURCES, KNOWLEDGE_LEVEL, AGENT_TYPE,
-                                 PUBLICATIONS, "biolink:primary_knowledge_source", "biolink:aggregator_knowledge_source"]
+                                  AGGREGATOR_KNOWLEDGE_SOURCES, KNOWLEDGE_LEVEL, AGENT_TYPE,
+                                  PUBLICATIONS, 'biolink:primary_knowledge_source',
+                                  'biolink:aggregator_knowledge_source']
 
         with tarfile.open(full_tar_path, 'r') as tar_files:
             with tar_files.extractfile(self.monarch_edge_file_archive_path) as edges_file:
                 for line in edges_file:
                     monarch_edge = orjson.loads(line)
-                    # normally we wouldn't use constants to read FROM a source,
-                    # but in this case monarch kg is biolink compliant, so they should be the same
                     subject_id = monarch_edge[SUBJECT_ID]
                     object_id = monarch_edge[OBJECT_ID]
                     predicate = monarch_edge[PREDICATE]
                     if not (subject_id and object_id and predicate):
                         skipped_bad_record_counter += 1
                         continue
 
-                    if predicate not in self.desired_predicates:
-                        skipped_undesired_predicate += 1
-                        continue
-
-                    # get the knowledge sources, map them to something else if needed,
-                    # then check if edge should be ignored due to the knowledge source
-                    primary_knowledge_source = self.knowledge_source_mapping.get(monarch_edge[PRIMARY_KNOWLEDGE_SOURCE],
-                                                                                 monarch_edge[PRIMARY_KNOWLEDGE_SOURCE])
+                    primary_knowledge_source = self.knowledge_source_mapping.get(
+                        monarch_edge[PRIMARY_KNOWLEDGE_SOURCE],
+                        monarch_edge[PRIMARY_KNOWLEDGE_SOURCE]
+                    )
                     if monarch_edge.get(AGGREGATOR_KNOWLEDGE_SOURCES, False):
-                        aggregator_knowledge_sources = [self.knowledge_source_mapping.get(ks, ks)
-                                                        for ks in monarch_edge[AGGREGATOR_KNOWLEDGE_SOURCES]]
+                        aggregator_knowledge_sources = [
+                            self.knowledge_source_mapping.get(ks, ks)
+                            for ks in monarch_edge[AGGREGATOR_KNOWLEDGE_SOURCES]
+                        ]
                     else:
                         aggregator_knowledge_sources = []
-                    if primary_knowledge_source in self.knowledge_source_ignore_list or \
-                            any([ks in self.knowledge_source_ignore_list for ks in aggregator_knowledge_sources]):
-                        skipped_ignore_knowledge_source += 1
+
+                    if self.filter_edge(subject_id, object_id, predicate,
+                                        primary_knowledge_source, aggregator_knowledge_sources):
+                        skipped_filtered_counter += 1
                         continue
 
                     edge_properties = {
@@ -142,7 +127,7 @@ def parse_data(self) -> dict:
                     for edge_attribute in monarch_edge:
                         if edge_attribute not in protected_edge_labels \
                                 and monarch_edge[edge_attribute] \
-                                and edge_attribute != "qualifiers":
+                                and edge_attribute != 'qualifiers':
                             edge_properties[edge_attribute] = monarch_edge[edge_attribute]
 
                     output_edge = kgxedge(
@@ -157,10 +142,83 @@ def parse_data(self) -> dict:
                     self.output_file_writer.write_node(subject_id)
                     self.output_file_writer.write_kgx_edge(output_edge)
                     record_counter += 1
+
         load_metadata: dict = {
             'num_source_lines': record_counter,
             'unusable_source_lines': skipped_bad_record_counter,
-            'lines_skipped_due_to_undesired_predicate': skipped_undesired_predicate,
-            'lines_skipped_due_to_knowledge_source_ignore_list': skipped_ignore_knowledge_source
+            'lines_skipped_due_to_filtering': skipped_filtered_counter,
         }
         return load_metadata
+
+
+##############
+# Class: Monarch KG loader — ROBOKOP curated subset
+#
+# Desc: Keeps only the predicates used in the ROBOKOP graph. Edges from knowledge sources already
+#       ingested separately (CTD, Reactome, GOA, etc.) are excluded to avoid duplication in the
+#       merged graph, as are edges whose subject or object has a curie prefix known not to normalize.
+##############
+class MonarchKGLoader(MonarchKGBaseLoader):
+
+    source_id: str = 'MonarchKG'
+
+    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
+        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
+
+        self.desired_predicates = {
+            'biolink:causes',
+            'biolink:contributes_to',
+            'biolink:has_phenotype',
+            'biolink:expressed_in'
+        }
+
+        self.knowledge_source_ignore_list = {
+            'infores:ctd',
+            'infores:reactome',
+            'infores:goa',
+            'infores:cafa',
+            'infores:bhf-ucl',
+            'infores:aruk-ucl',
+            'infores:parkinsonsuk-ucl',
+            'infores:alzheimers-university-of-toronto',
+            'infores:agbase',
+            'infores:dictybase',
+            'infores:ntnu-sb',
+            'infores:wb'
+        }
+
+        # Curie prefixes known not to normalize — edges where subject or object
+        # starts with any of these are discarded.
+        self.non_normalizable_curie_prefixes = {
+            'ZP', 'phenopacket.store', 'WB', 'CLINVAR', 'FYPO',
+            'PomBase', 'MMRRC', 'WBPhenotype', 'CAID', 'XPO', 'CUREID'
+        }
+
+    def filter_edge(self, subject_id: str, object_id: str, predicate: str,
+                    primary_knowledge_source: str, aggregator_knowledge_sources: list) -> bool:
+        if predicate not in self.desired_predicates:
+            return True
+        if primary_knowledge_source in self.knowledge_source_ignore_list or \
+                any(ks in self.knowledge_source_ignore_list for ks in aggregator_knowledge_sources):
+            return True
+        for curie in (subject_id, object_id):
+            prefix = curie.split(':')[0]
+            if prefix in self.non_normalizable_curie_prefixes:
+                return True
+        return False
+
+
+##############
+# Class: Monarch KG Full loader — complete Monarch KG
+#
+# Desc: Loads the entire Monarch KG without predicate or knowledge-source filtering.
+#       Intended for producing a standalone Monarch KG graph (e.g. for Automat),
+#       rather than the curated ROBOKOP subset.
+##############
+class MonarchKGFullLoader(MonarchKGBaseLoader):
+
+    source_id: str = 'MonarchKGFull'
+
+    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
+        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
+        # No filtering — filter_edge is inherited from the base class, where it never skips an edge
diff --git a/uv.lock b/uv.lock