From ea4b3089f00b19846e0a444afac316d01e1eaad1 Mon Sep 17 00:00:00 2001
From: Avery Wang <averyjwang@gmail.com>
Date: Fri, 27 Feb 2026 07:41:25 -0800
Subject: [PATCH] remove references to cdd; replace cdd with a JSON file with
 overrides

---
 ...add_clinical_attribute_metadata_headers.py | 134 ++++++------------
 1 file changed, 40 insertions(+), 94 deletions(-)

diff --git a/import-scripts/add_clinical_attribute_metadata_headers.py b/import-scripts/add_clinical_attribute_metadata_headers.py
index 2d57b444a..6e84dec65 100644
--- a/import-scripts/add_clinical_attribute_metadata_headers.py
+++ b/import-scripts/add_clinical_attribute_metadata_headers.py
@@ -1,16 +1,30 @@
 #!/usr/bin/env python
 # ------------------------------------------------------------------------------
-# Utility script which adds metadata headers to specified file(s)
-# Metadata is pulled from the clinical data dictionary web service (cdd)
-# Four lines added at the top (display name, dscriptions, datatype, priority)
-# Changes are only made if all input files are valid
+# Utility script which adds metadata headers to the specified clinical file(s).
+#
+# Metadata is loaded exclusively from a JSON file (-i / --independent-metadata-file)
+# that maps normalized column header names to their metadata objects. Each entry
+# in the JSON file must have the following keys:
+#   DISPLAY_NAME, DESCRIPTIONS, DATATYPE, ATTRIBUTE_TYPE, PRIORITY
+#
+# Default behavior for attributes not found in the JSON file:
+#   DISPLAY_NAME  : attribute name with underscores replaced by spaces, title-cased
+#   DESCRIPTIONS  : same as DISPLAY_NAME default
+#   DATATYPE      : STRING
+#   ATTRIBUTE_TYPE: SAMPLE
+#   PRIORITY      : 1
+#
+# The script adds four or five metadata header lines (display name, description, datatype, priority, and optionally attribute type).
+# The fifth line (attribute type) is added when the file is not split between patient/sample, based on the
+# clinical file name (i.e., not data_clinical_patient.txt or data_clinical_sample.txt).
+#
+# Changes are only made if all input files are valid (exist and are writable).
 # ------------------------------------------------------------------------------
 
 from clinicalfile_utils import write_data, write_header_line, get_header
 import argparse
 import json
 import os
-import requests
 import shutil
 import sys
 import tempfile
@@ -24,35 +38,24 @@
 COLUMN_HEADER_KEY = 'column_header'
 ATTRIBUTE_TYPE_KEY = 'attribute_type'
 PRIORITY_KEY = 'priority'
-OVERRIDDEN_STUDY_NAME_KEY = 'name'
-DEFAULT_URL = "https://cdd.cbioportal.mskcc.org/api/"
 
 PATIENT_CLINICAL_FILE_PATTERN = "data_clinical_patient.txt"
 SAMPLE_CLINICAL_FILE_PATTERN = "data_clinical_sample.txt"
 
-def check_valid_studyid(study_id, base_cdd_url):
-    query = base_cdd_url + "cancerStudies"
-    response = requests.get(query)
-    response_as_json = json.loads(response.text)
-    if study_id not in [overridden_study[OVERRIDDEN_STUDY_NAME_KEY] for overridden_study in response_as_json]:
-        print >> ERROR_FILE, 'Invalid study id: ' + study_id + ", there is no associated attributes in CDD"
-        sys.exit(2)
-
-def response_is_200(response):
-    if response.status_code != 200:
-        return False
-    return True
+def get_metadata_dictionary(all_attributes, metadata_file):
+    """Load metadata for the given attributes from the JSON metadata file.
 
-def get_independently_determined_attributes_metadata_dictionary(all_attributes, independent_metadata_file):
+    Returns a dict mapping normalized column header -> metadata object for
+    every requested attribute that appears in the metadata file. Attributes
+    absent from the metadata file are omitted and will receive default values at write time.
+    """
     metadata_dictionary = {}
-    if not independent_metadata_file:
-        return metadata_dictionary
-    f = open(independent_metadata_file, "r")
-    independently_determined_attributes = json.load(f)
+    f = open(metadata_file, "r")
+    all_metadata = json.load(f)
     f.close()
     for normalized_column_header in all_attributes:
-        if normalized_column_header in independently_determined_attributes:
-            metadata = independently_determined_attributes[normalized_column_header]
+        if normalized_column_header in all_metadata:
+            metadata = all_metadata[normalized_column_header]
             metadata_dictionary[normalized_column_header] = {
                     'DISPLAY_NAME' : metadata['DISPLAY_NAME'],
                     'DESCRIPTIONS' : metadata['DESCRIPTIONS'],
@@ -62,49 +65,6 @@ def get_independently_determined_attributes_metadata_dictionary(all_attributes,
                     }
     return metadata_dictionary
 
-def add_clinical_attribute_metadata_from_cdd(study_id, header, base_cdd_url, metadata_mapping):
-    response = requests.post(base_cdd_url + "?cancerStudy=" + study_id if study_id else base_cdd_url, json=list(header))
-    # ROB : we can probably delete the following block, unless we want to try to succed even when our independently determined metadata is missing a needed attribute
-    # ROB : or change this to a graceful exit reporting the missing headers
-    # ROB VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
-    if not response_is_200(response):
-        for attr in header:
-            x = [attr]
-            response = requests.post(base_cdd_url + "?cancerStudy=" + study_id if study_id else base_cdd_url, json=list(x))
-            if not response_is_200(response):
-                continue
-            response_as_json = json.loads(response.text)
-            for entry in response_as_json:
-                normalized_column_header =  entry[COLUMN_HEADER_KEY]
-                display_name = entry[DISPLAY_NAME_KEY]
-                description =  entry[DESCRIPTION_KEY]
-                datatype = entry[DATATYPE_KEY]
-                attribute_type = entry[ATTRIBUTE_TYPE_KEY]
-                priority = entry[PRIORITY_KEY]
-                metadata_mapping[normalized_column_header] = {
-                        'DISPLAY_NAME' : display_name,
-                        'DESCRIPTIONS' : description,
-                        'DATATYPE' : datatype,
-                        'ATTRIBUTE_TYPE' : attribute_type,
-                        'PRIORITY' : priority}
-        print metadata_mapping
-        return metadata_mapping
-    # ROB ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-    response_as_json = json.loads(response.text)
-    for entry in response_as_json:
-        normalized_column_header =  entry[COLUMN_HEADER_KEY]
-        display_name = entry[DISPLAY_NAME_KEY]
-        description =  entry[DESCRIPTION_KEY]
-        datatype = entry[DATATYPE_KEY]
-        attribute_type = entry[ATTRIBUTE_TYPE_KEY]
-        priority = entry[PRIORITY_KEY]
-        metadata_mapping[normalized_column_header] = {
-                'DISPLAY_NAME' : display_name,
-                'DESCRIPTIONS' : description,
-                'DATATYPE' : datatype,
-                'ATTRIBUTE_TYPE' : attribute_type,
-                'PRIORITY' : priority}
-
 def write_headers(header, metadata_dictionary, output_file, is_mixed_attribute_types_format):
     name_line = []
     description_line = []
@@ -119,7 +79,7 @@ def write_headers(header, metadata_dictionary, output_file, is_mixed_attribute_t
             attribute_type_line.append(metadata_dictionary[attribute]['ATTRIBUTE_TYPE'])
             priority_line.append(metadata_dictionary[attribute]['PRIORITY'])
         else:
-            # if attribute is not defined in cdd, use defaults
+            # attribute not found in metadata file -- apply defaults
             name_line.append(attribute.replace("_", " ").title())
             description_line.append(attribute.replace("_", " ").title())
             datatype_line.append('STRING')
@@ -143,19 +103,13 @@ def check_if_mixed_attribute_types_format(filename):
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("-f", "--files", nargs = "+", help = "file(s) to add metadata headers", required = True)
-    parser.add_argument("-s", "--study-id", help = "study id for specific overrides", required = False)
-    parser.add_argument("-c", "--cdd-url", help = "the url for the cdd web application, default is https://cdd.cbioportal.mskcc.org/api/", required = False)
-    parser.add_argument("-i", "--independent-metadata-file", help = "a file containing a json map from normalized_header to metadata object", required = False)
+    parser.add_argument("-i", "--independent-metadata-file", help = "a JSON file containing a map from normalized column header to metadata object", required = True)
     args = parser.parse_args()
     clinical_files = args.files
-    study_id = args.study_id
-    cdd_url = args.cdd_url
-    independent_metadata_file = args.independent_metadata_file
-    # change base url if specified (i.e for testing)
-    if cdd_url:
-        base_cdd_url = cdd_url
-    else:
-        base_cdd_url = DEFAULT_URL
+    metadata_file = args.independent_metadata_file
+    if not os.path.exists(metadata_file):
+        print >> ERROR_FILE, 'Metadata file not found: ' + metadata_file
+        sys.exit(2)
     # check file (args) validity and return error if any file fails check
     missing_clinical_files = [clinical_file for clinical_file in clinical_files if not os.path.exists(clinical_file)]
     if len(missing_clinical_files) > 0:
@@ -165,23 +119,15 @@ def main():
     if len(not_writable_clinical_files) > 0:
         print >> ERROR_FILE, 'File(s) not writable: ' + ', '.join(not_writable_clinical_files)
         sys.exit(2)
-    if (study_id):
-        check_valid_studyid(study_id, base_cdd_url)
     all_attributes = set()
     # get a set of attributes used across all input files
     for clinical_file in clinical_files:
         all_attributes = all_attributes.union(get_header(clinical_file))
-    # set metadata for independently determined attributes which are members of all_attributes
-    metadata_dictionary = get_independently_determined_attributes_metadata_dictionary(all_attributes, independent_metadata_file)
-    # get a set of "to be determined by ddp" attributes
-    ddp_dependent_attributes = set()
-    for attribute in all_attributes:
-        if not attribute in metadata_dictionary:
-            ddp_dependent_attributes.add(attribute)
-    add_clinical_attribute_metadata_from_cdd(study_id, ddp_dependent_attributes, base_cdd_url, metadata_dictionary)
-    # check metadata is defined for all attributes in CDD
-    if len(metadata_dictionary.keys()) != len(all_attributes):
-        print >> ERROR_FILE, 'Error, metadata not found for attribute(s): ' + ', '.join(all_attributes.difference(metadata_dictionary.keys()))
+    # load metadata for all attributes from the JSON file; missing attributes get defaults at write time
+    metadata_dictionary = get_metadata_dictionary(all_attributes, metadata_file)
+    missing_attributes = all_attributes.difference(metadata_dictionary.keys())
+    if missing_attributes:
+        print >> ERROR_FILE, 'Warning: metadata not found for attribute(s), defaults will be used: ' + ', '.join(missing_attributes)
     for clinical_file in clinical_files:
         # create temp file to write to
         temp_file, temp_file_name = tempfile.mkstemp()