From d655f477dfa52c183d7e67deb9d1d496bfc3efd9 Mon Sep 17 00:00:00 2001 From: Robert Sheridan Date: Tue, 31 Jan 2023 11:12:31 -0500 Subject: [PATCH 1/4] filter germline events and all synonymous events - uses the unique mutation event key to detect duplicates --- .../filter_non_somatic_events_py3.py | 153 +++++++++++++++--- 1 file changed, 132 insertions(+), 21 deletions(-) diff --git a/import-scripts/filter_non_somatic_events_py3.py b/import-scripts/filter_non_somatic_events_py3.py index 063739d0a..3a3d139a9 100755 --- a/import-scripts/filter_non_somatic_events_py3.py +++ b/import-scripts/filter_non_somatic_events_py3.py @@ -13,9 +13,49 @@ """ import argparse +import sys from enum import Enum from generate_az_study_changelog_py3 import DataHandler +UNIQUE_MUTATION_EVENT_KEY_FIELDS = { + "Hugo_Symbol", + "Chromosome", + "Start_Position", + "End_Position", + "Reference_Allele", + "Tumor_Seq_Allele2", + "Tumor_Sample_Barcode" +} + +UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS = { + "Sample_ID", + "Site1_Hugo_Symbol", + "Site2_Hugo_Symbol", + "Site1_Entrez_Gene_Id", + "Site2_Entrez_Gene_Id", + "Site1_Region_Number", + "Site2_Region_Number", + "Site1_Region", + "Site2_Region", + "Site1_Chromosome", + "Site2_Chromosome", + "Site1_Contig", + "Site2_Contig", + "Site1_Position", + "Site2_Position", + "Event_Info", + "Breakpoint_Type", + "Connection_Type" +} + +REQUIRED_MUTATION_EVENT_FIELDS = + UNIQUE_MUTATION_EVENT_KEY_FIELDS.union({"Mutation_Status"}) + +REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS = + UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS.union({"SV_Status"}) + +ALL_REFERENCED_EVENT_FIELDS = + REQUIRED_MUTATION_EVENT_FIELDS.union(REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS) class EventType(Enum): """An Enum class to represent mutation or structural variant event types.""" @@ -23,15 +63,27 @@ class EventType(Enum): MUTATION = 1 STRUCTURAL_VARIANT = 2 - class LineProcessor: - """Handles the processing of each line - filtering for somatic events only""" + """Functionality common 
to all line processors""" - def __init__(self, event_type, col_indices, output_file_handle): + def __init__(self, event_type, col_indices): self.event_type = event_type self.col_indices = col_indices - self.output_file_handle = output_file_handle - self.header_was_written = False + self.raise_exception_if_missing_required_fields() + + def raise_exception_if_missing_required_fields(self): + required_field_set = set() + if self.event_type == EventType.MUTATION: + required_field_set = REQUIRED_MUTATION_EVENT_FIELDS + if self.event_type == EventType.STRUCTURAL_VARIANT: + required_field_set = REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS + missing_field_list = {} + for field_name in required_field_set: + if not field_name in self.col_indices: + missing_field_list.add(field_name) + if len(missing_field_list) > 0: + missing_fields = ",".join(missing_field_list) + raise IndexError(f'Unable to find required column(s) {missing_fields} in event file') def line_is_commented(self, line): """Determines if the given line in the file is a comment. 
@@ -44,6 +96,70 @@ def line_is_commented(self, line): """ return line[0] == '#' + def convert_line_to_fields(self, line): + return line.rstrip("\n").split("\t") + + def convert_line_to_field(self, field_index, line): + fields = self.convert_line_to_fields(line) + return fields[field_index] + + def compute_key_for_line(self, line): + unique_key_field_set = set() + if self.event_type == EventType.MUTATION: + unique_key_field_set = UNIQUE_MUTATION_EVENT_KEY_FIELDS + if self.event_type == EventType.STRUCTURAL_VARIANT: + unique_key_field_set = UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS + # key will be string representation of the object + fields = self.convert_line_to_fields(line) + key_value_terms = [] + for key in sorted(unique_key_field_set): + key_value_terms.append(key + "\t" + fields[self.col_indices[key]]) + computed_keys = "\t".join(computed_keys) + + +class LineGermlineEventScanner(LineProcessor): + """Registers the unique event keys for each event with germline status""" + + def __init__(self, event_type, col_indices, germline_event_key_set) + super().__init__(event_type, col_indices) + self.output_file_handle = output_file_handle + self.header_was_seen = False + self.germline_event_key_set = germline_event_key_set + + def scan(self, line): + """Scan data lines for all events which have status 'GERMLINE" and register them + + Args: + line (string): A line from the input file + + Raises: + IndexError: If any required column is not found in the input file + """ + if self.line_is_commented(line): + return + if not self.header_was_seen: + self.header_was_seen = True + return + + status_col_index = None + if event_type == EventType.MUTATION: + status_col_index = self.col_indices['Mutation_Status'] + elif event_type == EventType.STRUCTURAL_VARIANT: + status_col_index = self.col_indices['SV_Status'] + value = self.convert_line_to_field(status_col_index, line) + if value.casefold() == 'GERMLINE'.casefold(): + 
self.germline_event_key_set.add(self.compute_key_for_line(line)) + + +class LineFileWriter(LineProcessor): + """Handles the processing of each line - filtering for somatic events only""" + + def __init__(self, event_type, col_indices, germline_event_key_set, output_file_handle): + super().__init__(event_type, col_indices) + self.output_file_handle = output_file_handle + self.header_was_written = False + self.germline_event_key_set = germline_event_key_set + def process(self, line): """Process each line of the given file to remove all events that are not 'SOMATIC' or 'UNKNOWN'. @@ -63,22 +179,10 @@ def process(self, line): self.header_was_written = True return - col_index = None - if event_type == EventType.MUTATION: - if 'Mutation_Status' not in self.col_indices: - raise IndexError('Unable to find Mutation_status column in event file') - col_index = self.col_indices['Mutation_Status'] - elif event_type == EventType.STRUCTURAL_VARIANT: - if 'SV_Status' not in self.col_indices: - raise IndexError('Unable to find SV_Status column in event file') - col_index = self.col_indices['SV_Status'] - - cols = line.split('\t') - value = cols[col_index].rstrip('\n') - if value.casefold() == 'SOMATIC'.casefold() or value.casefold() == 'UNKNOWN'.casefold(): + line_key = self.compute_key_for_line(line) + if not line_key in self.germline_event_key_set: self.output_file_handle.write(line) - class FilteredFileWriter: """Handles writing the filtered file containing only somatic events""" @@ -87,13 +191,20 @@ def __init__(self, input_file_path, output_file_path, event_type): self.output_file_path = output_file_path self.event_type = event_type self.data_handler = DataHandler(input_file_path) - self.col_indices = self.data_handler.get_col_indices({"Mutation_Status", "SV_Status"}) + self.col_indices = self.data_handler.get_col_indices(ALL_REFERENCED_EVENT_FIELDS) + self.germline_event_keys = set() def write(self): """Processes the input file and writes out a filtered version including only 
somatic events""" + # scan for all germline events and record keys + with open(input_file_path, "r") as input_file_handle: + line_germline_event_scanner = LineGermlineEventScanner(self.event_type, self.col_indices, self.germline_event_keys) + for line in input_file_handle: + line_germline_event_scanner.scan(line) + # read/write events, filtering those which match a germline event key with open(input_file_path, "r") as input_file_handle: with open(output_file_path, "w") as output_file_handle: - line_processor = LineProcessor(self.event_type, self.col_indices, output_file_handle) + line_processor = LineFileWriter(self.event_type, self.col_indices, self.germline_event_keys, output_file_handle) for line in input_file_handle: line_processor.process(line) From d1c76f394a658b61d9837a24e9ca3ae5a43509ef Mon Sep 17 00:00:00 2001 From: Calla Chennault <51501715+callachennault@users.noreply.github.com> Date: Tue, 31 Jan 2023 11:46:59 -0500 Subject: [PATCH 2/4] Initial fixes --- .../filter_non_somatic_events_py3.py | 122 +++++++++++------- 1 file changed, 77 insertions(+), 45 deletions(-) diff --git a/import-scripts/filter_non_somatic_events_py3.py b/import-scripts/filter_non_somatic_events_py3.py index 3a3d139a9..7bdb74667 100755 --- a/import-scripts/filter_non_somatic_events_py3.py +++ b/import-scripts/filter_non_somatic_events_py3.py @@ -13,46 +13,45 @@ """ import argparse -import sys from enum import Enum from generate_az_study_changelog_py3 import DataHandler UNIQUE_MUTATION_EVENT_KEY_FIELDS = { - "Hugo_Symbol", - "Chromosome", - "Start_Position", - "End_Position", - "Reference_Allele", - "Tumor_Seq_Allele2", - "Tumor_Sample_Barcode" + 'Hugo_Symbol', + 'Chromosome', + 'Start_Position', + 'End_Position', + 'Reference_Allele', + 'Tumor_Seq_Allele2', + 'Tumor_Sample_Barcode' } UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS = { - "Sample_ID", - "Site1_Hugo_Symbol", - "Site2_Hugo_Symbol", - "Site1_Entrez_Gene_Id", - "Site2_Entrez_Gene_Id", - "Site1_Region_Number", - 
"Site2_Region_Number", - "Site1_Region", - "Site2_Region", - "Site1_Chromosome", - "Site2_Chromosome", - "Site1_Contig", - "Site2_Contig", - "Site1_Position", - "Site2_Position", - "Event_Info", - "Breakpoint_Type", - "Connection_Type" + 'Sample_ID', + 'Site1_Hugo_Symbol', + 'Site2_Hugo_Symbol', + 'Site1_Entrez_Gene_Id', + 'Site2_Entrez_Gene_Id', + 'Site1_Region_Number', + 'Site2_Region_Number', + 'Site1_Region', + 'Site2_Region', + 'Site1_Chromosome', + 'Site2_Chromosome', + 'Site1_Contig', + 'Site2_Contig', + 'Site1_Position', + 'Site2_Position', + 'Event_Info', + 'Breakpoint_Type', + 'Connection_Type' } REQUIRED_MUTATION_EVENT_FIELDS = - UNIQUE_MUTATION_EVENT_KEY_FIELDS.union({"Mutation_Status"}) + UNIQUE_MUTATION_EVENT_KEY_FIELDS.union({'Mutation_Status'}) REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS = - UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS.union({"SV_Status"}) + UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS.union({'SV_Status'}) ALL_REFERENCED_EVENT_FIELDS = REQUIRED_MUTATION_EVENT_FIELDS.union(REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS) @@ -72,17 +71,23 @@ def __init__(self, event_type, col_indices): self.raise_exception_if_missing_required_fields() def raise_exception_if_missing_required_fields(self): + """Checks that all required fields were found in the file for the given event type. 
+ + Raises: + IndexError: If any of the required fields are not found + """ required_field_set = set() if self.event_type == EventType.MUTATION: required_field_set = REQUIRED_MUTATION_EVENT_FIELDS if self.event_type == EventType.STRUCTURAL_VARIANT: required_field_set = REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS + missing_field_list = {} for field_name in required_field_set: if not field_name in self.col_indices: missing_field_list.add(field_name) if len(missing_field_list) > 0: - missing_fields = ",".join(missing_field_list) + missing_fields = ','.join(missing_field_list) raise IndexError(f'Unable to find required column(s) {missing_fields} in event file') def line_is_commented(self, line): @@ -97,32 +102,56 @@ def line_is_commented(self, line): return line[0] == '#' def convert_line_to_fields(self, line): - return line.rstrip("\n").split("\t") + """Converts a tab-delimited data line to a list of values. + + Args: + line (string): A line from the input file + + Returns: + list: List of values from the line + """ + return line.rstrip('\n').split('\t') def convert_line_to_field(self, field_index, line): - fields = self.convert_line_to_fields(line) - return fields[field_index] + """Returns a value of interest in a tab-delimited data line. + + Args: + field_index (int): Index of the field of interest + line (string): A line from the input file + + Returns: + string: The value found at field_index + """ + return self.convert_line_to_fields(line)[field_index] def compute_key_for_line(self, line): + """Computes a unique key for the given line of data, using the unique + key fields defined for the event type. 
+ + Args: + line (string): A line from the input file + """ unique_key_field_set = set() if self.event_type == EventType.MUTATION: unique_key_field_set = UNIQUE_MUTATION_EVENT_KEY_FIELDS if self.event_type == EventType.STRUCTURAL_VARIANT: unique_key_field_set = UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS - # key will be string representation of the object + + # Key will be string representation of the object fields = self.convert_line_to_fields(line) key_value_terms = [] for key in sorted(unique_key_field_set): - key_value_terms.append(key + "\t" + fields[self.col_indices[key]]) - computed_keys = "\t".join(computed_keys) + key_value_terms.append(key + '\t' + fields[self.col_indices[key]]) + computed_keys = '\t'.join(computed_keys) + + return computed_keys class LineGermlineEventScanner(LineProcessor): """Registers the unique event keys for each event with germline status""" - def __init__(self, event_type, col_indices, germline_event_key_set) + def __init__(self, event_type, col_indices, germline_event_key_set): super().__init__(event_type, col_indices) - self.output_file_handle = output_file_handle self.header_was_seen = False self.germline_event_key_set = germline_event_key_set @@ -146,6 +175,7 @@ def scan(self, line): status_col_index = self.col_indices['Mutation_Status'] elif event_type == EventType.STRUCTURAL_VARIANT: status_col_index = self.col_indices['SV_Status'] + value = self.convert_line_to_field(status_col_index, line) if value.casefold() == 'GERMLINE'.casefold(): self.germline_event_key_set.add(self.compute_key_for_line(line)) @@ -183,6 +213,7 @@ def process(self, line): if not line_key in self.germline_event_key_set: self.output_file_handle.write(line) + class FilteredFileWriter: """Handles writing the filtered file containing only somatic events""" @@ -196,14 +227,15 @@ def __init__(self, input_file_path, output_file_path, event_type): def write(self): """Processes the input file and writes out a filtered version including only somatic events""" - # 
scan for all germline events and record keys - with open(input_file_path, "r") as input_file_handle: + # Scan for all germline events and record keys + with open(input_file_path, 'r') as input_file_handle: line_germline_event_scanner = LineGermlineEventScanner(self.event_type, self.col_indices, self.germline_event_keys) for line in input_file_handle: line_germline_event_scanner.scan(line) - # read/write events, filtering those which match a germline event key - with open(input_file_path, "r") as input_file_handle: - with open(output_file_path, "w") as output_file_handle: + + # Read/write events, filtering those which match a germline event key + with open(input_file_path, 'r') as input_file_handle: + with open(output_file_path, 'w') as output_file_handle: line_processor = LineFileWriter(self.event_type, self.col_indices, self.germline_event_keys, output_file_handle) for line in input_file_handle: line_processor.process(line) @@ -231,12 +263,12 @@ def write(self): event_type = None if not args.event_type: raise ValueError('Event type argument is missing') - if args.event_type.casefold() == "mutation".casefold(): + if args.event_type.casefold() == 'mutation'.casefold(): event_type = EventType.MUTATION - elif args.event_type.casefold() == "structural_variant".casefold(): + elif args.event_type.casefold() == 'structural_variant'.casefold(): event_type = EventType.STRUCTURAL_VARIANT if event_type is None: - raise ValueError(f'event type argument {args.event_type} not recognized or missing') + raise ValueError(f'Event type argument {args.event_type} not recognized or missing') # Filter the file filtered_file_writer = FilteredFileWriter(input_file_path, output_file_path, event_type) From 32ee13f4d01a19aeb0987cd140bbe0281dd9d56f Mon Sep 17 00:00:00 2001 From: Calla Chennault <51501715+callachennault@users.noreply.github.com> Date: Tue, 31 Jan 2023 11:49:53 -0500 Subject: [PATCH 3/4] Format code --- .../filter_non_somatic_events_py3.py | 33 ++++++++++--------- 1 file 
changed, 18 insertions(+), 15 deletions(-) diff --git a/import-scripts/filter_non_somatic_events_py3.py b/import-scripts/filter_non_somatic_events_py3.py index 7bdb74667..443cdfdb1 100755 --- a/import-scripts/filter_non_somatic_events_py3.py +++ b/import-scripts/filter_non_somatic_events_py3.py @@ -23,7 +23,7 @@ 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele2', - 'Tumor_Sample_Barcode' + 'Tumor_Sample_Barcode', } UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS = { @@ -44,17 +44,15 @@ 'Site2_Position', 'Event_Info', 'Breakpoint_Type', - 'Connection_Type' + 'Connection_Type', } -REQUIRED_MUTATION_EVENT_FIELDS = - UNIQUE_MUTATION_EVENT_KEY_FIELDS.union({'Mutation_Status'}) +REQUIRED_MUTATION_EVENT_FIELDS = UNIQUE_MUTATION_EVENT_KEY_FIELDS.union({'Mutation_Status'}) -REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS = - UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS.union({'SV_Status'}) +REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS = UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS.union({'SV_Status'}) + +ALL_REFERENCED_EVENT_FIELDS = REQUIRED_MUTATION_EVENT_FIELDS.union(REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS) -ALL_REFERENCED_EVENT_FIELDS = - REQUIRED_MUTATION_EVENT_FIELDS.union(REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS) class EventType(Enum): """An Enum class to represent mutation or structural variant event types.""" @@ -62,6 +60,7 @@ class EventType(Enum): MUTATION = 1 STRUCTURAL_VARIANT = 2 + class LineProcessor: """Functionality common to all line processors""" @@ -81,7 +80,7 @@ def raise_exception_if_missing_required_fields(self): required_field_set = REQUIRED_MUTATION_EVENT_FIELDS if self.event_type == EventType.STRUCTURAL_VARIANT: required_field_set = REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS - + missing_field_list = {} for field_name in required_field_set: if not field_name in self.col_indices: @@ -125,7 +124,7 @@ def convert_line_to_field(self, field_index, line): return self.convert_line_to_fields(line)[field_index] def compute_key_for_line(self, line): - """Computes a unique 
key for the given line of data, using the unique + """Computes a unique key for the given line of data, using the unique key fields defined for the event type. Args: @@ -136,7 +135,7 @@ def compute_key_for_line(self, line): unique_key_field_set = UNIQUE_MUTATION_EVENT_KEY_FIELDS if self.event_type == EventType.STRUCTURAL_VARIANT: unique_key_field_set = UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS - + # Key will be string representation of the object fields = self.convert_line_to_fields(line) key_value_terms = [] @@ -175,7 +174,7 @@ def scan(self, line): status_col_index = self.col_indices['Mutation_Status'] elif event_type == EventType.STRUCTURAL_VARIANT: status_col_index = self.col_indices['SV_Status'] - + value = self.convert_line_to_field(status_col_index, line) if value.casefold() == 'GERMLINE'.casefold(): self.germline_event_key_set.add(self.compute_key_for_line(line)) @@ -229,14 +228,18 @@ def write(self): """Processes the input file and writes out a filtered version including only somatic events""" # Scan for all germline events and record keys with open(input_file_path, 'r') as input_file_handle: - line_germline_event_scanner = LineGermlineEventScanner(self.event_type, self.col_indices, self.germline_event_keys) + line_germline_event_scanner = LineGermlineEventScanner( + self.event_type, self.col_indices, self.germline_event_keys + ) for line in input_file_handle: line_germline_event_scanner.scan(line) - + # Read/write events, filtering those which match a germline event key with open(input_file_path, 'r') as input_file_handle: with open(output_file_path, 'w') as output_file_handle: - line_processor = LineFileWriter(self.event_type, self.col_indices, self.germline_event_keys, output_file_handle) + line_processor = LineFileWriter( + self.event_type, self.col_indices, self.germline_event_keys, output_file_handle + ) for line in input_file_handle: line_processor.process(line) From a440d309633760f8f8527577fab10ba6d1da2a9a Mon Sep 17 00:00:00 2001 From: Calla 
Chennault <51501715+callachennault@users.noreply.github.com> Date: Tue, 31 Jan 2023 12:11:20 -0500 Subject: [PATCH 4/4] Bug fixes --- import-scripts/filter_non_somatic_events_py3.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/import-scripts/filter_non_somatic_events_py3.py b/import-scripts/filter_non_somatic_events_py3.py index 443cdfdb1..a07bdbccf 100755 --- a/import-scripts/filter_non_somatic_events_py3.py +++ b/import-scripts/filter_non_somatic_events_py3.py @@ -81,12 +81,12 @@ def raise_exception_if_missing_required_fields(self): if self.event_type == EventType.STRUCTURAL_VARIANT: required_field_set = REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS - missing_field_list = {} + missing_field_set = set() for field_name in required_field_set: if not field_name in self.col_indices: - missing_field_list.add(field_name) - if len(missing_field_list) > 0: - missing_fields = ','.join(missing_field_list) + missing_field_set.add(field_name) + if len(missing_field_set) > 0: + missing_fields = ','.join(missing_field_set) raise IndexError(f'Unable to find required column(s) {missing_fields} in event file') def line_is_commented(self, line): @@ -141,9 +141,7 @@ def compute_key_for_line(self, line): key_value_terms = [] for key in sorted(unique_key_field_set): key_value_terms.append(key + '\t' + fields[self.col_indices[key]]) - computed_keys = '\t'.join(computed_keys) - - return computed_keys + return '\t'.join(key_value_terms) class LineGermlineEventScanner(LineProcessor): @@ -264,14 +262,12 @@ def write(self): # Ensure that a recognizable event type code is input. 
event_type = None - if not args.event_type: - raise ValueError('Event type argument is missing') if args.event_type.casefold() == 'mutation'.casefold(): event_type = EventType.MUTATION elif args.event_type.casefold() == 'structural_variant'.casefold(): event_type = EventType.STRUCTURAL_VARIANT if event_type is None: - raise ValueError(f'Event type argument {args.event_type} not recognized or missing') + raise ValueError(f'Event type argument {args.event_type} not recognized') # Filter the file filtered_file_writer = FilteredFileWriter(input_file_path, output_file_path, event_type)