diff --git a/alphaquant/cluster/cluster_missingval.py b/alphaquant/cluster/cluster_missingval.py
index f1d2763a..6dfa12b4 100644
--- a/alphaquant/cluster/cluster_missingval.py
+++ b/alphaquant/cluster/cluster_missingval.py
@@ -8,6 +8,49 @@ PVALUE_THRESHOLD_FOR_INTENSITY_BASED_COUNTING = 0.1
 
 
 
+# Level at which missing value testing was performed for the most recently
+# inspected tree ("mod_seq_charge" or "base"). The level is re-derived for every
+# tree, so this is a record of the last decision, not a cache.
+MISSINGVAL_TEST_LEVEL = None
+
+
+def _determine_missingval_test_level(root_node):
+    """Determine the appropriate level for missing value statistical testing.
+
+    Scenarios:
+    1) "mod_seq_charge" exists in tree -> test at mod_seq_charge level
+    2) "mod_seq" is one level above leaves -> test at base ion level
+    3) "seq" is one level above leaves -> test at base ion level
+    4) "gene" is one level above leaves -> test at base ion level
+
+    Args:
+        root_node: Root of the ion tree. The ``type`` attribute of its nodes is inspected.
+
+    Returns:
+        "mod_seq_charge" for scenario 1, "base" for scenarios 2-4.
+
+    Raises:
+        ValueError: If the tree matches none of the scenarios.
+    """
+    # Scenario 1: fragment-level data — everything below mod_seq_charge is collapsed to
+    # mod_seq_charge as the lowest identification level. any() stops at the first match,
+    # so the tree is not traversed further than needed.
+    if any(node.type == "mod_seq_charge" for node in anytree.PreOrderIter(root_node)):
+        return "mod_seq_charge"
+
+    # For all other cases, check what's one level above leaves
+    leaf_parent = root_node.leaves[0].parent
+    if leaf_parent is None:
+        # A root without children is its own (only) leaf and has no parent
+        raise ValueError("Unexpected tree structure: root node has no children.")
+
+    if leaf_parent.type in ("mod_seq", "seq", "gene"):
+        # Scenarios 2-4: no fragment level, the leaves themselves are tested
+        return "base"
+
+    raise ValueError(f"Unexpected tree structure: leaf parent type is '{leaf_parent.type}'. "
+                     f"Expected one of: 'mod_seq', 'seq', 'gene', or tree with 'mod_seq_charge' nodes.")
+
 def create_protnode_from_missingval_ions(gene_name,diffions, normed_c1, normed_c2):
     return MissingValProtNodeCreator(gene_name, diffions, normed_c1, normed_c2).prot_node
 
@@ -76,11 +119,25 @@ def _assign_properties_to_missingval_base_ions(self, root_node):
 
 
     @staticmethod
-    def _get_nodes_to_test(root_node): #get the nodes in the lowest level that is relevant for the binomial test
-        if root_node.leaves[0].parent.type == "mod_seq": #when AlphaQuant works with precursors only (not fragments), the precursors themselves are the "base ions" and the "mod_seq_charge" node does not exist
-            return root_node.children
-        else:
+    def _get_nodes_to_test(root_node):
+        """Get the nodes at which to perform the missing value statistical test.
+
+        The test level is derived from the structure of ``root_node`` on every call
+        and recorded in MISSINGVAL_TEST_LEVEL. It is deliberately not cached: a level
+        remembered from an earlier dataset would silently select the wrong nodes
+        (or none) when a later tree has a different hierarchy.
+
+        Returns:
+            The "mod_seq_charge" nodes if the tree contains any, otherwise the leaves.
+        """
+        global MISSINGVAL_TEST_LEVEL
+        MISSINGVAL_TEST_LEVEL = _determine_missingval_test_level(root_node)
+
+        if MISSINGVAL_TEST_LEVEL == "mod_seq_charge":
             return anytree.search.findall(root_node, filter_=lambda node: node.type == "mod_seq_charge")
+
+        # "base": in short trees (no fragments), leaves are the precursors themselves — the right level to test
+        return root_node.leaves
 
 
     def _propagate_properties_to_nodes_to_test(self,nodes_to_test): #goes through each node to test and merges the properties from it's base to the node itself
diff --git a/alphaquant/config/quant_reader_config.yaml b/alphaquant/config/quant_reader_config.yaml
index 0b253cda..050149b6 100644
--- a/alphaquant/config/quant_reader_config.yaml
+++ b/alphaquant/config/quant_reader_config.yaml
@@ -1342,8 +1342,25 @@ diaumpire_precursor_ms1:
   ion_cols:
     - Peptide Key
 
+fragpipe_precursor:
+  format: widetable
+  quant_pre_or_suffix: " Intensity"
+  protein_cols:
+    - Protein
+  ion_hierarchy:
+    sequence_int:
+      order: [SEQ, MOD, CHARGE]
+ mapping: + SEQ: + - Peptide Sequence + MOD: + - Modified Sequence + CHARGE: + - Charge + use_iontree: False + ml_level: SEQ -fragpipe_precursors: +fragpipe_modseq: format: widetable quant_pre_or_suffix: " Intensity" protein_cols: @@ -1358,3 +1375,6 @@ fragpipe_precursors: - Modified Sequence use_iontree: False ml_level: SEQ + + + diff --git a/alphaquant/config/quant_reader_config_lightweight.yaml b/alphaquant/config/quant_reader_config_lightweight.yaml deleted file mode 100644 index 74704faa..00000000 --- a/alphaquant/config/quant_reader_config_lightweight.yaml +++ /dev/null @@ -1,481 +0,0 @@ ---- -#this file determines the parameters used to convert long format tables as e.g. produced by Spectronaut or DIA-NN into a wide table format -alphadia_precursor_protein: - format: longtable - sample_ID: run - quant_ID: - precursor: mean_overlapping_intensity - protein_cols: - - pg_master - ion_hierarchy: - precursor: - order: [SEQ, MOD, CHARGE] - mapping: - SEQ: - - sequence - MOD: - - mods - CHARGE: - - charge - use_iontree: True - ml_level: CHARGE - filters: - protein_qval: - param: pg_qval - comparator: "<=" - value: 0.01 - - -alphapept_peptides: - format: longtable - sample_ID: shortname - quant_ID: - precursor_intensity: int_sum - protein_cols: - - protein_group - ion_hierarchy: - precursor_intensity: - order: [SEQ, CHARGE] - mapping: - SEQ: - - naked_sequence - CHARGE: - - charge - - use_iontree: False - -maxquant_peptides: - format: widetable - quant_pre_or_suffix: "Intensity " - protein_cols: - - Gene names - ion_cols: - - Sequence - ion_hierarchy: - sequence_int: - order: [SEQ, MOD] - mapping: - SEQ: - - Sequence - MOD: - - Mass - filters: - reverse: - param: Reverse - comparator: "!=" - value: "+" - contaminant: - param: Potential contaminant - comparator: "!=" - value: "+" - amino_acid: - param: Amino acid before - comparator: "!=" - value: "XYZ" - ml_level: SEQ - use_iontree: False - - -maxquant_evidence: - format: longtable - sample_ID: Experiment #Raw file 
- quant_ID: Intensity - protein_cols: - - Gene names - ion_cols: - - Modified sequence - - Charge - - -diann_fragion_ms1_corrected: - format: longtable - sample_ID: Run - quant_ID: - fragion: Fragment.Quant.Corrected - #Fragment.Quant.Raw - ms1iso: Ms1.Area - protein_cols: - - Genes - split_cols: - Fragment.Quant.Corrected: ";" - ion_hierarchy: - fragion: - order: [SEQ, MOD, CHARGE, FRGION] - mapping: - SEQ: - - Stripped.Sequence - MOD: - - Modified.Sequence - CHARGE: - - Precursor.Charge - FRGION: - - Fragment.Quant.Corrected - ms1iso: - order: [SEQ, MOD, CHARGE, MS1ISOTOPES] - mapping: - SEQ: - - Stripped.Sequence - MOD: - - Modified.Sequence - CHARGE: - - Precursor.Charge - MS1ISOTOPES: - - Precursor.Charge - use_iontree: True - ml_level: CHARGE - filters: - protein_qval: - param: Lib.PG.Q.Value - comparator: "<=" - value: 0.01 - - -diann_precursor_fragion_ms1: - format: longtable - sample_ID: Run - quant_ID: - fragion: Fragment.Quant.Raw - ms1iso: Ms1.Area - precursor: Precursor.Normalised - protein_cols: - - Genes - split_cols: - Fragment.Quant.Raw: ";" - ion_hierarchy: - fragion: - order: [SEQ, MOD, CHARGE, FRGION] - mapping: - SEQ: - - Stripped.Sequence - MOD: - - Modified.Sequence - CHARGE: - - Precursor.Charge - FRGION: - - Fragment.Quant.Raw - ms1iso: - order: [SEQ, MOD, CHARGE, MS1ISOTOPES] - mapping: - SEQ: - - Stripped.Sequence - MOD: - - Modified.Sequence - CHARGE: - - Precursor.Charge - MS1ISOTOPES: - - Precursor.Charge - precursor: - order: [SEQ, MOD, CHARGE, PRECURSOR] - mapping: - SEQ: - - Stripped.Sequence - MOD: - - Modified.Sequence - CHARGE: - - Precursor.Charge - PRECURSOR: - - Precursor.Charge - use_iontree: True - ml_level: CHARGE - filters: - protein_qval: - param: Lib.PG.Q.Value - comparator: "<=" - value: 0.01 - - -diann_precursor: - format: longtable - sample_ID: Run - quant_ID: - precursor: Precursor.Normalised - protein_cols: - - Genes - ion_hierarchy: - precursor: - order: [SEQ, MOD, CHARGE] - mapping: - SEQ: - - Stripped.Sequence - 
MOD: - - Modified.Sequence - CHARGE: - - Precursor.Charge - use_iontree: True - ml_level: CHARGE - filters: - protein_qval: - param: Lib.PG.Q.Value - comparator: "<=" - value: 0.01 - - - -spectronaut_ptm_fragion: - format: longtable - sample_ID: R.Label - quant_ID: - fragion: F.PeakArea - protein_cols: - - ptm_id - ion_cols: - - FG.Id - - F.FrgIon - - F.FrgLossType - - F.Charge - ion_hierarchy: - fragion: - order: [SEQ, MOD, CHARGE, FRGION] - mapping: - SEQ: - - PEP.StrippedSequence - MOD: - - ptm_mapped_modseq - CHARGE: - - FG.Charge - FRGION: - - F.FrgIon - - F.FrgLossType - - F.Charge - filters: - fragion_intensity: - param: F.PeakArea - comparator: ">" - value: 5.0 - use_iontree: True - ml_level: CHARGE - annotation_columns: - - PEP.StrippedSequence - - -spectronaut_fragion_ms1_gene: - format: longtable - sample_ID: R.Label - quant_ID: - fragion: F.PeakArea - ms1iso: FG.MS1IsotopeIntensities (Measured) - protein_cols: - - PG.Genes - ion_cols: - - FG.Id - - F.FrgIon - - F.FrgLossType - - F.Charge - split_cols: - FG.MS1IsotopeIntensities (Measured): ";" - ion_hierarchy: - fragion: - order: [SEQ, MOD, CHARGE, FRGION] - mapping: - SEQ: - - PEP.StrippedSequence - MOD: - - EG.ModifiedSequence - CHARGE: - - FG.Charge - FRGION: - - F.FrgIon - - F.FrgLossType - - F.Charge - ms1iso: - order: [SEQ, MOD, CHARGE, MS1ISOTOPES] - mapping: - SEQ: - - PEP.StrippedSequence - MOD: - - EG.ModifiedSequence - CHARGE: - - FG.Charge - MS1ISOTOPES: - - FG.MS1IsotopeIntensities (Measured) - filters: - fragion_intensity: - param: F.PeakArea - comparator: ">" - value: 5.0 - gene_unique: - param: PEP.IsGeneSpecific - comparator: "==" - value: "True" - use_iontree: True - ml_level: CHARGE - - -spectronaut_fragion_ms1_gene: - format: longtable - sample_ID: R.Label - quant_ID: - fragion: F.PeakArea - ms1iso: FG.MS1IsotopeIntensities (Measured) - protein_cols: - - PG.Genes - ion_cols: - - FG.Id - - F.FrgIon - - F.FrgLossType - - F.Charge - split_cols: - FG.MS1IsotopeIntensities (Measured): ";" 
- ion_hierarchy: - fragion: - order: [SEQ, MOD, CHARGE, FRGION] - mapping: - SEQ: - - PEP.StrippedSequence - MOD: - - EG.ModifiedSequence - CHARGE: - - FG.Charge - FRGION: - - F.FrgIon - - F.FrgLossType - - F.Charge - ms1iso: - order: [SEQ, MOD, CHARGE, MS1ISOTOPES] - mapping: - SEQ: - - PEP.StrippedSequence - MOD: - - EG.ModifiedSequence - CHARGE: - - FG.Charge - MS1ISOTOPES: - - FG.MS1IsotopeIntensities (Measured) - filters: - fragion_intensity: - param: F.PeakArea - comparator: ">" - value: 5.0 - use_iontree: True - ml_level: CHARGE - - -spectronaut_precursor_gene: - format: longtable - sample_ID: R.Label - quant_ID: - precursor: FG.Quantity - protein_cols: - - PG.Genes - ion_hierarchy: - precursor: - order: [SEQ, MOD, CHARGE] - mapping: - SEQ: - - PEP.StrippedSequence - MOD: - - EG.ModifiedSequence - CHARGE: - - FG.Charge - filters: - gene_unique: - param: PEP.IsGeneSpecific - comparator: "==" - value: "True" - use_iontree: True - ml_level: CHARGE - - -spectronaut_precursor_gene: - format: longtable - sample_ID: R.Label - quant_ID: - precursor: FG.Quantity - protein_cols: - - PG.Genes - ion_hierarchy: - precursor: - order: [SEQ, MOD, CHARGE] - mapping: - SEQ: - - PEP.StrippedSequence - MOD: - - EG.ModifiedSequence - CHARGE: - - FG.Charge - use_iontree: True - ml_level: CHARGE - - -openswath_precursor_aligned: - format: longtable - sample_ID: run_id - quant_ID: Intensity - protein_cols: - - ProteinName - ion_cols: - - peptide_group_label - filters: - decoy: - param: decoy - comparator: "==" - value: 0 - -openswath_pyprophet: - format: longtable - sample_ID: filename - quant_ID: Intensity - protein_cols: - - ProteinName - ion_cols: - - FullPeptideName - - Charge - filters: - decoy: - param: decoy - comparator: "==" - value: 0 - -skyline_precursor: - format: longtable - sample_ID: ReplicateName - quant_ID: TotalAreaFragment - protein_cols: - - ProteinName - ion_cols: - - PeptideModifiedSequence - - PrecursorCharge - -diaumpire_precursor_ms1: - format: widetable - 
protein_cols: - - Proteins - ion_cols: - - Peptide Key - - -diann_wideformat: - format: widetable - protein_cols: - - Protein.Group - ion_cols: - - Stripped.Sequence - - Modified.Sequence - - Precursor.Charge - ion_hierarchy: - sequence_int: - order: [SEQ, MOD] - mapping: - SEQ: - - Stripped.Sequence - MOD: - - Modified.Sequence - CH: - - Precursor.Charge - ml_level: SEQ - use_iontree: False - -fragpipe_precursors: - format: widetable - quant_pre_or_suffix: " Intensity" - protein_cols: - - Protein - ion_hierarchy: - sequence_int: - order: [SEQ, MOD] - mapping: - SEQ: - - Peptide Sequence - MOD: - - Modified Sequence - use_iontree: False