Merge pull request #132 from evancofer/master

kathyxchen · web-flow · commit dbef3f52959b · 2019-12-17T15:46:13.000-05:00
Allow specifying a substring to mutate with ISM
diff --git a/docs/source/overview/cli.md b/docs/source/overview/cli.md
@@ -303,13 +303,15 @@ You may find that there are more output files than you expect in `output_dir` at
 - **Warnings:** Selene may detect that the `ref` base(s) in a variant do not match with the bases specified in the reference sequence FASTA at the `(chrom, pos)`. In this case, Selene will use the `ref` base(s) specified in the VCF file in place of those in the reference genome and output predictions accordingly. These predictions will be distinguished by the row label column `ref_match` value `False`. You may review these variants and determine whether you still want to use those predictions/scores. If you find that most of the variants have `ref_match = False`, it may be that you have specified the wrong reference genome version---please check this before proceeding.  
 
 ### _In silico_ mutagenesis
-An example configuration for _in silico_ mutagenesis when using a single sequence as input:
+An example configuration for _in silico_ mutagenesis of the whole sequence (rather than a subsequence) when using a single sequence as input:
 ```YAML
 in_silico_mutagenesis: {
     input_sequence: ATCGATAAAATTCTGGAG...,
     save_data: [predictions, diffs],
     output_path_prefix: /path/to/output/dir/filename_prefix,
-    mutate_n_bases: 1
+    mutate_n_bases: 1,
+    start_position: 0,
+    end_position: None
 }
 ```
 
@@ -318,15 +320,19 @@ in_silico_mutagenesis: {
 - `save_data`: A list of the data files to output. Must input 1 or more of the following options: `[abs_diffs, diffs, logits, predictions]`. (Note that the raw prediction values will not be outputted by default---you must specify `predictions` in the list if you want them.)
 - `output_path_prefix`: Optional, default is "ism". The path to which the data files are written. We have specified that it should be a filename _prefix_ because we will append additional information depending on what files you would like to output (e.g. `fileprefix_logits.tsv`) If directories in the path do not yet exist, they will automatically be created. 
 - `mutate_n_bases`: Optional, default is 1. The number of bases to mutate at any time. Standard _in silico_ mutagenesis only mutates a single base at a time, so we encourage users to start by leaving this value at 1. Double/triple mutations will be more difficult to interpret and are something we may work on in the future. 
+- `start_position`: Optional, default is 0. The starting position of the subsequence that should be mutated. This value should be nonnegative, and less than `end_position`. Also, the value of `end_position - start_position` should be at least `mutate_n_bases`.
+- `end_position`: Optional, default is `None`. The ending position (exclusive) of the subsequence that should be mutated. If left as `None`, Selene will use the `sequence_length` parameter passed to `analyze_sequences`. This value should be nonnegative, and greater than `start_position`. The value of `end_position - start_position` should be at least `mutate_n_bases`.
 
-An example configuration for _in silico_ mutagenesis when using a FASTA file as input:
+An example configuration for _in silico_ mutagenesis of the center 100 bases of a 1000 base sequence read from a FASTA file input:
 ```YAML
 in_silico_mutagenesis: {
-    input_path: /path/to/sequences1.fa, 
+    input_path: /path/to/sequences1.fa,
     save_data: [logits],
     output_dir: /path/to/output/predictions/dir,
     mutate_n_bases: 1,
-    use_sequence_name: True
+    use_sequence_name: True,
+    start_position: 450,
+    end_position: 550
 }
 ```
 
@@ -338,6 +344,8 @@ in_silico_mutagenesis: {
 - `use_sequence_name`: Optional, default is `True`.
   - If `use_sequence_name`, output files are prefixed by the sequence name/description corresponding to each sequence in the FASTA file. Spaces in the description are replaced with underscores '_'.
   - If not `use_sequence_name`, output files are prefixed with the index `i` corresponding to the `i`th sequence in the FASTA file.
+- `start_position`: Optional, default is 0. The starting position of the subsequence that should be mutated. This value should be nonnegative, and less than `end_position`. The value of `end_position - start_position` should be at least `mutate_n_bases`.
+- `end_position`: Optional, default is `None`. The ending position (exclusive) of the subsequence that should be mutated. If left as `None`, Selene will use the `sequence_length` parameter passed to `analyze_sequences`. This value should be nonnegative, and greater than `start_position`. The value of `end_position - start_position` should be at least `mutate_n_bases`.
 
 ## Sampler configurations
 Data sampling is used during model training and evaluation. You must specify the sampler in the configuration YAML file alongside the other operation-specific configurations (i.e. `train_model` or `evaluate_model`). 
diff --git a/selene_sdk/predict/_in_silico_mutagenesis.py b/selene_sdk/predict/_in_silico_mutagenesis.py
@@ -7,7 +7,9 @@
 
 def in_silico_mutagenesis_sequences(sequence,
                                     mutate_n_bases=1,
-                                    reference_sequence=Genome):
+                                    reference_sequence=Genome,
+                                    start_position=0,
+                                    end_position=None):
     """
     Creates a list containing each mutation that occurs from an
     *in silico* mutagenesis across the whole sequence.
@@ -26,6 +28,13 @@ def in_silico_mutagenesis_sequences(sequence,
     reference_sequence : class, optional
         Default is `selene_sdk.sequences.Genome`. The type of sequence
         that has been passed in.
+    start_position : int, optional
+        Default is 0. The starting position of the subsequence to be
+        mutated.
+    end_position : int or None, optional
+        Default is None. The ending position of the subsequence to be
+        mutated. If left as `None`, then `len(sequence)` will be
+        used.
 
     Returns
     -------
@@ -39,7 +48,46 @@ def in_silico_mutagenesis_sequences(sequence,
         we return a list with length of 3000-4000, depending on the number of
         unknown bases in the input sequences.
 
+    Raises
+    ------
+    ValueError
+        If the value of `start_position` or `end_position` is negative.
+    ValueError
+        If there are fewer than `mutate_n_bases` positions between
+        `start_position` and `end_position`.
+    ValueError
+        If `start_position` is greater than or equal to `end_position`.
+    ValueError
+        If `start_position` is not less than `len(sequence)`.
+    ValueError
+        If `end_position` is greater than `len(sequence)`.
+
     """
+    if end_position is None:
+        end_position = len(sequence)
+    if start_position >= end_position:
+        raise ValueError(("Starting positions must be less than the ending "
+                          "positions. Found a starting position of {0} with "
+                          "an ending position of {1}.").format(start_position,
+                                                               end_position))
+    if start_position < 0:
+        raise ValueError("Negative starting positions are not supported.")
+    if end_position < 0:
+        raise ValueError("Negative ending positions are not supported.")
+    if start_position >= len(sequence):
+        raise ValueError(("Starting positions must be less than the sequence length."
+                          " Found a starting position of {0} with a sequence length "
+                          "of {1}.").format(start_position, len(sequence)))
+    if end_position > len(sequence):
+        raise ValueError(("Ending positions must be less than or equal to the sequence "
+                          "length. Found an ending position of {0} with a sequence "
+                          "length of {1}.").format(end_position, len(sequence)))
+    if (end_position - start_position) < mutate_n_bases:
+        raise ValueError(("Fewer bases exist in the substring specified by the starting "
+                          "and ending positions than need to be mutated. There are only "
+                          "{0} currently, but {1} bases must be mutated at a "
+                          "time").format(end_position - start_position, mutate_n_bases))
+
     sequence_alts = []
     for index, ref in enumerate(sequence):
         alts = []
@@ -50,7 +98,7 @@ def in_silico_mutagenesis_sequences(sequence,
         sequence_alts.append(alts)
     all_mutated_sequences = []
     for indices in itertools.combinations(
-            range(len(sequence)), mutate_n_bases):
+            range(start_position, end_position), mutate_n_bases):
         pos_mutations = []
         for i in indices:
             pos_mutations.append(sequence_alts[i])
diff --git a/selene_sdk/predict/model_predict.py b/selene_sdk/predict/model_predict.py
@@ -654,7 +654,9 @@ def in_silico_mutagenesis(self,
                               save_data,
                               output_path_prefix="ism",
                               mutate_n_bases=1,
-                              output_format="tsv"):
+                              output_format="tsv",
+                              start_position=0,
+                              end_position=None):
         """
         Applies *in silico* mutagenesis to a sequence.
 
@@ -674,6 +676,13 @@ def in_silico_mutagenesis(self,
             optimized operations for double and triple mutations.
         output_format : {'tsv', 'hdf5'}, optional
             Default is 'tsv'. The desired output format.
+        start_position : int, optional
+            Default is 0. The starting position of the subsequence to be
+            mutated.
+        end_position : int or None, optional
+            Default is None. The ending position of the subsequence to be
+            mutated. If left as `None`, then `self.sequence_length` will be
+            used.
 
         Returns
         -------
@@ -683,7 +692,46 @@ def in_silico_mutagenesis(self,
             file named `*_ref_predictions.h5` will be outputted with the
             model prediction for the original input sequence.
 
+        Raises
+        ------
+        ValueError
+            If the value of `start_position` or `end_position` is negative.
+        ValueError
+            If there are fewer than `mutate_n_bases` positions between
+            `start_position` and `end_position`.
+        ValueError
+            If `start_position` is greater than or equal to `end_position`.
+        ValueError
+            If `start_position` is not less than `self.sequence_length`.
+        ValueError
+            If `end_position` is greater than `self.sequence_length`.
+
         """
+        if end_position is None:
+            end_position = self.sequence_length
+        if start_position >= end_position:
+            raise ValueError(("Starting positions must be less than the ending "
+                              "positions. Found a starting position of {0} with "
+                              "an ending position of {1}.").format(start_position,
+                                                                   end_position))
+        if start_position < 0:
+            raise ValueError("Negative starting positions are not supported.")
+        if end_position < 0:
+            raise ValueError("Negative ending positions are not supported.")
+        if start_position >= self.sequence_length:
+            raise ValueError(("Starting positions must be less than the sequence length."
+                              " Found a starting position of {0} with a sequence length "
+                              "of {1}.").format(start_position, self.sequence_length))
+        if end_position > self.sequence_length:
+            raise ValueError(("Ending positions must be less than or equal to the sequence "
+                              "length. Found an ending position of {0} with a sequence "
+                              "length of {1}.").format(end_position, self.sequence_length))
+        if (end_position - start_position) < mutate_n_bases:
+            raise ValueError(("Fewer bases exist in the substring specified by the starting "
+                              "and ending positions than need to be mutated. There are only "
+                              "{0} currently, but {1} bases must be mutated at a "
+                              "time").format(end_position - start_position, mutate_n_bases))
+
         path_dirs, _ = os.path.split(output_path_prefix)
         if path_dirs:
             os.makedirs(path_dirs, exist_ok=True)
@@ -704,7 +752,9 @@ def in_silico_mutagenesis(self,
         sequence = str.upper(sequence)
         mutated_sequences = in_silico_mutagenesis_sequences(
             sequence, mutate_n_bases=1,
-            reference_sequence=self.reference_sequence)
+            reference_sequence=self.reference_sequence,
+            start_position=start_position,
+            end_position=end_position)
         reporters = self._initialize_reporters(
             save_data,
             output_path_prefix,
@@ -744,7 +794,9 @@ def in_silico_mutagenesis_from_file(self,
                                         output_dir,
                                         mutate_n_bases=1,
                                         use_sequence_name=True,
-                                        output_format="tsv"):
+                                        output_format="tsv",
+                                        start_position=0,
+                                        end_position=None):
         """
         Apply *in silico* mutagenesis to all sequences in a FASTA file.
 
@@ -776,6 +828,16 @@ def in_silico_mutagenesis_from_file(self,
             the FASTA file will have its own set of output files, where
             the number of output files depends on the number of `save_data`
             predictions/scores specified.
+        start_position : int, optional
+            Default is 0. The starting position of the subsequence to be
+            mutated.
+        end_position : int or None, optional
+            Default is None. The ending position of the subsequence to be
+            mutated. If left as `None`, then `self.sequence_length` will be
+            used.
+
+
+
 
         Returns
         -------
@@ -785,7 +847,46 @@ def in_silico_mutagenesis_from_file(self,
             file named `*_ref_predictions.h5` will be outputted with the
             model prediction for the original input sequence.
 
+        Raises
+        ------
+        ValueError
+            If the value of `start_position` or `end_position` is negative.
+        ValueError
+            If there are fewer than `mutate_n_bases` positions between
+            `start_position` and `end_position`.
+        ValueError
+            If `start_position` is greater than or equal to `end_position`.
+        ValueError
+            If `start_position` is not less than `self.sequence_length`.
+        ValueError
+            If `end_position` is greater than `self.sequence_length`.
+
         """
+        if end_position is None:
+            end_position = self.sequence_length
+        if start_position >= end_position:
+            raise ValueError(("Starting positions must be less than the ending "
+                              "positions. Found a starting position of {0} with "
+                              "an ending position of {1}.").format(start_position,
+                                                                   end_position))
+        if start_position < 0:
+            raise ValueError("Negative starting positions are not supported.")
+        if end_position < 0:
+            raise ValueError("Negative ending positions are not supported.")
+        if start_position >= self.sequence_length:
+            raise ValueError(("Starting positions must be less than the sequence length."
+                              " Found a starting position of {0} with a sequence length "
+                              "of {1}.").format(start_position, self.sequence_length))
+        if end_position > self.sequence_length:
+            raise ValueError(("Ending positions must be less than or equal to the sequence "
+                              "length. Found an ending position of {0} with a sequence "
+                              "length of {1}.").format(end_position, self.sequence_length))
+        if (end_position - start_position) < mutate_n_bases:
+            raise ValueError(("Fewer bases exist in the substring specified by the starting "
+                              "and ending positions than need to be mutated. There are only "
+                              "{0} currently, but {1} bases must be mutated at a "
+                              "time").format(end_position - start_position, mutate_n_bases))
+
         os.makedirs(output_dir, exist_ok=True)
 
         fasta_file = pyfaidx.Fasta(input_path)
@@ -803,7 +904,9 @@ def in_silico_mutagenesis_from_file(self,
             mutated_sequences = in_silico_mutagenesis_sequences(
                 cur_sequence,
                 mutate_n_bases=mutate_n_bases,
-                reference_sequence=self.reference_sequence)
+                reference_sequence=self.reference_sequence,
+                start_position=start_position,
+                end_position=end_position)
             cur_sequence_encoding = self.reference_sequence.sequence_to_encoding(
                 cur_sequence)
             base_encoding = cur_sequence_encoding.reshape(
diff --git a/selene_sdk/predict/tests/test_model_predict.py b/selene_sdk/predict/tests/test_model_predict.py
@@ -21,9 +21,19 @@ def test_in_silico_muta_sequences_single(self):
         expected_lists = [[e] for e in expected]
         self.assertListEqual(observed, expected_lists)
 
+    def test_in_silico_muta_sequences_single_subset_positions(self):
+        observed = in_silico_mutagenesis_sequences("ATCCG", start_position=1, end_position=4)
+        expected = [
+            (1, 'A'), (1, 'C'), (1, 'G'),
+            (2, 'A'), (2, 'G'), (2, 'T'),
+            (3, 'A'), (3, 'G'), (3, 'T')]
+
+        expected_lists = [[e] for e in expected]
+        self.assertListEqual(observed, expected_lists)
+
     def test_in_silico_muta_sequences_double(self):
         observed = in_silico_mutagenesis_sequences(
-            "ATC", mutate_n_bases=2)
+            "ATC", mutate_n_bases=2, start_position=0, end_position=3)
         expected = [
             [(0, 'C'), (1, 'A')], [(0, 'G'), (1, 'A')], [(0, 'T'), (1, 'A')],
             [(0, 'C'), (1, 'C')], [(0, 'G'), (1, 'C')], [(0, 'T'), (1, 'C')],
@@ -39,5 +49,16 @@ def test_in_silico_muta_sequences_double(self):
         ]
         self.assertCountEqual(observed, expected)
 
+    def test_in_silico_muta_sequences_double_subset_positions(self):
+        observed = in_silico_mutagenesis_sequences(
+            "ATCG", mutate_n_bases=2, start_position=1, end_position=3)
+        expected = [
+            [(1, 'A'), (2, 'A')], [(1, 'C'), (2, 'A')], [(1, 'G'), (2, 'A')],
+            [(1, 'A'), (2, 'G')], [(1, 'C'), (2, 'G')], [(1, 'G'), (2, 'G')],
+            [(1, 'A'), (2, 'T')], [(1, 'C'), (2, 'T')], [(1, 'G'), (2, 'T')],
+        ]
+        self.assertCountEqual(observed, expected)
+
+
 if __name__ == "__main__":
     unittest.main()