-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdna_data_structures.py
More file actions
88 lines (73 loc) · 2.93 KB
/
dna_data_structures.py
File metadata and controls
88 lines (73 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import collections
from typing import Dict, Union
class InvalidNucleotideError(Exception):
    """Error signalling that a DNA sequence contains an invalid nucleotide."""
class DNA:
    """Represent a validated DNA sequence and provide basic statistics.

    The sequence is upper-cased, validated against ``valid_nucleotides`` and
    stored as a string in ``self.sequence``. Nucleotide counts are computed
    once at construction time and cached in ``self._nucleotide_counts``.
    """

    # Symbols accepted by validation; anything else raises an error.
    valid_nucleotides = ['A', 'T', 'C', 'G']

    def __init__(self, sequence: str) -> None:
        """Initialise the DNA sequence.

        Args:
            sequence (str): The DNA sequence. Lower-case letters are accepted
                and normalised to upper case before validation.

        Raises:
            InvalidNucleotideError: If the sequence contains any character
                other than A, T, G or C.
        """
        # Normalise case first so validation and storage agree.
        self.sequence = sequence.upper()
        self._validate_sequence()
        # Count once up front; the statistics methods read this cache.
        self._nucleotide_counts = self._count_nucleotides()

    def _validate_sequence(self) -> None:
        """Ensure the stored sequence only contains A, T, G and C.

        Raises:
            InvalidNucleotideError: Listing (sorted) every offending
                character found in the sequence.
        """
        invalid_nucleotides = set(self.sequence) - set(self.valid_nucleotides)
        if invalid_nucleotides:
            # Explicit separators between the fragments: adjacent string
            # literals are concatenated verbatim, so without them the parts
            # of the message run into each other.
            error_message = (
                "Invalid nucleotides found in DNA sequence: "
                f"{', '.join(sorted(invalid_nucleotides))}. "
                "Only A,T,G,C nucleotides are allowed"
            )
            raise InvalidNucleotideError(error_message)

    def _count_nucleotides(self) -> Dict[str, int]:
        """Count each character of the sequence with ``collections.Counter``.

        Returns:
            A ``Counter`` (dict subclass) mapping nucleotide to occurrences.
        """
        return collections.Counter(self.sequence)

    def get_length(self) -> int:
        """Return the length of the DNA sequence."""
        return len(self.sequence)

    def get_nucleotide_count(self) -> Dict[str, int]:
        """Return the count of each nucleotide in the DNA sequence.

        Returns:
            A copy of the cached counts, so that callers mutating the result
            cannot corrupt the values used by the GC/AT calculations.
        """
        return self._nucleotide_counts.copy()

    def calculate_GC_content(self) -> float:
        """Return the Guanine + Cytosine (GC) percentage, rounded to 2 places.

        Returns:
            0.0 for an empty sequence, otherwise a value in [0, 100].
        """
        total_length = self.get_length()
        if total_length == 0:
            # Avoid dividing by zero on an empty sequence.
            return 0.0
        g_count = self._nucleotide_counts.get('G', 0)
        c_count = self._nucleotide_counts.get('C', 0)
        gc_content = ((g_count + c_count) / total_length) * 100
        return round(gc_content, 2)

    def calculate_AT_content(self) -> float:
        """Return the Adenine + Thymine (AT) percentage, rounded to 2 places.

        Returns:
            0.0 for an empty sequence, otherwise ``100 - GC content``.
        """
        if self.get_length() == 0:
            # An empty sequence has no A/T at all; without this guard the
            # complement of a 0.0 GC content would wrongly report 100.0.
            return 0.0
        return round(100.0 - self.calculate_GC_content(), 2)

    def __str__(self) -> str:
        """Return a human-readable representation of the DNA sequence."""
        return f'DNA sequence: {self.sequence}'