|
16 | 16 | }, |
17 | 17 | { |
18 | 18 | "cell_type": "code", |
19 | | - "execution_count": 1, |
| 19 | + "execution_count": null, |
20 | 20 | "metadata": {}, |
21 | 21 | "outputs": [], |
22 | 22 | "source": [ |
| 23 | + "import csv\n", |
23 | 24 | "import os\n", |
24 | 25 | "import re\n", |
25 | | - "import csv\n", |
| 26 | + "\n", |
26 | 27 | "import pandas as pd\n", |
27 | 28 | "from Bio import Entrez, SeqIO" |
28 | 29 | ] |
|
98 | 99 | }, |
99 | 100 | { |
100 | 101 | "cell_type": "code", |
101 | | - "execution_count": 3, |
| 102 | + "execution_count": null, |
102 | 103 | "metadata": {}, |
103 | 104 | "outputs": [], |
104 | 105 | "source": [ |
|
118 | 119 | " handle = Entrez.efetch(\n", |
119 | 120 | " db=\"nucleotide\", id=accession, rettype=format, retmode=\"text\"\n", |
120 | 121 | " )\n", |
121 | | - " records = list(SeqIO.parse(handle, \"fasta\")) # Use parse() instead of read()\n", |
| 122 | + " records = list(\n", |
| 123 | + " SeqIO.parse(handle, \"fasta\")\n", |
| 124 | + " ) # Use parse() instead of read()\n", |
122 | 125 | " handle.close()\n", |
123 | 126 | "\n", |
124 | 127 | " if records:\n", |
125 | | - " output_path = os.path.join(output_dir, f\"{accession.split('.')[0]}.{extension}\")\n", |
| 128 | + " output_path = os.path.join(\n", |
| 129 | + " output_dir, f\"{accession.split('.')[0]}.{extension}\"\n", |
| 130 | + " )\n", |
126 | 131 | " SeqIO.write(records, output_path, \"fasta\")\n", |
127 | 132 | " print(f\"Downloaded: {accession}\")\n", |
128 | 133 | " else:\n", |
|
144 | 149 | "metadata": {}, |
145 | 150 | "outputs": [], |
146 | 151 | "source": [ |
147 | | - "email = \"sample@email.com\" #ENTER YOUR EMAIL\n", |
| 152 | + "email = \"sample@email.com\" # ENTER YOUR EMAIL\n", |
148 | 153 | "accession_numbers = \"pangenome/data/accession_numbers.txt\"" |
149 | 154 | ] |
150 | 155 | }, |
|
161 | 166 | "metadata": {}, |
162 | 167 | "outputs": [], |
163 | 168 | "source": [ |
164 | | - "get_sequences(email,\n", |
165 | | - " accession_numbers,\n", |
166 | | - " \"pangenome/Annotation/Genes\",\n", |
167 | | - " format = \"fasta_cds_na\",\n", |
168 | | - " extension = \"gen\")" |
| 169 | + "get_sequences(\n", |
| 170 | + " email,\n", |
| 171 | + " accession_numbers,\n", |
| 172 | + " \"pangenome/Annotation/Genes\",\n", |
| 173 | + " format=\"fasta_cds_na\",\n", |
| 174 | + " extension=\"gen\",\n", |
| 175 | + ")" |
169 | 176 | ] |
170 | 177 | }, |
171 | 178 | { |
|
181 | 188 | "metadata": {}, |
182 | 189 | "outputs": [], |
183 | 190 | "source": [ |
184 | | - "get_sequences(email,\n", |
185 | | - " accession_numbers,\n", |
186 | | - " \"pangenome/Annotation/Proteins_classic\",\n", |
187 | | - " format = \"fasta_cds_aa\",\n", |
188 | | - " extension = \"prt\")" |
| 191 | + "get_sequences(\n", |
| 192 | + " email,\n", |
| 193 | + " accession_numbers,\n", |
| 194 | + " \"pangenome/Annotation/Proteins_classic\",\n", |
| 195 | + " format=\"fasta_cds_aa\",\n", |
| 196 | + " extension=\"prt\",\n", |
| 197 | + ")" |
189 | 198 | ] |
190 | 199 | }, |
191 | 200 | { |
|
243 | 252 | " print(\"Genes directory does not exist.\")\n", |
244 | 253 | " exit(1)\n", |
245 | 254 | "\n", |
| 255 | + "\n", |
246 | 256 | "# Function to extract FASTA sequences\n", |
247 | 257 | "def read_fasta(file_path):\n", |
248 | 258 | " sequences = []\n", |
|
262 | 272 | " sequences.append((header, \"\\n\".join(seq))) # Append last sequence\n", |
263 | 273 | " return sequences\n", |
264 | 274 | "\n", |
| 275 | + "\n", |
265 | 276 | "# Function to write updated FASTA sequences\n", |
266 | 277 | "def write_fasta(file_path, sequences):\n", |
267 | 278 | " with open(file_path, \"w\") as f:\n", |
268 | 279 | " for header, seq in sequences:\n", |
269 | 280 | " f.write(f\"{header}\\n{seq}\\n\")\n", |
270 | 281 | "\n", |
| 282 | + "\n", |
271 | 283 | "# Process all .prt files in Proteins directory\n", |
272 | 284 | "for prt_file in os.listdir(proteins_dir):\n", |
273 | 285 | " if prt_file.endswith(\".prt\"):\n", |
274 | 286 | " # Get corresponding .gen file\n", |
275 | 287 | " base_name = os.path.splitext(prt_file)[0] # Remove .prt extension\n", |
276 | 288 | " gen_file = f\"{base_name}.gen\"\n", |
277 | | - " \n", |
| 289 | + "\n", |
278 | 290 | " prt_path = os.path.join(proteins_dir, prt_file)\n", |
279 | 291 | " gen_path = os.path.join(genes_dir, gen_file)\n", |
280 | 292 | "\n", |
|
289 | 301 | "\n", |
290 | 302 | " # Ensure both files have the same number of sequences\n", |
291 | 303 | " if len(prt_seqs) != len(gen_seqs):\n", |
292 | | - " print(f\"Skipping {gen_file} (mismatch: {len(prt_seqs)} protein seqs vs {len(gen_seqs)} gene seqs)\")\n", |
| 304 | + " print(\n", |
| 305 | + " f\"Skipping {gen_file} (mismatch: {len(prt_seqs)} protein seqs vs {len(gen_seqs)} gene seqs)\"\n", |
| 306 | + " )\n", |
293 | 307 | " continue\n", |
294 | 308 | "\n", |
295 | 309 | " # Replace headers in .gen file\n", |
296 | | - " updated_gen_seqs = [(prt_seqs[i][0], gen_seqs[i][1]) for i in range(len(gen_seqs))]\n", |
| 310 | + " updated_gen_seqs = [\n", |
| 311 | + " (prt_seqs[i][0], gen_seqs[i][1]) for i in range(len(gen_seqs))\n", |
| 312 | + " ]\n", |
297 | 313 | "\n", |
298 | 314 | " # Write updated .gen file\n", |
299 | 315 | " write_fasta(gen_path, updated_gen_seqs)\n", |
|
653 | 669 | }, |
654 | 670 | { |
655 | 671 | "cell_type": "code", |
656 | | - "execution_count": 6, |
| 672 | + "execution_count": null, |
657 | 673 | "metadata": {}, |
658 | 674 | "outputs": [ |
659 | 675 | { |
|
665 | 681 | } |
666 | 682 | ], |
667 | 683 | "source": [ |
668 | | - "fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.6.aln'\n", |
669 | | - "output_fasta_file = 'pangenome/Alignment/MSAs/nad4l.aln'\n", |
| 684 | + "fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.6.aln\"\n", |
| 685 | + "output_fasta_file = \"pangenome/Alignment/MSAs/nad4l.aln\"\n", |
670 | 686 | "\n", |
671 | 687 | "process_msa(fasta_file, output_fasta_file)" |
672 | 688 | ] |
|
707 | 723 | } |
708 | 724 | ], |
709 | 725 | "source": [ |
710 | | - "fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.32.aln'\n", |
711 | | - "output_fasta_file = 'pangenome/Alignment/MSAs/cox2.aln'\n", |
| 726 | + "fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.32.aln\"\n", |
| 727 | + "output_fasta_file = \"pangenome/Alignment/MSAs/cox2.aln\"\n", |
712 | 728 | "\n", |
713 | 729 | "process_msa(fasta_file, output_fasta_file)" |
714 | 730 | ] |
|
749 | 765 | } |
750 | 766 | ], |
751 | 767 | "source": [ |
752 | | - "fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.130.aln'\n", |
753 | | - "output_fasta_file = 'pangenome/Alignment/MSAs/cob.aln'\n", |
| 768 | + "fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.130.aln\"\n", |
| 769 | + "output_fasta_file = \"pangenome/Alignment/MSAs/cob.aln\"\n", |
754 | 770 | "\n", |
755 | 771 | "process_msa(fasta_file, output_fasta_file)" |
756 | 772 | ] |
|
779 | 795 | }, |
780 | 796 | { |
781 | 797 | "cell_type": "code", |
782 | | - "execution_count": 12, |
| 798 | + "execution_count": null, |
783 | 799 | "metadata": {}, |
784 | 800 | "outputs": [ |
785 | 801 | { |
|
791 | 807 | } |
792 | 808 | ], |
793 | 809 | "source": [ |
794 | | - "fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.565.aln'\n", |
795 | | - "output_fasta_file = 'pangenome/Alignment/MSAs/cox1.aln'\n", |
| 810 | + "fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.565.aln\"\n", |
| 811 | + "output_fasta_file = \"pangenome/Alignment/MSAs/cox1.aln\"\n", |
796 | 812 | "\n", |
797 | 813 | "process_msa(fasta_file, output_fasta_file)" |
798 | 814 | ] |
|
1063 | 1079 | }, |
1064 | 1080 | { |
1065 | 1081 | "cell_type": "code", |
1066 | | - "execution_count": 11, |
| 1082 | + "execution_count": null, |
1067 | 1083 | "metadata": {}, |
1068 | 1084 | "outputs": [], |
1069 | 1085 | "source": [ |
1070 | 1086 | "def modify_tree_file(input_file, output_file):\n", |
1071 | 1087 | " \"\"\"\n", |
1072 | | - " Reads a Newick tree from a file, adds \".1\" to all accession numbers, \n", |
| 1088 | + " Reads a Newick tree from a file, adds \".1\" to all accession numbers,\n", |
1073 | 1089 | " and writes the modified tree to an output file.\n", |
1074 | 1090 | "\n", |
1075 | 1091 | " Args:\n", |
|
1085 | 1101 | " tree_str = infile.read().strip()\n", |
1086 | 1102 | "\n", |
1087 | 1103 | " # Modify tree\n", |
1088 | | - " modified_tree = re.sub(r'NC_\\d+', add_suffix, tree_str)\n", |
| 1104 | + " modified_tree = re.sub(r\"NC_\\d+\", add_suffix, tree_str)\n", |
1089 | 1105 | "\n", |
1090 | 1106 | " # Write modified tree to output file\n", |
1091 | 1107 | " with open(output_file, \"w\") as outfile:\n", |
|
1185 | 1201 | " df = pd.read_csv(input_file, sep=\"\\t\")\n", |
1186 | 1202 | "\n", |
1187 | 1203 | " # Extract the last 4 digits of the 'Year' column\n", |
1188 | | - " df['Year'] = df['Year'].apply(lambda x: str(x)[-4:] if pd.notnull(x) else 'ND')\n", |
| 1204 | + " df[\"Year\"] = df[\"Year\"].apply(lambda x: str(x)[-4:] if pd.notnull(x) else \"ND\")\n", |
1189 | 1205 | "\n", |
1190 | 1206 | " # Save the updated DataFrame to a new .tsv file\n", |
1191 | 1207 | " df.to_csv(output_file, sep=\"\\t\", index=False)\n", |
|
1199 | 1215 | "metadata": {}, |
1200 | 1216 | "outputs": [], |
1201 | 1217 | "source": [ |
1202 | | - "clean_year('metadata/raw_metadata.tsv',\n", |
1203 | | - " 'metadata/metadata.tsv')" |
| 1218 | + "clean_year(\"metadata/raw_metadata.tsv\", \"metadata/metadata.tsv\")" |
1204 | 1219 | ] |
1205 | 1220 | }, |
1206 | 1221 | { |
|
0 commit comments