Skip to content

Commit 8298041

Browse files
committed
style(notebooks): apply Ruff formatter and import sorting
1 parent 2b6e956 commit 8298041

6 files changed

Lines changed: 347 additions & 187 deletions

File tree

01_PanPhylo_analysis/01_pangenome.ipynb

Lines changed: 50 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,14 @@
1616
},
1717
{
1818
"cell_type": "code",
19-
"execution_count": 1,
19+
"execution_count": null,
2020
"metadata": {},
2121
"outputs": [],
2222
"source": [
23+
"import csv\n",
2324
"import os\n",
2425
"import re\n",
25-
"import csv\n",
26+
"\n",
2627
"import pandas as pd\n",
2728
"from Bio import Entrez, SeqIO"
2829
]
@@ -98,7 +99,7 @@
9899
},
99100
{
100101
"cell_type": "code",
101-
"execution_count": 3,
102+
"execution_count": null,
102103
"metadata": {},
103104
"outputs": [],
104105
"source": [
@@ -118,11 +119,15 @@
118119
" handle = Entrez.efetch(\n",
119120
" db=\"nucleotide\", id=accession, rettype=format, retmode=\"text\"\n",
120121
" )\n",
121-
" records = list(SeqIO.parse(handle, \"fasta\")) # Use parse() instead of read()\n",
122+
" records = list(\n",
123+
" SeqIO.parse(handle, \"fasta\")\n",
124+
" ) # Use parse() instead of read()\n",
122125
" handle.close()\n",
123126
"\n",
124127
" if records:\n",
125-
" output_path = os.path.join(output_dir, f\"{accession.split('.')[0]}.{extension}\")\n",
128+
" output_path = os.path.join(\n",
129+
" output_dir, f\"{accession.split('.')[0]}.{extension}\"\n",
130+
" )\n",
126131
" SeqIO.write(records, output_path, \"fasta\")\n",
127132
" print(f\"Downloaded: {accession}\")\n",
128133
" else:\n",
@@ -144,7 +149,7 @@
144149
"metadata": {},
145150
"outputs": [],
146151
"source": [
147-
"email = \"sample@email.com\" #ENTER YOUR EMAIL\n",
152+
"email = \"sample@email.com\" # ENTER YOUR EMAIL\n",
148153
"accession_numbers = \"pangenome/data/accession_numbers.txt\""
149154
]
150155
},
@@ -161,11 +166,13 @@
161166
"metadata": {},
162167
"outputs": [],
163168
"source": [
164-
"get_sequences(email,\n",
165-
" accession_numbers,\n",
166-
" \"pangenome/Annotation/Genes\",\n",
167-
" format = \"fasta_cds_na\",\n",
168-
" extension = \"gen\")"
169+
"get_sequences(\n",
170+
" email,\n",
171+
" accession_numbers,\n",
172+
" \"pangenome/Annotation/Genes\",\n",
173+
" format=\"fasta_cds_na\",\n",
174+
" extension=\"gen\",\n",
175+
")"
169176
]
170177
},
171178
{
@@ -181,11 +188,13 @@
181188
"metadata": {},
182189
"outputs": [],
183190
"source": [
184-
"get_sequences(email,\n",
185-
" accession_numbers,\n",
186-
" \"pangenome/Annotation/Proteins_classic\",\n",
187-
" format = \"fasta_cds_aa\",\n",
188-
" extension = \"prt\")"
191+
"get_sequences(\n",
192+
" email,\n",
193+
" accession_numbers,\n",
194+
" \"pangenome/Annotation/Proteins_classic\",\n",
195+
" format=\"fasta_cds_aa\",\n",
196+
" extension=\"prt\",\n",
197+
")"
189198
]
190199
},
191200
{
@@ -243,6 +252,7 @@
243252
" print(\"Genes directory does not exist.\")\n",
244253
" exit(1)\n",
245254
"\n",
255+
"\n",
246256
"# Function to extract FASTA sequences\n",
247257
"def read_fasta(file_path):\n",
248258
" sequences = []\n",
@@ -262,19 +272,21 @@
262272
" sequences.append((header, \"\\n\".join(seq))) # Append last sequence\n",
263273
" return sequences\n",
264274
"\n",
275+
"\n",
265276
"# Function to write updated FASTA sequences\n",
266277
"def write_fasta(file_path, sequences):\n",
267278
" with open(file_path, \"w\") as f:\n",
268279
" for header, seq in sequences:\n",
269280
" f.write(f\"{header}\\n{seq}\\n\")\n",
270281
"\n",
282+
"\n",
271283
"# Process all .prt files in Proteins directory\n",
272284
"for prt_file in os.listdir(proteins_dir):\n",
273285
" if prt_file.endswith(\".prt\"):\n",
274286
" # Get corresponding .gen file\n",
275287
" base_name = os.path.splitext(prt_file)[0] # Remove .prt extension\n",
276288
" gen_file = f\"{base_name}.gen\"\n",
277-
" \n",
289+
"\n",
278290
" prt_path = os.path.join(proteins_dir, prt_file)\n",
279291
" gen_path = os.path.join(genes_dir, gen_file)\n",
280292
"\n",
@@ -289,11 +301,15 @@
289301
"\n",
290302
" # Ensure both files have the same number of sequences\n",
291303
" if len(prt_seqs) != len(gen_seqs):\n",
292-
" print(f\"Skipping {gen_file} (mismatch: {len(prt_seqs)} protein seqs vs {len(gen_seqs)} gene seqs)\")\n",
304+
" print(\n",
305+
" f\"Skipping {gen_file} (mismatch: {len(prt_seqs)} protein seqs vs {len(gen_seqs)} gene seqs)\"\n",
306+
" )\n",
293307
" continue\n",
294308
"\n",
295309
" # Replace headers in .gen file\n",
296-
" updated_gen_seqs = [(prt_seqs[i][0], gen_seqs[i][1]) for i in range(len(gen_seqs))]\n",
310+
" updated_gen_seqs = [\n",
311+
" (prt_seqs[i][0], gen_seqs[i][1]) for i in range(len(gen_seqs))\n",
312+
" ]\n",
297313
"\n",
298314
" # Write updated .gen file\n",
299315
" write_fasta(gen_path, updated_gen_seqs)\n",
@@ -653,7 +669,7 @@
653669
},
654670
{
655671
"cell_type": "code",
656-
"execution_count": 6,
672+
"execution_count": null,
657673
"metadata": {},
658674
"outputs": [
659675
{
@@ -665,8 +681,8 @@
665681
}
666682
],
667683
"source": [
668-
"fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.6.aln'\n",
669-
"output_fasta_file = 'pangenome/Alignment/MSAs/nad4l.aln'\n",
684+
"fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.6.aln\"\n",
685+
"output_fasta_file = \"pangenome/Alignment/MSAs/nad4l.aln\"\n",
670686
"\n",
671687
"process_msa(fasta_file, output_fasta_file)"
672688
]
@@ -707,8 +723,8 @@
707723
}
708724
],
709725
"source": [
710-
"fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.32.aln'\n",
711-
"output_fasta_file = 'pangenome/Alignment/MSAs/cox2.aln'\n",
726+
"fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.32.aln\"\n",
727+
"output_fasta_file = \"pangenome/Alignment/MSAs/cox2.aln\"\n",
712728
"\n",
713729
"process_msa(fasta_file, output_fasta_file)"
714730
]
@@ -749,8 +765,8 @@
749765
}
750766
],
751767
"source": [
752-
"fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.130.aln'\n",
753-
"output_fasta_file = 'pangenome/Alignment/MSAs/cob.aln'\n",
768+
"fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.130.aln\"\n",
769+
"output_fasta_file = \"pangenome/Alignment/MSAs/cob.aln\"\n",
754770
"\n",
755771
"process_msa(fasta_file, output_fasta_file)"
756772
]
@@ -779,7 +795,7 @@
779795
},
780796
{
781797
"cell_type": "code",
782-
"execution_count": 12,
798+
"execution_count": null,
783799
"metadata": {},
784800
"outputs": [
785801
{
@@ -791,8 +807,8 @@
791807
}
792808
],
793809
"source": [
794-
"fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.565.aln'\n",
795-
"output_fasta_file = 'pangenome/Alignment/MSAs/cox1.aln'\n",
810+
"fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.565.aln\"\n",
811+
"output_fasta_file = \"pangenome/Alignment/MSAs/cox1.aln\"\n",
796812
"\n",
797813
"process_msa(fasta_file, output_fasta_file)"
798814
]
@@ -1063,13 +1079,13 @@
10631079
},
10641080
{
10651081
"cell_type": "code",
1066-
"execution_count": 11,
1082+
"execution_count": null,
10671083
"metadata": {},
10681084
"outputs": [],
10691085
"source": [
10701086
"def modify_tree_file(input_file, output_file):\n",
10711087
" \"\"\"\n",
1072-
" Reads a Newick tree from a file, adds \".1\" to all accession numbers, \n",
1088+
" Reads a Newick tree from a file, adds \".1\" to all accession numbers,\n",
10731089
" and writes the modified tree to an output file.\n",
10741090
"\n",
10751091
" Args:\n",
@@ -1085,7 +1101,7 @@
10851101
" tree_str = infile.read().strip()\n",
10861102
"\n",
10871103
" # Modify tree\n",
1088-
" modified_tree = re.sub(r'NC_\\d+', add_suffix, tree_str)\n",
1104+
" modified_tree = re.sub(r\"NC_\\d+\", add_suffix, tree_str)\n",
10891105
"\n",
10901106
" # Write modified tree to output file\n",
10911107
" with open(output_file, \"w\") as outfile:\n",
@@ -1185,7 +1201,7 @@
11851201
" df = pd.read_csv(input_file, sep=\"\\t\")\n",
11861202
"\n",
11871203
" # Extract the last 4 digits of the 'Year' column\n",
1188-
" df['Year'] = df['Year'].apply(lambda x: str(x)[-4:] if pd.notnull(x) else 'ND')\n",
1204+
" df[\"Year\"] = df[\"Year\"].apply(lambda x: str(x)[-4:] if pd.notnull(x) else \"ND\")\n",
11891205
"\n",
11901206
" # Save the updated DataFrame to a new .tsv file\n",
11911207
" df.to_csv(output_file, sep=\"\\t\", index=False)\n",
@@ -1199,8 +1215,7 @@
11991215
"metadata": {},
12001216
"outputs": [],
12011217
"source": [
1202-
"clean_year('metadata/raw_metadata.tsv',\n",
1203-
" 'metadata/metadata.tsv')"
1218+
"clean_year(\"metadata/raw_metadata.tsv\", \"metadata/metadata.tsv\")"
12041219
]
12051220
},
12061221
{

01_PanPhylo_analysis/03_phylogenomics.ipynb

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
},
1717
{
1818
"cell_type": "code",
19-
"execution_count": 1,
19+
"execution_count": null,
2020
"metadata": {},
2121
"outputs": [],
2222
"source": [
@@ -209,14 +209,14 @@
209209
},
210210
{
211211
"cell_type": "code",
212-
"execution_count": 3,
212+
"execution_count": null,
213213
"metadata": {},
214214
"outputs": [],
215215
"source": [
216216
"def rename_sequences(input_dir, output_dir):\n",
217217
" # Create the output directory if it doesn't exist\n",
218218
" os.makedirs(output_dir, exist_ok=True)\n",
219-
" \n",
219+
"\n",
220220
" # Loop through each FASTA file in the directory\n",
221221
" for filename in os.listdir(input_dir):\n",
222222
" if filename.endswith(\".fasta\") or filename.endswith(\".fa\"):\n",
@@ -225,26 +225,26 @@
225225
"\n",
226226
" # Dictionary to keep track of sequence names and their counts\n",
227227
" name_count = {}\n",
228-
" \n",
228+
"\n",
229229
" # List to store the updated sequences\n",
230230
" updated_sequences = []\n",
231-
" \n",
231+
"\n",
232232
" # Read the FASTA file\n",
233233
" for record in SeqIO.parse(filepath, \"fasta\"):\n",
234234
" name = record.id\n",
235-
" \n",
235+
"\n",
236236
" # If the name is already seen, append a count suffix\n",
237237
" if name in name_count:\n",
238238
" name_count[name] += 1\n",
239239
" new_name = f\"{name}_{name_count[name]}\"\n",
240240
" else:\n",
241241
" name_count[name] = 1\n",
242242
" new_name = f\"{name}_1\"\n",
243-
" \n",
243+
"\n",
244244
" # Update the record ID\n",
245245
" record.id = new_name\n",
246246
" record.description = \"\" # Optionally clear the description\n",
247-
" \n",
247+
"\n",
248248
" # Store the updated record\n",
249249
" updated_sequences.append(record)\n",
250250
"\n",
@@ -510,11 +510,13 @@
510510
},
511511
{
512512
"cell_type": "code",
513-
"execution_count": 9,
513+
"execution_count": null,
514514
"metadata": {},
515515
"outputs": [],
516516
"source": [
517-
"df_all = pd.read_csv('phylogenomics/protein_ortho_output/myproject.proteinortho.tsv', sep='\\t')"
517+
"df_all = pd.read_csv(\n",
518+
" \"phylogenomics/protein_ortho_output/myproject.proteinortho.tsv\", sep=\"\\t\"\n",
519+
")"
518520
]
519521
},
520522
{
@@ -1556,11 +1558,13 @@
15561558
},
15571559
{
15581560
"cell_type": "code",
1559-
"execution_count": 13,
1561+
"execution_count": null,
15601562
"metadata": {},
15611563
"outputs": [],
15621564
"source": [
1563-
"df_all.loc[1] = df_all.loc[1].apply(lambda x: x.replace(',', '*') if isinstance(x, str) else x)"
1565+
"df_all.loc[1] = df_all.loc[1].apply(\n",
1566+
" lambda x: x.replace(\",\", \"*\") if isinstance(x, str) else x\n",
1567+
")"
15641568
]
15651569
},
15661570
{
@@ -2049,11 +2053,13 @@
20492053
},
20502054
{
20512055
"cell_type": "code",
2052-
"execution_count": 15,
2056+
"execution_count": null,
20532057
"metadata": {},
20542058
"outputs": [],
20552059
"source": [
2056-
"df_all_cols_to_drop = df_all.columns[df_all.apply(lambda col: col.astype(str).str.contains(r'\\*', regex=True).any())]"
2060+
"df_all_cols_to_drop = df_all.columns[\n",
2061+
" df_all.apply(lambda col: col.astype(str).str.contains(r\"\\*\", regex=True).any())\n",
2062+
"]"
20572063
]
20582064
},
20592065
{
@@ -2094,12 +2100,12 @@
20942100
},
20952101
{
20962102
"cell_type": "code",
2097-
"execution_count": 17,
2103+
"execution_count": null,
20982104
"metadata": {},
20992105
"outputs": [],
21002106
"source": [
21012107
"df_all = df_all.drop(columns=df_all_cols_to_drop)\n",
2102-
"df_all.to_csv('phylogenomics/protein_ortho_output/All.tsv', sep='\\t', index=False)"
2108+
"df_all.to_csv(\"phylogenomics/protein_ortho_output/All.tsv\", sep=\"\\t\", index=False)"
21032109
]
21042110
},
21052111
{
@@ -4359,13 +4365,13 @@
43594365
},
43604366
{
43614367
"cell_type": "code",
4362-
"execution_count": 64,
4368+
"execution_count": null,
43634369
"metadata": {},
43644370
"outputs": [],
43654371
"source": [
43664372
"def modify_tree_file(input_file, output_file):\n",
43674373
" \"\"\"\n",
4368-
" Reads a Newick tree from a file, adds \".1\" to all accession numbers, \n",
4374+
" Reads a Newick tree from a file, adds \".1\" to all accession numbers,\n",
43694375
" and writes the modified tree to an output file.\n",
43704376
"\n",
43714377
" Args:\n",
@@ -4381,7 +4387,7 @@
43814387
" tree_str = infile.read().strip()\n",
43824388
"\n",
43834389
" # Modify tree\n",
4384-
" modified_tree = re.sub(r'NC_\\d+', add_suffix, tree_str)\n",
4390+
" modified_tree = re.sub(r\"NC_\\d+\", add_suffix, tree_str)\n",
43854391
"\n",
43864392
" # Write modified tree to output file\n",
43874393
" with open(output_file, \"w\") as outfile:\n",

0 commit comments

Comments
 (0)