PopovIILab
diff --git a/‎01_PanPhylo_analysis/01_pangenome.ipynb‎
Lines changed: 50 additions & 35 deletions b/‎01_PanPhylo_analysis/01_pangenome.ipynb‎
Lines changed: 50 additions & 35 deletions
diff --git a/‎01_PanPhylo_analysis/03_phylogenomics.ipynb‎
Lines changed: 25 additions & 19 deletions b/‎01_PanPhylo_analysis/03_phylogenomics.ipynb‎
Lines changed: 25 additions & 19 deletions
@@ -16,13 +16,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
+    "import csv\n",
     "import os\n",
     "import re\n",
-    "import csv\n",
+    "\n",
     "import pandas as pd\n",
     "from Bio import Entrez, SeqIO"
    ]
@@ -98,7 +99,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -118,11 +119,15 @@
     "            handle = Entrez.efetch(\n",
     "                db=\"nucleotide\", id=accession, rettype=format, retmode=\"text\"\n",
     "            )\n",
-    "            records = list(SeqIO.parse(handle, \"fasta\"))  # Use parse() instead of read()\n",
+    "            records = list(\n",
+    "                SeqIO.parse(handle, \"fasta\")\n",
+    "            )  # Use parse() instead of read()\n",
     "            handle.close()\n",
     "\n",
     "            if records:\n",
-    "                output_path = os.path.join(output_dir, f\"{accession.split('.')[0]}.{extension}\")\n",
+    "                output_path = os.path.join(\n",
+    "                    output_dir, f\"{accession.split('.')[0]}.{extension}\"\n",
+    "                )\n",
     "                SeqIO.write(records, output_path, \"fasta\")\n",
     "                print(f\"Downloaded: {accession}\")\n",
     "            else:\n",
@@ -144,7 +149,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "email = \"sample@email.com\" #ENTER YOUR EMAIL\n",
+    "email = \"sample@email.com\"  # ENTER YOUR EMAIL\n",
     "accession_numbers = \"pangenome/data/accession_numbers.txt\""
    ]
   },
@@ -161,11 +166,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "get_sequences(email,\n",
-    "              accession_numbers,\n",
-    "              \"pangenome/Annotation/Genes\",\n",
-    "              format = \"fasta_cds_na\",\n",
-    "              extension = \"gen\")"
+    "get_sequences(\n",
+    "    email,\n",
+    "    accession_numbers,\n",
+    "    \"pangenome/Annotation/Genes\",\n",
+    "    format=\"fasta_cds_na\",\n",
+    "    extension=\"gen\",\n",
+    ")"
    ]
   },
   {
@@ -181,11 +188,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "get_sequences(email,\n",
-    "              accession_numbers,\n",
-    "              \"pangenome/Annotation/Proteins_classic\",\n",
-    "              format = \"fasta_cds_aa\",\n",
-    "              extension = \"prt\")"
+    "get_sequences(\n",
+    "    email,\n",
+    "    accession_numbers,\n",
+    "    \"pangenome/Annotation/Proteins_classic\",\n",
+    "    format=\"fasta_cds_aa\",\n",
+    "    extension=\"prt\",\n",
+    ")"
    ]
   },
   {
@@ -243,6 +252,7 @@
     "    print(\"Genes directory does not exist.\")\n",
     "    exit(1)\n",
     "\n",
+    "\n",
     "# Function to extract FASTA sequences\n",
     "def read_fasta(file_path):\n",
     "    sequences = []\n",
@@ -262,19 +272,21 @@
     "            sequences.append((header, \"\\n\".join(seq)))  # Append last sequence\n",
     "    return sequences\n",
     "\n",
+    "\n",
     "# Function to write updated FASTA sequences\n",
     "def write_fasta(file_path, sequences):\n",
     "    with open(file_path, \"w\") as f:\n",
     "        for header, seq in sequences:\n",
     "            f.write(f\"{header}\\n{seq}\\n\")\n",
     "\n",
+    "\n",
     "# Process all .prt files in Proteins directory\n",
     "for prt_file in os.listdir(proteins_dir):\n",
     "    if prt_file.endswith(\".prt\"):\n",
     "        # Get corresponding .gen file\n",
     "        base_name = os.path.splitext(prt_file)[0]  # Remove .prt extension\n",
     "        gen_file = f\"{base_name}.gen\"\n",
-    "        \n",
+    "\n",
     "        prt_path = os.path.join(proteins_dir, prt_file)\n",
     "        gen_path = os.path.join(genes_dir, gen_file)\n",
     "\n",
@@ -289,11 +301,15 @@
     "\n",
     "        # Ensure both files have the same number of sequences\n",
     "        if len(prt_seqs) != len(gen_seqs):\n",
-    "            print(f\"Skipping {gen_file} (mismatch: {len(prt_seqs)} protein seqs vs {len(gen_seqs)} gene seqs)\")\n",
+    "            print(\n",
+    "                f\"Skipping {gen_file} (mismatch: {len(prt_seqs)} protein seqs vs {len(gen_seqs)} gene seqs)\"\n",
+    "            )\n",
     "            continue\n",
     "\n",
     "        # Replace headers in .gen file\n",
-    "        updated_gen_seqs = [(prt_seqs[i][0], gen_seqs[i][1]) for i in range(len(gen_seqs))]\n",
+    "        updated_gen_seqs = [\n",
+    "            (prt_seqs[i][0], gen_seqs[i][1]) for i in range(len(gen_seqs))\n",
+    "        ]\n",
     "\n",
     "        # Write updated .gen file\n",
     "        write_fasta(gen_path, updated_gen_seqs)\n",
@@ -653,7 +669,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -665,8 +681,8 @@
     }
    ],
    "source": [
-    "fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.6.aln'\n",
-    "output_fasta_file = 'pangenome/Alignment/MSAs/nad4l.aln'\n",
+    "fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.6.aln\"\n",
+    "output_fasta_file = \"pangenome/Alignment/MSAs/nad4l.aln\"\n",
     "\n",
     "process_msa(fasta_file, output_fasta_file)"
    ]
@@ -707,8 +723,8 @@
     }
    ],
    "source": [
-    "fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.32.aln'\n",
-    "output_fasta_file = 'pangenome/Alignment/MSAs/cox2.aln'\n",
+    "fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.32.aln\"\n",
+    "output_fasta_file = \"pangenome/Alignment/MSAs/cox2.aln\"\n",
     "\n",
     "process_msa(fasta_file, output_fasta_file)"
    ]
@@ -749,8 +765,8 @@
     }
    ],
    "source": [
-    "fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.130.aln'\n",
-    "output_fasta_file = 'pangenome/Alignment/MSAs/cob.aln'\n",
+    "fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.130.aln\"\n",
+    "output_fasta_file = \"pangenome/Alignment/MSAs/cob.aln\"\n",
     "\n",
     "process_msa(fasta_file, output_fasta_file)"
    ]
@@ -779,7 +795,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -791,8 +807,8 @@
     }
    ],
    "source": [
-    "fasta_file = 'pangenome/Alignment/Align-LeMy/LeMy-mafft-align.565.aln'\n",
-    "output_fasta_file = 'pangenome/Alignment/MSAs/cox1.aln'\n",
+    "fasta_file = \"pangenome/Alignment/Align-LeMy/LeMy-mafft-align.565.aln\"\n",
+    "output_fasta_file = \"pangenome/Alignment/MSAs/cox1.aln\"\n",
     "\n",
     "process_msa(fasta_file, output_fasta_file)"
    ]
@@ -1063,13 +1079,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "def modify_tree_file(input_file, output_file):\n",
     "    \"\"\"\n",
-    "    Reads a Newick tree from a file, adds \".1\" to all accession numbers, \n",
+    "    Reads a Newick tree from a file, adds \".1\" to all accession numbers,\n",
     "    and writes the modified tree to an output file.\n",
     "\n",
     "    Args:\n",
@@ -1085,7 +1101,7 @@
     "        tree_str = infile.read().strip()\n",
     "\n",
     "    # Modify tree\n",
-    "    modified_tree = re.sub(r'NC_\\d+', add_suffix, tree_str)\n",
+    "    modified_tree = re.sub(r\"NC_\\d+\", add_suffix, tree_str)\n",
     "\n",
     "    # Write modified tree to output file\n",
     "    with open(output_file, \"w\") as outfile:\n",
@@ -1185,7 +1201,7 @@
     "    df = pd.read_csv(input_file, sep=\"\\t\")\n",
     "\n",
     "    # Extract the last 4 digits of the 'Year' column\n",
-    "    df['Year'] = df['Year'].apply(lambda x: str(x)[-4:] if pd.notnull(x) else 'ND')\n",
+    "    df[\"Year\"] = df[\"Year\"].apply(lambda x: str(x)[-4:] if pd.notnull(x) else \"ND\")\n",
     "\n",
     "    # Save the updated DataFrame to a new .tsv file\n",
     "    df.to_csv(output_file, sep=\"\\t\", index=False)\n",
@@ -1199,8 +1215,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "clean_year('metadata/raw_metadata.tsv',\n",
-    "           'metadata/metadata.tsv')"
+    "clean_year(\"metadata/raw_metadata.tsv\", \"metadata/metadata.tsv\")"
    ]
   },
   {
 
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -209,14 +209,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "def rename_sequences(input_dir, output_dir):\n",
     "    # Create the output directory if it doesn't exist\n",
     "    os.makedirs(output_dir, exist_ok=True)\n",
-    "    \n",
+    "\n",
     "    # Loop through each FASTA file in the directory\n",
     "    for filename in os.listdir(input_dir):\n",
     "        if filename.endswith(\".fasta\") or filename.endswith(\".fa\"):\n",
@@ -225,26 +225,26 @@
     "\n",
     "            # Dictionary to keep track of sequence names and their counts\n",
     "            name_count = {}\n",
-    "            \n",
+    "\n",
     "            # List to store the updated sequences\n",
     "            updated_sequences = []\n",
-    "            \n",
+    "\n",
     "            # Read the FASTA file\n",
     "            for record in SeqIO.parse(filepath, \"fasta\"):\n",
     "                name = record.id\n",
-    "                \n",
+    "\n",
     "                # If the name is already seen, append a count suffix\n",
     "                if name in name_count:\n",
     "                    name_count[name] += 1\n",
     "                    new_name = f\"{name}_{name_count[name]}\"\n",
     "                else:\n",
     "                    name_count[name] = 1\n",
     "                    new_name = f\"{name}_1\"\n",
-    "                \n",
+    "\n",
     "                # Update the record ID\n",
     "                record.id = new_name\n",
     "                record.description = \"\"  # Optionally clear the description\n",
-    "                \n",
+    "\n",
     "                # Store the updated record\n",
     "                updated_sequences.append(record)\n",
     "\n",
@@ -510,11 +510,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_all = pd.read_csv('phylogenomics/protein_ortho_output/myproject.proteinortho.tsv', sep='\\t')"
+    "df_all = pd.read_csv(\n",
+    "    \"phylogenomics/protein_ortho_output/myproject.proteinortho.tsv\", sep=\"\\t\"\n",
+    ")"
    ]
   },
   {
@@ -1556,11 +1558,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_all.loc[1] = df_all.loc[1].apply(lambda x: x.replace(',', '*') if isinstance(x, str) else x)"
+    "df_all.loc[1] = df_all.loc[1].apply(\n",
+    "    lambda x: x.replace(\",\", \"*\") if isinstance(x, str) else x\n",
+    ")"
    ]
   },
   {
@@ -2049,11 +2053,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_all_cols_to_drop = df_all.columns[df_all.apply(lambda col: col.astype(str).str.contains(r'\\*', regex=True).any())]"
+    "df_all_cols_to_drop = df_all.columns[\n",
+    "    df_all.apply(lambda col: col.astype(str).str.contains(r\"\\*\", regex=True).any())\n",
+    "]"
    ]
   },
   {
@@ -2094,12 +2100,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "df_all = df_all.drop(columns=df_all_cols_to_drop)\n",
-    "df_all.to_csv('phylogenomics/protein_ortho_output/All.tsv', sep='\\t', index=False)"
+    "df_all.to_csv(\"phylogenomics/protein_ortho_output/All.tsv\", sep=\"\\t\", index=False)"
    ]
   },
   {
@@ -4359,13 +4365,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "def modify_tree_file(input_file, output_file):\n",
     "    \"\"\"\n",
-    "    Reads a Newick tree from a file, adds \".1\" to all accession numbers, \n",
+    "    Reads a Newick tree from a file, adds \".1\" to all accession numbers,\n",
     "    and writes the modified tree to an output file.\n",
     "\n",
     "    Args:\n",
@@ -4381,7 +4387,7 @@
     "        tree_str = infile.read().strip()\n",
     "\n",
     "    # Modify tree\n",
-    "    modified_tree = re.sub(r'NC_\\d+', add_suffix, tree_str)\n",
+    "    modified_tree = re.sub(r\"NC_\\d+\", add_suffix, tree_str)\n",
     "\n",
     "    # Write modified tree to output file\n",
     "    with open(output_file, \"w\") as outfile:\n",