add Tahoe data to DGB2

lilydtaub · lilydtaub · commit f80559075de7 · 2025-10-07T17:19:44.000-04:00
diff --git a/appyters/Drug_Gene_Budger2/appyter.json b/appyters/Drug_Gene_Budger2/appyter.json
@@ -2,7 +2,7 @@
     "$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json",
     "name": "Drug_Gene_Budger2",
     "title": "Dr. Gene Budger (DGB) 2",
-    "version": "0.0.7",
+    "version": "0.0.8",
     "description": "An appyter that retrieves drugs that up-regulate and down-regulate a single input gene across Connectivity Mapping datasets",
     "image": "dgb_logo.png",
     "authors": [
diff --git a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
@@ -100,6 +100,7 @@
     "- Ginkgo GDPx1 and GPDx2: Limma-Voom based differential gene expression results for 1,354 drugs.\n",
     "- Novartis DRUG-seq: Differential: Limma-Trend based differential gene expression results for 4,343 drugs. \n",
     "- LINCS L1000 Chemical Perturbations: Limma-Voom based differential gene expression results for a subset of 4,091 drugs from the LINCS L1000 Chemical Perturbation dataset. \n",
+    "- Tahoe-100M: DESeq based differential gene expression results for 376 drugs tested across 50 different cancer cell lines. \n",
     "\n",
     "The Ginkgo dataset includes 4 primary cell types (epithelial melanocytes, smooth aortic muscle cells, skeletal muscle myoblasts and dermal fibroblasts) and one cell line (A549 lung carcinoma cell line). Previous analysis showed distinct transcriptional responses by cell type, so the drug rankings for the Ginkgo dataset are separated by cell type.\n",
     "\n",
@@ -119,6 +120,7 @@
     "import re\n",
     "from itertools import combinations\n",
     "import warnings\n",
+    "import hashlib\n",
     "\n",
     "## Tables\n",
     "from IPython.display import display, display_markdown, HTML\n",
@@ -129,6 +131,7 @@
     "\n",
     "## Venn Diagram\n",
     "from matplotlib_venn import venn3, venn2\n",
+    "import matplotlib.pyplot as plt\n",
     "\n",
     "## Volcano Plot\n",
     "from bokeh.plotting import figure, show\n",
@@ -149,6 +152,7 @@
     "novartis_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/novartis_de'\n",
     "lincs_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_de'\n",
     "deepcover_moa_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/deepcoverMoa_de'\n",
+    "tahoe_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/tahoe_de'\n",
     "\n",
     "# silence warnings\n",
     "warnings.filterwarnings('ignore')"
@@ -161,7 +165,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "in_ginkgo = in_novartis = in_lincs = True"
+    "in_ginkgo = in_novartis = in_lincs = in_tahoe = True"
    ]
   },
   {
@@ -253,17 +257,51 @@
     "    in_novartis=False"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e7910ada",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get Tahoe DE results for gene\n",
+    "\n",
+    "# hash_bucket function used to sort genes into buckets\n",
+    "def hash_bucket(gene, num_buckets=512):\n",
+    "    '''\n",
+    "    gene: Gene symbol\n",
+    "    num_buckets: number of hash buckets to create\n",
+    "\n",
+    "    Returns integer bucket index for gene name (between 0 and num_buckets-1)\n",
+    "    '''\n",
+    "    return int(hashlib.md5(gene.encode()).hexdigest(),16) % num_buckets\n",
+    "\n",
+    "query_gene_encoded = hash_bucket(query_gene)\n",
+    "\n",
+    "try:\n",
+    "    tahoe_de = pd.read_parquet(f'{tahoe_URL}/gene_bucket_{query_gene_encoded}.parquet')\n",
+    "    tahoe_de = tahoe_de[tahoe_de['gene_name']==query_gene]\n",
+    "    tahoe_de['log10adj.P.Val'] = tahoe_de['padj'].replace(0,1e-323).map(np.log10)*-1\n",
+    "    tahoe_de.rename(columns = {'log2FoldChange':'logFC', 'drug':'Drug', 'padj':'adj.P.Val'}, inplace=True)\n",
+    "    tahoe_de['GeneDir'] = np.where(tahoe_de['UpReg']>0,'Up','Dn')\n",
+    "    \n",
+    "except:\n",
+    "    print('Gene not in Tahoe-100M dataset')\n",
+    "    in_tahoe=False"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c5607885",
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_lincs + in_novartis + in_ginkgo < 1:\n",
+    "if in_lincs + in_novartis + in_ginkgo + in_tahoe < 1:\n",
     "    print(f\"LINCS: {in_lincs}\")\n",
     "    print(f\"Novartis: {in_novartis}\")\n",
     "    print(f\"Ginkgo: {in_ginkgo}\")\n",
+    "    print(f\"Tahoe-100M: {in_tahoe}\")\n",
     "    raise Exception(\"Execution stopped, gene not found in any datasets\")"
    ]
   },
@@ -290,7 +328,7 @@
    "outputs": [],
    "source": [
     "# Get pubchem ID dataframe\n",
-    "pubchem_location = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/cmap_pubchem_ids.csv'\n",
+    "pubchem_location = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/cmap_pubchem_ids_10062025.csv'\n",
     "pubchem_ids = pd.read_csv(pubchem_location, dtype = {'Drug':str, 'CID':str})"
    ]
   },
@@ -533,6 +571,45 @@
     "    display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq', raw=True)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "c04f416c",
+   "metadata": {},
+   "source": [
+    "### Tahoe-100M"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e07834e6",
+   "metadata": {},
+   "source": [
+    "Drug rankings for the Tahoe-100M dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca416e30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if in_tahoe:\n",
+    "    tahoe_drugs_up = get_rankings(tahoe_de, 'Tahoe', '', 'up', ranking_method)\n",
+    "    tahoe_drugs_down = get_rankings(tahoe_de, 'Tahoe', '', 'down', ranking_method)\n",
+    "\n",
+    "    display_markdown(f'**Top {top_n} up-regulators in Tahoe-100M**', raw=True)\n",
+    "    display(tahoe_drugs_up[0].head(top_n))\n",
+    "    display(HTML(download_link(tahoe_drugs_up[0], f'tahoe_drug_ranks_{query_gene}_UpReg.tsv', 'Download results averaged across drug dosages')))\n",
+    "    display(HTML(download_link(tahoe_drugs_up[1], f'tahoe_drug_ranks_{query_gene}_full_UpReg.tsv', 'Download results for all perturbations')))\n",
+    "    display_markdown(f'**Top {top_n} down-regulators in Tahoe-100M**', raw=True)\n",
+    "    display(tahoe_drugs_down[0].head(top_n))\n",
+    "    display(HTML(download_link(tahoe_drugs_down[0], f'tahoe_drug_ranks_{query_gene}_DnReg.tsv', 'Download results averaged across drug dosages')))\n",
+    "    display(HTML(download_link(tahoe_drugs_down[1], f'tahoe_drug_ranks_{query_gene}_full_DnReg.tsv', 'Download results for all perturbations')))\n",
+    "else:\n",
+    "    display_markdown(f'**{query_gene}** not found in Tahoe-100M', raw=True)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -554,7 +631,11 @@
     "# get results from novartis\n",
     "if in_novartis:\n",
     "    top_up['novartis'] = get_top(novartis_drugs_up[0], n=50)\n",
-    "    top_down['novartis'] = get_top(novartis_drugs_down[0], n=50)"
+    "    top_down['novartis'] = get_top(novartis_drugs_down[0], n=50)\n",
+    "# get results from Tahoe\n",
+    "if in_tahoe:\n",
+    "    top_up['tahoe'] = get_top(tahoe_drugs_up[0], n=50)\n",
+    "    top_down['tahoe'] = get_top(tahoe_drugs_down[0], n=50)"
    ]
   },
   {
@@ -605,6 +686,7 @@
     "            'ginkgo_A549': 'ginkgo_A549',\n",
     "            'lincs_l1000': 'lincs_l1000',\n",
     "            'novartis': 'novartis',\n",
+    "            'tahoe': 'tahoe',\n",
     "            'ginkgo_human_epithelial_melanocytes': 'ginkgo_melanocytes',\n",
     "            'ginkgo_human_dermal_fibroblast': 'ginkgo_fibroblasts',\n",
     "            'ginkgo_human_aortic_smooth_muscle_cells': 'ginkgo_muscle_cells',\n",
@@ -623,7 +705,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n",
     "    display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
     "else:\n",
     "    display_markdown(f\"**Overlap among top up regulators of {query_gene}**\", raw=True)\n",
@@ -692,7 +774,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n",
     "    display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
     "else:\n",
     "    overlap_down = get_overlapping_sets(top_down)\n",
@@ -713,14 +795,6 @@
    "outputs": [],
    "source": [
     "def get_ranking_averages(overlapping_df, data_dict, ranking_method):\n",
-    "    '''\n",
-    "    Retrieve average target ranking across datasets for drugs in overlapping sets. \n",
-    "\n",
-    "    Returns dataframe with columns for:\n",
-    "    Drug\n",
-    "    Average Rank\n",
-    "    Number of datasets for which drug was a significant regulator of the query gene\n",
-    "    '''\n",
     "    # get average, integrating across datasets\n",
     "    average_rank_vals = {}\n",
     "    average_pctrank_vals = {}\n",
@@ -777,7 +851,7 @@
     "    # clean column names\n",
     "    with_proteins.rename(columns = {'logFC':'Protein logFC', 'Pubchem' : 'PubChem CID'}, inplace=True)\n",
     "    with_proteins.drop(columns='CID',inplace=True)\n",
-    "    return with_proteins"
+    "    return with_proteins.sort_values(['N Datasets', 'Avg Adj.P.Val'], ascending=[False,True])"
    ]
   },
   {
@@ -794,11 +868,12 @@
     "                       'human_epithelial_melanocytes': in_ginkgo,\n",
     "                       'human_skeletal_muscle_myoblasts': in_ginkgo,\n",
     "                       'novartis': in_novartis,\n",
-    "                       'lincs': in_lincs}\n",
+    "                       'lincs': in_lincs,\n",
+    "                       'tahoe': in_tahoe}\n",
     "data_dict_down ={}\n",
     "data_dict_up = {}\n",
     "for source,present in data_source_present.items():\n",
-    "    if (present) & (not source in ['novartis','lincs']):\n",
+    "    if (present) & (not source in ['novartis','lincs','tahoe']):\n",
     "        data_dict_down[source] = ginkgo_drugs_down[source][1]\n",
     "        data_dict_up[source] = ginkgo_drugs_up[source][1]\n",
     "    elif (present) & (source == 'lincs'):\n",
@@ -807,8 +882,11 @@
     "    elif (present) & (source == 'novartis'):\n",
     "        data_dict_down[source] = novartis_drugs_down[1]\n",
     "        data_dict_up[source] = novartis_drugs_up[1]\n",
+    "    elif (present) & (source == 'tahoe'):\n",
+    "        data_dict_down[source] = tahoe_drugs_down[1]\n",
+    "        data_dict_up[source] = tahoe_drugs_up[1]\n",
     "\n",
-    "if in_ginkgo + in_lincs + in_novartis > 1:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe > 1:\n",
     "    overlapping_up_TargetRank = get_ranking_averages(overlap_up, data_dict_up, ranking_method)\n",
     "    overlapping_down_TargetRank = get_ranking_averages(overlap_down, data_dict_down, ranking_method)"
    ]
@@ -828,7 +906,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n",
     "    display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
     "else:\n",
     "    display_markdown(\"**Averages across datasets: Up-regulating drugs**\", raw=True)\n",
@@ -909,7 +987,7 @@
    "source": [
     "## Venn Diagrams\n",
     "\n",
-    "The venn diagrams show the overlap among either up-regulating or down-regulating drugs across the three datasets Novartis DRUG-seq, LINCS L1000, and Ginkgo (all cell types grouped). "
+    "The Venn diagrams show the pairwise overlap of either up-regulating or down-regulating drugs between the four Connectivity Mapping datasets: Tahoe-100M, Novartis DRUG-seq, LINCS L1000, and Ginkgo (all cell types grouped). "
    ]
   },
   {
@@ -930,7 +1008,8 @@
     "# define input data for venn diagrams\n",
     "data_source_present = {'ginkgo':in_ginkgo,\n",
     "                       'lincs_l1000':in_lincs,\n",
-    "                       'novartis':in_novartis}\n",
+    "                       'novartis':in_novartis,\n",
+    "                       'tahoe': in_tahoe}\n",
     "venn_up = {}\n",
     "venn_down = {}\n",
     "for source,present in data_source_present.items():\n",
@@ -990,7 +1069,8 @@
     "    for datasets, overlap in results.items():\n",
     "        if len(overlap) == 0:\n",
     "            overlap = ['None']\n",
-    "        print(f\"{', '.join(datasets)}: {', '.join(overlap)}\")"
+    "        # return the formatted label instead of printing it; NOTE(review): this return is inside the loop, so only the first entry of results is formatted\n",
+    "        return f\"{', '.join(datasets)}: {', '.join(overlap)}\""
    ]
   },
   {
@@ -1000,12 +1080,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n",
     "    display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
     "else:\n",
     "    display_markdown(f'Overlap of top {query_gene} up-regulating drugs across sources', raw=True)\n",
-    "    create_venn(venn_up)\n",
-    "    print_overlap(venn_up)"
+    "    for combo in combinations(list(venn_up.keys()), 2):\n",
+    "        combo_venn = {k:venn_up[k] for k in combo if k in venn_up}\n",
+    "        create_venn(combo_venn)\n",
+    "        plt.title(print_overlap(combo_venn))\n",
+    "        plt.show()"
    ]
   },
   {
@@ -1015,12 +1098,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n",
     "    display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
     "else:\n",
     "    display_markdown(f'Overlap of top {query_gene} down-regulating drugs across sources', raw=True)\n",
-    "    create_venn(venn_down)\n",
-    "    print_overlap(venn_down)"
+    "    for combo in combinations(list(venn_down.keys()), 2):\n",
+    "        combo_venn = {k:venn_down[k] for k in combo if k in venn_down}\n",
+    "        create_venn(combo_venn)\n",
+    "        plt.title(print_overlap(combo_venn))\n",
+    "        plt.show()"
    ]
   },
   {
@@ -1064,6 +1150,8 @@
     "        df['Label'] = df['Perturbation'] + '_' + df['Drug']\n",
     "    elif source == 'L1000':\n",
     "        df['Label'] = df['Perturbation']\n",
+    "    elif source == 'Tahoe':\n",
+    "        df['Label'] = df['Drug'] + '-' + df['concentration'].astype(str) + '-' + df['Cell_ID_Cellosaur']\n",
     "    elif source == 'Deepcover MoA':\n",
     "        df['Label'] = df['Drug']\n",
     "        df['abs_Zscore'] = df['Zscore'].apply(abs)\n",
@@ -1184,6 +1272,27 @@
     "    display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq dataset', raw=True)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "7ada5a72",
+   "metadata": {},
+   "source": [
+    "### Tahoe-100M"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "679ff8cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if in_tahoe:\n",
+    "    create_bokeh_volcano_plot(tahoe_de, query_gene, '','Tahoe')\n",
+    "else:\n",
+    "    display_markdown(f'**{query_gene}** not found in Tahoe-100M dataset', raw=True)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "8c3df5f5",
@@ -1225,9 +1334,11 @@
     "\n",
     "[5] “LINCS L1000 Reverse Search.” n.d. Accessed September 5, 2025. https://lincs-reverse-search-dashboard.dev.maayanlab.cloud/.\n",
     "\n",
-    "[6] Wang, Zichen, Edward He, Kevin Sani, Kathleen M. Jagodnik, Moshe C. Silverstein, and Avi Ma’ayan. 2019. “Drug Gene Budger (DGB): An Application for Ranking Drugs to Modulate a Specific Gene Based on Transcriptomic Signatures.” Bioinformatics (Oxford, England) 35 (7): 1247–48.\n",
+    "[6] Mitchell, Dylan C., Miljan Kuljanin, Jiaming Li, Jonathan G. Van Vranken, Nathan Bulloch, Devin K. Schweppe, Edward L. Huttlin, and Steven P. Gygi. 2023. “A Proteome-Wide Atlas of Drug Mechanism of Action.” Nature Biotechnology 41 (6): 845–57.\n",
+    "\n",
+    "[7] Zhang, Jesse, Airol A. Ubas, Richard de Borja, Valentine Svensson, Nicole Thomas, Neha Thakar, Aidan Winters, et al. 2025. “Tahoe-100M: A Giga-Scale Single-Cell Perturbation Atlas for Context-Dependent Gene Function and Cellular Modeling.” bioRxiv. https://doi.org/10.1101/2025.02.20.639398.\n",
     "\n",
-    "[7] Mitchell, Dylan C., Miljan Kuljanin, Jiaming Li, Jonathan G. Van Vranken, Nathan Bulloch, Devin K. Schweppe, Edward L. Huttlin, and Steven P. Gygi. 2023. “A Proteome-Wide Atlas of Drug Mechanism of Action.” Nature Biotechnology 41 (6): 845–57."
+    "[8] Wang, Zichen, Edward He, Kevin Sani, Kathleen M. Jagodnik, Moshe C. Silverstein, and Avi Ma’ayan. 2019. “Drug Gene Budger (DGB): An Application for Ranking Drugs to Modulate a Specific Gene Based on Transcriptomic Signatures.” Bioinformatics (Oxford, England) 35 (7): 1247–48."
    ]
   }
  ],