Skip to content

Commit 3da5211

Browse files
committed
Add python libraries in a separate conda env
1 parent a0f3015 commit 3da5211

6 files changed

Lines changed: 107 additions & 81 deletions

File tree

features/src/workbench-tools/devcontainer-feature.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,16 @@
44
"name": "Workbench Tools",
55
"description": "Installs common tools for Workbench Apps. Currently it only supports Debian-based systems (e.g. Ubuntu) on x86_64.",
66
"options": {
7+
"libEnv": {
8+
"type": "string",
9+
"default": "/opt/conda/envs/workbench-ds",
10+
"description": "The path to the conda environment where Python libraries (e.g. numpy, plotly, scipy) should be installed. If the path already exists, the libraries are installed into that environment; otherwise a new environment is created there."
11+
},
12+
"libPythonVersion": {
13+
"type": "string",
14+
"default": "3.10",
15+
"description": "The Python version to use for the installed libraries."
16+
},
717
"cloud": {
818
"type": "string",
919
"default": "",

features/src/workbench-tools/install-conda.sh

Lines changed: 0 additions & 57 deletions
This file was deleted.

features/src/workbench-tools/install.sh

Lines changed: 78 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env bash
22

3-
# install.sh installs common workbench tools in the devcontainer. Currently it
4-
# only supports Debian-based systems (e.g. Ubuntu) on x86_64.
3+
# install.sh installs common workbench tools in the devcontainer.
4+
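# Designed as an isolated feature that avoids interfering with the host image.
# Note that Python libraries are installed into LIB_ENV even when that
# environment already exists on the image. Currently it only supports
# Debian-based systems (e.g. Ubuntu) on x86_64.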
55

66
set -o errexit
77
set -o nounset
@@ -10,6 +10,8 @@ set -o xtrace
1010

1111
readonly CLOUD="${CLOUD:-""}"
1212
readonly USERNAME="${USERNAME:-"root"}"
13+
readonly LIB_ENV="${LIBENV:-"/opt/conda/envs/workbench-ds"}"
14+
readonly LIB_PYTHON_VERSION="${LIBPYTHONVERSION:-"3.10"}"
1315
USER_HOME_DIR="${USERHOMEDIR:-"/home/${USERNAME}"}"
1416
if [[ "${USER_HOME_DIR}" == "/home/root" ]]; then
1517
USER_HOME_DIR="/root"
@@ -23,10 +25,15 @@ WORKDIR="$(mktemp -d)"
2325
readonly WORKDIR
2426

2527
readonly WORKBENCH_TOOLS_DIR="/opt/workbench-tools"
28+
export MAMBA_ROOT_PREFIX="${WORKBENCH_TOOLS_DIR}/conda-root"
2629

2730
# cleanup removes temporary build artifacts. It is registered as the EXIT trap,
# so it runs on every exit path of this script (success or failure).
# Globals: WORKDIR (read)
function cleanup() {
  # ":?" makes the expansion fail if WORKDIR is empty or unset, so this can
  # never expand to a bare "rm -rf".
  rm -rf "${WORKDIR:?}"
  rm -rf /var/lib/apt/lists/*
  # Clean up micromamba caches to save image space. This is best-effort:
  # errexit is still active inside the EXIT trap, so an unguarded failure here
  # would replace the script's real exit status.
  if command -v micromamba &> /dev/null; then
    micromamba clean --all --yes \
      || echo "WARNING: micromamba clean failed; continuing." >&2
  fi
}
3138

3239
trap 'cleanup' EXIT
@@ -61,23 +68,27 @@ check_packages \
6168
git \
6269
sed \
6370
sudo \
64-
tar
71+
tar \
72+
bzip2
6573

66-
if ! mamba --version &>/dev/null; then
67-
source ./install-conda.sh
74+
# Install Micromamba
75+
if ! type micromamba &>/dev/null; then
76+
echo "Installing Micromamba..."
77+
curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -C /usr/local -xvj bin/micromamba
6878
fi
6979

7080
# Install the samtools family of tools in a separate environment since some of
71-
# the other tools depend on old versions of these. This will take priority in
72-
# the PATH.
81+
# the other tools depend on old versions of these.
7382
CONDA_PACKAGES_1=(
7483
"bcftools"
7584
"htslib" # includes bgzip and tabix
7685
"samtools"
7786
)
7887

88+
# Environment 2 contains the genomics CLI tools. They will be added to the
89+
# PATH but will not be usable as Python libraries.
7990
CONDA_PACKAGES_2=(
80-
"python=3.9"
91+
"python=3.10"
8192
"pip"
8293
"perl==5.32.1"
8394
"bedtools"
@@ -91,13 +102,53 @@ CONDA_PACKAGES_2=(
91102
"vcftools"
92103
)
93104

105+
# Environment 3 contains data science Python libraries. These should be
106+
# accessible from the user's default Python environment, which is why we install
107+
# them separately and give the user control over whether to inject them into an
108+
# existing environment or create a new one.
109+
CONDA_PACKAGES_3=(
110+
"python=${LIB_PYTHON_VERSION}"
111+
"google-cloud-storage"
112+
"ipykernel"
113+
"ipywidgets"
114+
"jupyter"
115+
"openai"
116+
"matplotlib"
117+
"numpy"
118+
"plotly"
119+
"pandas"
120+
"seaborn"
121+
"scikit-learn"
122+
"scipy"
123+
"tqdm"
124+
)
125+
126+
# Build isolated environments
94127
mkdir -p "${WORKBENCH_TOOLS_DIR}"
95-
mamba create --prefix "${WORKBENCH_TOOLS_DIR}/1" -c bioconda -y "${CONDA_PACKAGES_1[@]}"
96-
mamba create --prefix "${WORKBENCH_TOOLS_DIR}/2" -c bioconda -y "${CONDA_PACKAGES_2[@]}"
128+
echo "Building Environment 1 (Samtools family)..."
129+
micromamba create --prefix "${WORKBENCH_TOOLS_DIR}/1" -c bioconda -c conda-forge -y "${CONDA_PACKAGES_1[@]}"
130+
131+
echo "Building Environment 2 (Genomics CLI Tools)..."
132+
micromamba create --prefix "${WORKBENCH_TOOLS_DIR}/2" -c bioconda -c conda-forge -y "${CONDA_PACKAGES_2[@]}"
133+
134+
echo "Building Environment 3 (Python Libraries)..."
135+
LIB_ENV_EXISTS=0
136+
137+
if [ -d "${LIB_ENV}" ]; then
138+
# SCENARIO A: Target environment already exists on host. Inject packages into it.
139+
LIB_ENV_EXISTS=1
140+
echo "Host environment detected at ${LIB_ENV}. Injecting data science packages..."
141+
micromamba install --prefix "${LIB_ENV}" -c conda-forge -y "${CONDA_PACKAGES_3[@]}"
142+
else
143+
# SCENARIO B: Target environment does not exist. Create it from scratch.
144+
echo "No host environment found. Creating standalone environment at ${LIB_ENV}..."
145+
mkdir -p "$(dirname "${LIB_ENV}")"
146+
micromamba create --prefix "${LIB_ENV}" -c conda-forge -y "${CONDA_PACKAGES_3[@]}"
147+
fi
97148

98149
# Install dsub via pip if on GCP. The conda version is outdated.
99150
if [[ "${CLOUD}" == "gcp" ]]; then
100-
"${WORKBENCH_TOOLS_DIR}/2/bin/pip" install dsub
151+
"${WORKBENCH_TOOLS_DIR}/2/bin/pip" install --no-cache-dir dsub
101152
fi
102153

103154
# Force the perl and python scripts to use the correct perl/python
@@ -106,28 +157,31 @@ find -L "${WORKBENCH_TOOLS_DIR}/2/bin" -type f -executable -exec \
106157
-e "1s|^#\!/usr/bin/env perl\\r\?$|#\!${WORKBENCH_TOOLS_DIR}/2/bin/perl|" \
107158
-e "1s|^#\!/usr/bin/env python\\r\?$|#\!${WORKBENCH_TOOLS_DIR}/2/bin/python|" {} \;
108159

109-
# Make the login user the owner of the conda environment
160+
# Make the login user the owner of the conda environments
110161
chown -R "${USERNAME}:" "${WORKBENCH_TOOLS_DIR}"
162+
chown -R "${USERNAME}:" "${LIB_ENV}"
111163

112164
{
165+
echo "# Workbench Tools Configuration"
166+
167+
# If we created a standalone Python libraries environment from scratch, make it the default terminal Python.
168+
# If it already existed (LIB_ENV_EXISTS=1), we do not prepend it to PATH, so
# the host image's default Python is not shadowed.
169+
if [[ "${LIB_ENV_EXISTS}" == "0" ]]; then
170+
printf 'export PATH="%s/bin:$PATH"\n' "${LIB_ENV}"
171+
fi
172+
113173
# Set PATH to include workbench-tools binaries
114174
# shellcheck disable=SC2016 # we want $PATH to be evaluated at runtime
115-
printf 'export PATH="%s:$PATH"\n' "${WORKBENCH_TOOLS_DIR}/1/bin:${WORKBENCH_TOOLS_DIR}/2/bin"
116-
117-
# Set CROMWELL_JAR environment variable
118-
printf 'export CROMWELL_JAR="%s"\n' "${WORKBENCH_TOOLS_DIR}/2/share/cromwell/cromwell.jar"
119-
120-
# Make dsub a function that includes the correct PYTHONPATH. NeMo sets
121-
# PYTHONPATH so we need to override it here. We use a function instead of an
122-
# alias because aliases are not expanded in non-interactive shells.
123-
# shellcheck disable=SC2016 # we want $PYTHONPATH to be evaluated at runtime
124-
printf 'function dsub() (PYTHONPATH="%s/2/lib/python3.9/site-packages:${PYTHONPATH:-}" "%s/2/bin/dsub" "$@")\n' "${WORKBENCH_TOOLS_DIR}" "${WORKBENCH_TOOLS_DIR}"
175+
printf 'export PATH="$PATH:%s/1/bin:%s/2/bin"\n' "${WORKBENCH_TOOLS_DIR}" "${WORKBENCH_TOOLS_DIR}"
176+
177+
# Set Cromwell JAR
178+
printf 'export CROMWELL_JAR="%s/2/share/cromwell/cromwell.jar"\n' "${WORKBENCH_TOOLS_DIR}"
125179
} >> "${USER_HOME_DIR}/.bashrc"
126180

127181
# Allow .bashrc to be sourced in non-interactive shells
128182
sed -i '/^# If not running interactively/,/esac/d' "${USER_HOME_DIR}/.bashrc" || true
129183

130184
# Make sure the login user is the owner of their .bashrc
131-
chown -R "${USERNAME}:" "${USER_HOME_DIR}/.bashrc"
185+
chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc"
132186

133-
echo "Done!"
187+
echo "Workbench tools installation complete!"

src/custom-workbench-jupyter-template/.devcontainer.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
],
2323
"features": {
2424
"./.devcontainer/features/workbench-tools": {
25+
"libEnv": "/opt/conda/envs/jupyter", // Use the jupyter conda environment
26+
"libPythonVersion": "3.10", // This needs to match the Python version in the Dockerfile
2527
"cloud": "${templateOption:cloud}",
2628
"username": "jupyter",
2729
"userHomeDir": "/home/jupyter"

src/jupyter-aou/.devcontainer.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
"postStartCommand": "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\"",
1212
"features": {
1313
"./.devcontainer/features/workbench-tools": {
14+
"libEnv": "/opt/conda/envs/jupyter", // Use the jupyter conda environment
15+
"libPythonVersion": "3.10", // This needs to match the Python version in the Dockerfile
1416
"cloud": "${templateOption:cloud}",
1517
"username": "jupyter",
1618
"userHomeDir": "/home/jupyter"

test/test.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,21 @@ if [[ "$HAS_WORKBENCH_TOOLS" == "true" ]]; then
4444
check "vep: filter_vep" "filter_vep --help > /dev/null"
4545
check "vep: variant_recoder" "variant_recoder --help | head -n10"
4646
check "vep: haplo" "haplo --help | head -n10"
47+
48+
# Python packages
49+
check "python: google-cloud-storage" 'python3 -c "import google.cloud.storage"'
50+
check "python: ipykernel" 'python3 -c "import ipykernel"'
51+
check "python: ipywidgets" 'python3 -c "import ipywidgets"'
52+
check "python: jupyter" 'python3 -c "import jupyter"'
53+
check "python: openai" 'python3 -c "import openai"'
54+
check "python: matplotlib" 'python3 -c "import matplotlib"'
55+
check "python: numpy" 'python3 -c "import numpy"'
56+
check "python: plotly" 'python3 -c "import plotly"'
57+
check "python: pandas" 'python3 -c "import pandas"'
58+
check "python: seaborn" 'python3 -c "import seaborn"'
59+
check "python: scikit-learn" 'python3 -c "import sklearn"'
60+
check "python: scipy" 'python3 -c "import scipy"'
61+
check "python: tqdm" 'python3 -c "import tqdm"'
4762
fi
4863

4964
# The postgres-client feature should install these

0 commit comments

Comments
 (0)