Skip to content

Commit 22a8a27

Browse files
Add python libraries in a separate conda env (#333)
This change adds back the Python libraries removed in #322. Rather than adding them to `CONDA_PACKAGES_2`, we create a third conda environment explicitly for Python libraries. This keeps the environments more distinct: one for CLI tools, another for samtools (separated due to dependency version conflicts), and a third for Python libraries. The Python libraries environment is intended to be the user's main Python environment, so we should not need to manipulate `PYTHONPATH`. The Python libraries environment can be either an existing environment or a new one. PHP-132843
1 parent 8e5872e commit 22a8a27

10 files changed

Lines changed: 136 additions & 42 deletions

File tree

features/src/workbench-tools/devcontainer-feature.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,16 @@
44
"name": "Workbench Tools",
55
"description": "Installs common tools for Workbench Apps. Currently it only supports Debian-based systems (e.g. Ubuntu) on x86_64.",
66
"options": {
7+
"libEnv": {
8+
"type": "string",
9+
"default": "/opt/conda/envs/workbench-ds",
10+
"description": "The path to the conda environment where Python libraries (e.g. numpy, plotly, scipy) should be installed. This could be a pre-existing environment or a new one."
11+
},
12+
"libPythonVersion": {
13+
"type": "string",
14+
"default": "3.14",
15+
"description": "The Python version to use for the installed libraries."
16+
},
717
"cloud": {
818
"type": "string",
919
"default": "",

features/src/workbench-tools/install-conda.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ install_miniforge() {
1919
local download_url
2020
download_url="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-$(uname -m).sh"
2121

22-
check_packages curl ca-certificates
2322
mkdir -p /tmp/miniforge
2423
(
2524
cd /tmp/miniforge

features/src/workbench-tools/install.sh

Lines changed: 93 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ set -o xtrace
1010

1111
readonly CLOUD="${CLOUD:-""}"
1212
readonly USERNAME="${USERNAME:-"root"}"
13+
readonly LIBRARIES_ENV_DIR="${LIBENV:-"/opt/conda/envs/workbench-ds"}"
14+
readonly LIB_PYTHON_VERSION="${LIBPYTHONVERSION:-"3.14"}"
1315
USER_HOME_DIR="${USERHOMEDIR:-"/home/${USERNAME}"}"
1416
if [[ "${USER_HOME_DIR}" == "/home/root" ]]; then
1517
USER_HOME_DIR="/root"
@@ -68,66 +70,122 @@ if ! mamba --version &>/dev/null; then
6870
fi
6971

7072
# Install the samtools family of tools in a separate environment since some of
71-
# the other tools depend on old versions of these. This will take priority in
72-
# the PATH.
73-
CONDA_PACKAGES_1=(
74-
"bcftools"
75-
"htslib" # includes bgzip and tabix
76-
"samtools"
73+
# the other tools depend on old versions of these.
74+
readonly CONDA_PACKAGES_SAMTOOLS=(
75+
"bioconda::bcftools>=1.23"
76+
"bioconda::htslib>=1.23" # includes bgzip and tabix
77+
"bioconda::samtools>=1.23"
7778
)
78-
79-
CONDA_PACKAGES_2=(
80-
"python=3.9"
81-
"pip"
82-
"perl==5.32.1"
83-
"bedtools"
79+
readonly SAMTOOLS_ENV_DIR="${WORKBENCH_TOOLS_DIR}/samtools"
80+
81+
# Environment 2 contains the genomics CLI tools. They will be added to the
82+
# PATH but will not be usable as Python libraries.
83+
readonly CONDA_PACKAGES_BINARIES=(
84+
"conda-forge::python"
85+
"conda-forge::pip"
86+
"conda-forge::perl>=5.32"
87+
"bioconda::bedtools"
8488
"conda-forge::bgenix"
8589
"conda-forge::cromwell"
86-
"ensembl-vep>=115.1"
87-
"nextflow"
88-
"plink"
89-
"plink2"
90-
"regenie"
91-
"vcftools"
90+
"bioconda::ensembl-vep>=115"
91+
"bioconda::nextflow"
92+
"bioconda::plink"
93+
"bioconda::plink2"
94+
"bioconda::regenie"
95+
"bioconda::vcftools"
96+
)
97+
readonly BINARIES_ENV_DIR="${WORKBENCH_TOOLS_DIR}/binaries"
98+
99+
# Environment 3 contains data science Python libraries. These should be
100+
# accessible from the user's default Python environment, which is why we install
101+
# them separately and give the user control over whether to inject them into an
102+
# existing environment or create a new one.
103+
CONDA_PACKAGES_LIBRARIES=(
104+
"conda-forge::google-cloud-storage"
105+
"conda-forge::ipykernel"
106+
"conda-forge::ipywidgets"
107+
"conda-forge::jupyter"
108+
"conda-forge::openai"
109+
"conda-forge::matplotlib"
110+
"conda-forge::numpy"
111+
"conda-forge::plotly"
112+
"conda-forge::pandas"
113+
"conda-forge::seaborn"
114+
"conda-forge::scikit-learn"
115+
"conda-forge::scipy"
116+
"conda-forge::tqdm"
92117
)
93118

119+
# Build isolated environments
94120
mkdir -p "${WORKBENCH_TOOLS_DIR}"
95-
mamba create --prefix "${WORKBENCH_TOOLS_DIR}/1" -c bioconda -y "${CONDA_PACKAGES_1[@]}"
96-
mamba create --prefix "${WORKBENCH_TOOLS_DIR}/2" -c bioconda -y "${CONDA_PACKAGES_2[@]}"
121+
echo "Building Environment 1 (Samtools family)..."
122+
mamba create --prefix "${SAMTOOLS_ENV_DIR}" -y "${CONDA_PACKAGES_SAMTOOLS[@]}"
123+
124+
echo "Building Environment 2 (Genomics CLI Tools)..."
125+
mamba create --prefix "${BINARIES_ENV_DIR}" -y "${CONDA_PACKAGES_BINARIES[@]}"
126+
127+
echo "Building Environment 3 (Python Libraries)..."
128+
LIB_ENV_EXISTS=0
129+
130+
if [ -d "${LIBRARIES_ENV_DIR}" ]; then
131+
# SCENARIO A: Target environment already exists on host. Inject packages into it.
132+
LIB_ENV_EXISTS=1
133+
echo "Host environment detected at ${LIBRARIES_ENV_DIR}. Injecting data science packages..."
134+
135+
if mamba list -p "${LIBRARIES_ENV_DIR}" --full-name python --json | jq -e 'length == 0' >/dev/null; then
136+
echo "No Python installation found in host environment. Adding python=${LIB_PYTHON_VERSION} to package list."
137+
CONDA_PACKAGES_LIBRARIES+=("conda-forge::python=${LIB_PYTHON_VERSION}")
138+
fi
139+
mamba install --prefix "${LIBRARIES_ENV_DIR}" -y "${CONDA_PACKAGES_LIBRARIES[@]}"
140+
else
141+
# SCENARIO B: Target environment does not exist. Create it from scratch.
142+
echo "No host environment found. Creating standalone environment at ${LIBRARIES_ENV_DIR}..."
143+
mkdir -p "$(dirname "${LIBRARIES_ENV_DIR}")"
144+
145+
CONDA_PACKAGES_LIBRARIES+=("conda-forge::python=${LIB_PYTHON_VERSION}")
146+
mamba create --prefix "${LIBRARIES_ENV_DIR}" -y "${CONDA_PACKAGES_LIBRARIES[@]}"
147+
fi
97148

98149
# Install dsub via pip if on GCP. The conda version is outdated.
150+
# dsub is installed in LIBRARIES_ENV_DIR because it can be used as a Python
151+
# library, and users may want to install additional packages alongside it.
152+
# PYTHONNOUSERSITE=1 prevents pip from seeing/modifying packages in user site-packages.
99153
if [[ "${CLOUD}" == "gcp" ]]; then
100-
"${WORKBENCH_TOOLS_DIR}/2/bin/pip" install dsub
154+
PYTHONNOUSERSITE=1 "${LIBRARIES_ENV_DIR}/bin/pip" install dsub
101155
fi
102156

103157
# Force the perl and python scripts to use the correct perl/python
104-
find -L "${WORKBENCH_TOOLS_DIR}/2/bin" -type f -executable -exec \
158+
find -L "${BINARIES_ENV_DIR}/bin" -type f -executable -exec \
105159
sed -i --follow-symlinks \
106-
-e "1s|^#\!/usr/bin/env perl\\r\?$|#\!${WORKBENCH_TOOLS_DIR}/2/bin/perl|" \
107-
-e "1s|^#\!/usr/bin/env python\\r\?$|#\!${WORKBENCH_TOOLS_DIR}/2/bin/python|" {} \;
160+
-e "1s|^#\!/usr/bin/env perl\\r\?$|#\!${BINARIES_ENV_DIR}/bin/perl|" \
161+
-e "1s|^#\!/usr/bin/env python\\r\?$|#\!${BINARIES_ENV_DIR}/bin/python|" {} \;
108162

109-
# Make the login user the owner of the conda environment
163+
# Make the login user the owner of the conda environments
110164
chown -R "${USERNAME}:" "${WORKBENCH_TOOLS_DIR}"
165+
chown -R "${USERNAME}:" "${LIBRARIES_ENV_DIR}"
111166

112167
{
168+
echo "# Workbench Tools Configuration"
169+
170+
# If we created a standalone Python libraries environment from scratch, make it the default terminal Python.
171+
# If it already existed (LIB_ENV_EXISTS=1), we leave the host image's PATH untouched to prevent shadowing.
172+
if [[ "${LIB_ENV_EXISTS}" == "0" ]]; then
173+
# shellcheck disable=SC2016 # we want $PATH to be evaluated at runtime
174+
printf 'export PATH="%s:$PATH"\n' "${LIBRARIES_ENV_DIR}/bin"
175+
fi
176+
113177
# Set PATH to include workbench-tools binaries
114178
# shellcheck disable=SC2016 # we want $PATH to be evaluated at runtime
115-
printf 'export PATH="%s:$PATH"\n' "${WORKBENCH_TOOLS_DIR}/1/bin:${WORKBENCH_TOOLS_DIR}/2/bin"
179+
printf 'export PATH="$PATH:%s"\n' "${SAMTOOLS_ENV_DIR}/bin:${BINARIES_ENV_DIR}/bin"
116180

117181
# Set CROMWELL_JAR environment variable
118-
printf 'export CROMWELL_JAR="%s"\n' "${WORKBENCH_TOOLS_DIR}/2/share/cromwell/cromwell.jar"
119-
120-
# Make dsub a function that includes the correct PYTHONPATH. NeMo sets
121-
# PYTHONPATH so we need to override it here. We use a function instead of an
122-
# alias because aliases are not expanded in non-interactive shells.
123-
# shellcheck disable=SC2016 # we want $PYTHONPATH to be evaluated at runtime
124-
printf 'function dsub() (PYTHONPATH="%s/2/lib/python3.9/site-packages:${PYTHONPATH:-}" "%s/2/bin/dsub" "$@")\n' "${WORKBENCH_TOOLS_DIR}" "${WORKBENCH_TOOLS_DIR}"
182+
printf 'export CROMWELL_JAR="%s"\n' "${BINARIES_ENV_DIR}/share/cromwell/cromwell.jar"
125183
} >> "${USER_HOME_DIR}/.bashrc"
126184

127185
# Allow .bashrc to be sourced in non-interactive shells
128186
sed -i '/^# If not running interactively/,/esac/d' "${USER_HOME_DIR}/.bashrc" || true
129187

130188
# Make sure the login user is the owner of their .bashrc
131-
chown -R "${USERNAME}:" "${USER_HOME_DIR}/.bashrc"
189+
chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc"
132190

133-
echo "Done!"
191+
echo "Workbench tools installation complete!"

src/custom-workbench-jupyter-template/.devcontainer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
],
2323
"features": {
2424
"./.devcontainer/features/workbench-tools": {
25+
"libEnv": "/opt/conda/envs/jupyter", // Use the jupyter conda environment
2526
"cloud": "${templateOption:cloud}",
2627
"username": "jupyter",
2728
"userHomeDir": "/home/jupyter"

src/jupyter-aou/.devcontainer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
"postStartCommand": "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\"",
1212
"features": {
1313
"./.devcontainer/features/workbench-tools": {
14+
"libEnv": "/opt/conda/envs/jupyter", // Use the jupyter conda environment
1415
"cloud": "${templateOption:cloud}",
1516
"username": "jupyter",
1617
"userHomeDir": "/home/jupyter"

src/nemo_jupyter/.devcontainer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
},
2626
"ghcr.io/dhoeric/features/google-cloud-cli@sha256:fa5d894718825c5ad8009ac8f2c9f0cea3d1661eb108a9d465cba9f3fc48965f": {},
2727
"./.devcontainer/features/workbench-tools": {
28+
"libPythonVersion": "3.12", // Must match python version in nemo image
2829
"cloud": "${templateOption:cloud}",
2930
"username": "jupyter",
3031
"userHomeDir": "/home/jupyter"

src/nemo_jupyter/Dockerfile

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,15 @@ RUN useradd --uid ${NB_UID} --gid ${NB_GID} --create-home --home-dir ${WORKDIR}
1717
# Fix ownership for common dirs
1818
&& mkdir -p /workspace \
1919
&& chown -R ${NB_UID}:${NB_GID} ${WORKDIR} /workspace /tmp \
20-
&& chown -R ${NB_UID}:${NB_GID} /opt/conda || true \
21-
# Add alias for weightsbiases
22-
&& printf 'alias weightsbiases="/usr/local/bin/wb"\nalias wb="/usr/bin/wb"\n' >> ${WORKDIR}/.bashrc
20+
&& chown -R ${NB_UID}:${NB_GID} /opt/conda || true
21+
22+
# Add shell functions to override wb commands (functions work in non-interactive shells, unlike aliases)
23+
RUN <<EOF cat >> ${WORKDIR}/.bashrc
24+
weightsbiases() { /usr/local/bin/wb "\$@"; }
25+
wb() { /usr/bin/wb "\$@"; }
26+
wandb() { /opt/venv/bin/wb "\$@"; }
27+
export -f weightsbiases wb wandb
28+
EOF
2329

2430
# Environment and working directory
2531
ENV HOME=${WORKDIR}

src/nemo_jupyter_aou/.devcontainer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
},
2727
"ghcr.io/dhoeric/features/google-cloud-cli@sha256:fa5d894718825c5ad8009ac8f2c9f0cea3d1661eb108a9d465cba9f3fc48965f": {},
2828
"./.devcontainer/features/workbench-tools": {
29+
"libPythonVersion": "3.12", // Must match python version in nemo image
2930
"cloud": "${templateOption:cloud}",
3031
"username": "jupyter",
3132
"userHomeDir": "/home/jupyter"

startupscript/post-startup.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ cat << EOF >> "${USER_BASHRC}"
201201
### BEGIN: Workbench-specific customizations ###
202202
203203
# Prepend "/usr/bin" (if not already in the path)
204-
if [[ "\${PATH}:" != "/usr/bin:"* ]]; then
204+
if [[ ":\${PATH}:" != *":/usr/bin:"* ]]; then
205205
export PATH=/usr/bin:\${PATH}
206206
fi
207207

test/test.sh

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,18 @@ function check() {
1111
check_user "${TEST_USER}" "$@"
1212
}
1313

14+
sudo -u "${TEST_USER}" bash -l -c "echo PATH: $PATH"
15+
1416
# Template specific tests
1517
check "gcsfuse" gcsfuse -v
1618
check "wb cli" wb version
1719
check "fuse.conf user_allow_other" grep -qE "^[[:space:]]*[^#]*user_allow_other" "/etc/fuse.conf"
1820

1921
# The workbench-tools feature should install these
2022
if [[ "$HAS_WORKBENCH_TOOLS" == "true" ]]; then
21-
check "python3" python3 --version
23+
check "python3" "which python3 && python3 --version"
2224
check "python3: venv" 'python3 -c "import venv"'
23-
check "pip3" pip3 --version
25+
check "pip3" "which pip3 && pip3 --version"
2426
if [[ "$TEMPLATE_ID" != "nemo_jupyter" ]] && [[ "$TEMPLATE_ID" != "nemo_jupyter_aou" ]]; then
2527
check "cromwell" cromwell --version
2628
fi
@@ -44,6 +46,21 @@ if [[ "$HAS_WORKBENCH_TOOLS" == "true" ]]; then
4446
check "vep: filter_vep" "filter_vep --help > /dev/null"
4547
check "vep: variant_recoder" "variant_recoder --help | head -n10"
4648
check "vep: haplo" "haplo --help | head -n10"
49+
50+
# Python packages
51+
check "python: google-cloud-storage" 'python3 -c "import google.cloud.storage"'
52+
check "python: ipykernel" 'python3 -c "import ipykernel"'
53+
check "python: ipywidgets" 'python3 -c "import ipywidgets"'
54+
check "python: jupyter" 'python3 -c "import jupyter"'
55+
check "python: openai" 'python3 -c "import openai"'
56+
check "python: matplotlib" 'python3 -c "import matplotlib"'
57+
check "python: numpy" 'python3 -c "import numpy"'
58+
check "python: plotly" 'python3 -c "import plotly"'
59+
check "python: pandas" 'python3 -c "import pandas"'
60+
check "python: seaborn" 'python3 -c "import seaborn"'
61+
check "python: scikit-learn" 'python3 -c "import sklearn"'
62+
check "python: scipy" 'python3 -c "import scipy"'
63+
check "python: tqdm" 'python3 -c "import tqdm"'
4764
fi
4865

4966
# The postgres-client feature should install these

0 commit comments

Comments
 (0)