@@ -10,6 +10,8 @@ set -o xtrace
1010
# Feature options: each value comes from an environment override and falls
# back to the default shown.
readonly CLOUD="${CLOUD:-""}"
readonly USERNAME="${USERNAME:-"root"}"
# Prefix of the Python data science environment (override: LIBENV).
readonly LIB_ENV="${LIBENV:-"/opt/conda/envs/workbench-ds"}"
# Python version requested for that environment (override: LIBPYTHONVERSION).
readonly LIB_PYTHON_VERSION="${LIBPYTHONVERSION:-"3.10"}"
# Not readonly: it is reassigned right after this when it resolves to
# /home/root.
USER_HOME_DIR="${USERHOMEDIR:-"/home/${USERNAME}"}"
1416if [[ " ${USER_HOME_DIR} " == " /home/root" ]]; then
1517 USER_HOME_DIR=" /root"
@@ -27,6 +29,10 @@ readonly WORKBENCH_TOOLS_DIR="/opt/workbench-tools"
#######################################
# EXIT handler: removes the scratch directory, the apt package lists and the
# micromamba caches.
# Globals:   WORKDIR (read; the expansion aborts if it is unset or empty)
# Arguments: none
#######################################
function cleanup() {
  rm -rf "${WORKDIR:?}"
  rm -rf /var/lib/apt/lists/*
  # Clean up micromamba caches to save image space. This is best-effort: a
  # failed cache purge should not turn an otherwise successful run into a
  # failure if this handler runs with errexit enabled.
  # NOTE(review): the install steps invoke `mamba`; confirm it shares its
  # package cache with micromamba, otherwise that cache is left in place.
  if command -v micromamba &> /dev/null; then
    micromamba clean --all --yes || true
  fi
}

trap 'cleanup' EXIT
@@ -68,32 +74,73 @@ if ! mamba --version &>/dev/null; then
6874fi
6975
# Install the samtools family of tools in a separate environment since some of
# the other tools depend on old versions of these.
# Each spec is channel-qualified (channel::package).
CONDA_PACKAGES_1=(
  "bioconda::bcftools"
  "bioconda::htslib" # includes bgzip and tabix
  "bioconda::samtools"
)
7883
# Environment 2 contains the genomics CLI tools. They will be added to the
# PATH but will not be usable as Python libraries.
# Each spec is channel-qualified (channel::package).
CONDA_PACKAGES_2=(
  "conda-forge::python=3.10"
  "conda-forge::pip"
  "conda-forge::perl==5.32.1"
  "bioconda::bedtools"
  "conda-forge::bgenix"
  "conda-forge::cromwell"
  "bioconda::ensembl-vep>=115.1"
  "bioconda::nextflow"
  "bioconda::plink"
  "bioconda::plink2"
  "bioconda::regenie"
  "bioconda::vcftools"
)
100+
# Environment 3 contains data science Python libraries. These should be
# accessible from the user's default Python environment, which is why we install
# them separately and give the user control over whether to inject them into an
# existing environment or create a new one.
# The Python pin follows LIB_PYTHON_VERSION.
CONDA_PACKAGES_3=(
  "conda-forge::python=${LIB_PYTHON_VERSION}"
  "conda-forge::google-cloud-storage"
  "conda-forge::ipykernel"
  "conda-forge::ipywidgets"
  "conda-forge::jupyter"
  "conda-forge::openai"
  "conda-forge::matplotlib"
  "conda-forge::numpy"
  "conda-forge::plotly"
  "conda-forge::pandas"
  "conda-forge::seaborn"
  "conda-forge::scikit-learn"
  "conda-forge::scipy"
  "conda-forge::tqdm"
)
93121
# Build isolated environments
mkdir -p "${WORKBENCH_TOOLS_DIR}"
echo "Building Environment 1 (Samtools family)..."
mamba create --prefix "${WORKBENCH_TOOLS_DIR}/1" -y "${CONDA_PACKAGES_1[@]}"

echo "Building Environment 2 (Genomics CLI Tools)..."
mamba create --prefix "${WORKBENCH_TOOLS_DIR}/2" -y "${CONDA_PACKAGES_2[@]}"

echo "Building Environment 3 (Python Libraries)..."
# LIB_ENV_EXISTS records which scenario ran (1 = injected into an existing
# environment, 0 = created from scratch); it is read again when PATH is set up.
LIB_ENV_EXISTS=0

# NOTE(review): only the directory's existence is checked; a directory that
# exists but is not a conda environment would still take the "install" path.
if [[ -d "${LIB_ENV}" ]]; then
  # SCENARIO A: Target environment already exists on host. Inject packages into it.
  LIB_ENV_EXISTS=1
  echo "Host environment detected at ${LIB_ENV}. Injecting data science packages..."
  mamba install --prefix "${LIB_ENV}" -y "${CONDA_PACKAGES_3[@]}"
else
  # SCENARIO B: Target environment does not exist. Create it from scratch.
  echo "No host environment found. Creating standalone environment at ${LIB_ENV}..."
  mkdir -p "$(dirname "${LIB_ENV}")"
  mamba create --prefix "${LIB_ENV}" -y "${CONDA_PACKAGES_3[@]}"
fi
97144
98145# Install dsub via pip if on GCP. The conda version is outdated.
99146if [[ " ${CLOUD} " == " gcp" ]]; then
@@ -106,28 +153,32 @@ find -L "${WORKBENCH_TOOLS_DIR}/2/bin" -type f -executable -exec \
106153 -e " 1s|^#\!/usr/bin/env perl\\ r\?$|#\!${WORKBENCH_TOOLS_DIR} /2/bin/perl|" \
107154 -e " 1s|^#\!/usr/bin/env python\\ r\?$|#\!${WORKBENCH_TOOLS_DIR} /2/bin/python|" {} \;
108155
# Make the login user the owner of the conda environments.
# The trailing colon in "user:" also sets the group to that user's login group.
chown -R "${USERNAME}:" "${WORKBENCH_TOOLS_DIR}"
chown -R "${USERNAME}:" "${LIB_ENV}"
111159
# Append the Workbench shell configuration to the login user's .bashrc. The
# whole group is redirected once, so every echo/printf below lands in the file.
{
  echo "# Workbench Tools Configuration"

  # If we created a standalone Python libraries environment from scratch, make it the default terminal Python.
  # If it already existed (LIB_ENV_EXISTS=1), we leave the host image's PATH untouched to prevent shadowing.
  if [[ "${LIB_ENV_EXISTS}" == "0" ]]; then
    # shellcheck disable=SC2016 # we want $PATH to be evaluated at runtime
    printf 'export PATH="%s/bin:$PATH"\n' "${LIB_ENV}"
  fi

  # Set PATH to include workbench-tools binaries. They are appended after
  # $PATH, so entries already on the PATH take precedence over them.
  # shellcheck disable=SC2016 # we want $PATH to be evaluated at runtime
  printf 'export PATH="$PATH:%s"\n' "${WORKBENCH_TOOLS_DIR}/1/bin:${WORKBENCH_TOOLS_DIR}/2/bin"

  # Set CROMWELL_JAR environment variable
  printf 'export CROMWELL_JAR="%s"\n' "${WORKBENCH_TOOLS_DIR}/2/share/cromwell/cromwell.jar"
} >> "${USER_HOME_DIR}/.bashrc"
126177
# Allow .bashrc to be sourced in non-interactive shells by deleting the
# "If not running interactively" guard, from its comment line through the
# closing esac. Best-effort: a sed failure is deliberately ignored.
sed -i '/^# If not running interactively/,/esac/d' "${USER_HOME_DIR}/.bashrc" || true

# Make sure the login user is the owner of their .bashrc
chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc"

echo "Workbench tools installation complete!"
0 commit comments