diff --git a/README.md b/README.md index f74e9861..990c0c84 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# Intro to Google Cloud Platform (GCP) for Machine Learning and AI +# Intro to CHTC for Machine Learning and AI -This lesson teaches core workflows for building, training, and tuning ML/AI models using Google Cloud's Vertex AI platform. Participants learn to set up data storage, configure Vertex AI Workbench notebooks as lightweight controllers, launch training and hyperparameter tuning jobs, and optimize resource costs effectively within GCP. The workshop also includes a section on building retrieval-augmented generation (RAG) pipelines using Gemini models. +This lesson teaches core workflows for building, training, and tuning ML/AI models using UW-Madison's Center for High Throughput Computing (CHTC). Participants learn to manage data on CHTC filesystems, configure HTCondor submit files, launch training and hyperparameter tuning jobs on CPUs and GPUs, and follow best practices for shared research computing infrastructure. The workshop also includes a section on building retrieval-augmented generation (RAG) pipelines. ## Prerequisites @@ -9,16 +9,15 @@ This lesson teaches core workflows for building, training, and tuning ML/AI mode ## Episodes -1. Overview of Google Cloud for Machine Learning -2. Data Storage: Setting up GCS -3. Notebooks as Controllers -4. Accessing and Managing Data in GCS -5. Using GitHub PAT in Vertex AI Notebooks -6. Training Models in Vertex AI: XGBoost (CPU) -7. Training Models in Vertex AI: PyTorch (GPU) -8. Hyperparameter Tuning in Vertex AI -9. Resource Management & Cleanup -10. Retrieval-Augmented Generation (RAG) +1. Overview of CHTC for Machine Learning +2. Connecting to CHTC +3. Data Management on CHTC +4. Training Models on CHTC (XGBoost, CPU) +5. Training Models on CHTC (PyTorch, GPU) +6. Hyperparameter Tuning with HTCondor +7. Retrieval-Augmented Generation (RAG) on CHTC +8. Advanced HTCondor Workflows +9. 
Resource Management & Best Practices ## Setup diff --git a/config.yaml b/config.yaml index 27b26723..b873e2d8 100644 --- a/config.yaml +++ b/config.yaml @@ -18,13 +18,13 @@ carpentry: 'incubator' carpentry_description: 'Lesson Description' # Overall title for pages. -title: 'Intro to Google Cloud Platform (GCP) for Machine Learning and AI' +title: 'Intro to CHTC for Machine Learning and AI' # Date the lesson was created (YYYY-MM-DD, this is empty by default) -created: 2025-08-26 +created: 2025-08-26 # Comma-separated list of keywords for the lesson -keywords: 'cloud, GCP, lesson, The Carpentries, ML, AI, GPU' +keywords: 'HTC, CHTC, HTCondor, lesson, The Carpentries, ML, AI, GPU' # Life cycle stage of the lesson # possible values: pre-alpha, alpha, beta, stable @@ -67,20 +67,20 @@ contact: 'endemann@wisc.edu' # Order of episodes in your lesson episodes: - 01-Introduction.md -- 02-Notebooks-as-controllers.md -- 03-Data-storage-and-access.md -- 04-Training-models-in-VertexAI.md -- 05-Training-models-in-VertexAI-GPUs.md +- 02-Connecting-to-CHTC.md +- 03-Data-management.md +- 04-Training-models-on-CHTC.md +- 05-Training-models-on-CHTC-GPUs.md - 06-Hyperparameter-tuning.md - 07-Retrieval-augmented-generation.md -- 08-CLI-workflows.md -- 09-Resource-management-cleanup.md +- 08-Advanced-HTCondor-workflows.md +- 09-Resource-management-best-practices.md # Information for Learners learners: - reference.md - compute-for-ML.md -- uw-madison-cloud-resources.md +- uw-madison-chtc-resources.md - github-pat.md # Information for Instructors diff --git a/episodes/01-Introduction.md b/episodes/01-Introduction.md index bba0502f..967dfa27 100644 --- a/episodes/01-Introduction.md +++ b/episodes/01-Introduction.md @@ -1,135 +1,123 @@ --- -title: "Overview of Google Cloud for Machine Learning and AI" +title: "Overview of CHTC for Machine Learning and AI" teaching: 10 exercises: 2 --- ::::::::::::::::::::::::::::::::::::: questions -- Why would I run ML/AI experiments in the cloud instead 
of on my laptop or an HPC cluster? -- What does GCP offer for ML/AI, and how is it organized? -- What is the "notebook as controller" pattern? +- Why would I run ML/AI experiments on CHTC instead of on my laptop? +- What does CHTC offer for ML/AI, and how is it organized? +- What is the "submit node as controller" pattern? :::::::::::::::::::::::::::::::::::::::::::::::: ::::::::::::::::::::::::::::::::::::: objectives -- Identify when cloud compute makes sense for ML/AI work. -- Describe what GCP and Vertex AI provide for ML/AI researchers. -- Explain the notebook-as-controller pattern used throughout this workshop. +- Identify when high-throughput computing makes sense for ML/AI work. +- Describe what CHTC and HTCondor provide for ML/AI researchers. +- Explain the submit-node pattern used throughout this workshop. :::::::::::::::::::::::::::::::::::::::::::::::: -## Why run ML/AI in the cloud? +## Why run ML/AI on CHTC? -You have ML/AI code that works on your laptop. But at some point you need more — a bigger GPU (or multiple GPUs), a dataset that won't fit on disk, or the ability to run dozens of training experiments overnight. You could invest in local hardware or compete for time on a shared HPC cluster, but cloud platforms let you rent exactly the hardware you need, for exactly as long as you need it, and then shut it down. +You have ML/AI code that works on your laptop. But at some point you need more — a bigger GPU (or multiple GPUs), a dataset that won't fit in memory, or the ability to run dozens of training experiments overnight. You could invest in local hardware, but UW-Madison's **Center for High Throughput Computing (CHTC)** lets you access powerful shared hardware — including cutting-edge GPUs — for free, on demand. -### Cloud vs. university HPC clusters +### What is CHTC? -Most universities offer shared HPC clusters with GPUs. 
These are excellent resources — but they have tradeoffs worth understanding: +[CHTC](https://chtc.cs.wisc.edu/) is a research computing center at UW-Madison that provides large-scale computing resources to the campus community. It uses **HTCondor**, a job scheduling system developed at UW-Madison, to manage and distribute computational work across a large pool of shared machines. -| Factor | University HPC | Cloud (GCP) | -|--------|---------------|-------------| -| **Cost** | Free or subsidized | Pay per hour | -| **GPU availability** | Shared queue; wait times during peak periods and per-job runtime limits (often 24–72 hrs) that may require checkpointing long training runs | On-demand (subject to quota); jobs run as long as needed | -| **Hardware variety** | Fixed hardware refresh cycle (3–5 years) | Latest GPUs available immediately (A100, H100, L4) | -| **Scaling** | Limited by cluster size | Spin up hundreds of jobs in parallel | -| **Multi-GPU / NVLink** | Sometimes available, depends on cluster | Available on demand (e.g., A2/A3 instances with NVLink-connected multi-GPU nodes) — essential for training, fine-tuning, or serving large LLMs that don't fit in a single GPU's memory | -| **Job orchestration** | Writing scheduler scripts, packaging environments, and wiring up parallel job arrays can take days of refactoring | A few SDK calls: define a job, set hardware, call `.run()` — parallelism (e.g., tuning trials) is built in | -| **Software environment** | Module system; some clusters support Apptainer/Singularity containers — research computing staff can often help with setup | Vertex AI provides [prebuilt containers](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers) for common ML frameworks (PyTorch, XGBoost, TensorFlow); add extra packages via a `requirements` list, or bring your own Docker image for full control | -| **Power & cooling** | Paid for by the university; campus data centers often spend nearly as much energy on cooling as on 
the computers themselves | Google's data centers are roughly twice as energy-efficient as a typical campus facility — and power, cooling, and hardware failures are their problem, not yours | +Key features for ML/AI researchers: -**The short version:** use your university cluster when it has the hardware you need and the queue isn't blocking you. Use the cloud when you need hardware your cluster doesn't have, need to scale beyond what the queue allows, or need a specific software environment you can't easily get on-campus. +- **Free for UW-Madison researchers** — no billing, no credits to manage, no surprise charges. +- **GPU Lab** with NVIDIA A100 (40/80 GB), H100 (80 GB), and H200 (141 GB) GPUs — enough to fine-tune models up to ~70B parameters with quantization. +- **Docker container support** — bring your own software environment via Docker images. +- **Massive throughput** — run hundreds of independent jobs in parallel (e.g., hyperparameter sweeps). +- **Dedicated support** — CHTC's facilitation team helps researchers optimize their workflows. -Many researchers use both — develop and test on HPC, then scale to cloud for large experiments or specialized hardware. This workshop teaches the cloud side of that workflow. +### Laptop vs. CHTC -### When does model size justify cloud compute? +| Factor | Laptop | CHTC | +|--------|--------|------| +| **Cost** | Your own hardware | Free for UW researchers | +| **GPU availability** | Whatever you bought | A100, H100, H200 — shared queue | +| **Scaling** | One machine | Hundreds of jobs in parallel | +| **Software environment** | Manage yourself | Docker containers, reproducible | +| **Job runtime** | Limited by your patience | 12 hrs (short), 24 hrs (medium), 7 days (long) | +| **Storage** | Local disk | Home, staging, and SQUID filesystems | -Not every model needs cloud hardware. Here's a rough guide: +**The short version:** use your laptop for development and quick tests. 
Use CHTC when you need more hardware, more parallelism, or longer runtimes than your laptop can provide. -| Model scale | Parameters | Example models | Where to run | -|-------------|-----------|----------------|--------------| -| Small | < 10M | Logistic regression, small CNNs, XGBoost | Laptop or HPC — cloud adds overhead without much benefit | -| Medium | 10M–500M | ResNets, BERT-base, mid-sized transformers | HPC with a single GPU (RTX 2080 Ti, L40) or cloud (T4, L4) | -| Large | 500M–10B | GPT-2, LLaMA-7B, fine-tuning large transformers | HPC with A100 (40/80 GB) or cloud — both work well | -| Very large | 10B–70B | LLaMA-70B, Mixtral | HPC with H100/H200 (80–141 GB) or cloud multi-GPU nodes | -| Frontier | 70B+ | GPT-4-scale, multi-expert models | Cloud — requires multi-node clusters beyond what most HPC queues offer | - -**CHTC's [GPU Lab](https://chtc.cs.wisc.edu/uw-research-computing/gpu-lab) covers more than you might think.** The GPU Lab includes A100s (40 and 80 GB), H100s (80 GB), and H200s (141 GB) — enough VRAM to run inference or fine-tune models up to ~70B parameters on a single GPU with quantization. For many UW researchers, this hardware handles "large model" workloads without needing cloud. Jobs have time limits (12 hrs for short, 24 hrs for medium, 7 days for long jobs), so plan your training runs accordingly. - -Cloud becomes the clear choice when you need interconnected multi-GPU nodes (NVLink) for large distributed training, hardware beyond what the GPU Lab queue offers, or when queue wait times are blocking a deadline. - -### A note on cloud costs - -Cloud computing is not free, but it's worth putting costs in context: - -- **Hardware is expensive and ages fast.** A single A100 GPU costs ~ `$15,000` and is outdated within a few years. Cloud lets you rent the latest hardware by the hour. -- **You pay only for what you use.** Stop a VM and the meter stops — valuable for bursty research workloads. 
-- **Managed services save development time.** You don't have to build DAGs, write scheduling logic, package custom containers, or maintain orchestration infrastructure — GCP handles that plumbing so you can focus on the ML. -- **Budgets and alerts keep you safe.** GCP billing dashboards and budget alerts help prevent surprise bills. We cover cleanup in [Episode 9](09-Resource-management-cleanup.md). +### When does model size justify CHTC? -The key habit: choose the right machine size, stop resources when idle, and monitor spending. We'll reinforce this throughout. +Not every model needs CHTC. Here's a rough guide: -::::::::::::::::::::::::::::::::::::: callout +| Model scale | Parameters | Example models | Where to run | +|-------------|-----------|----------------|--------------| +| Small | < 10M | Logistic regression, small CNNs, XGBoost | Laptop — CHTC adds overhead without much benefit | +| Medium | 10M–500M | ResNets, BERT-base, mid-sized transformers | CHTC with a single GPU (T4, L40, A100) | +| Large | 500M–10B | GPT-2, LLaMA-7B, fine-tuning large transformers | CHTC GPU Lab with A100 (40/80 GB) | +| Very large | 10B–70B | LLaMA-70B, Mixtral | CHTC GPU Lab with H100/H200 (80–141 GB) | +| Frontier | 70B+ | GPT-4-scale, multi-expert models | Cloud platforms — requires multi-node clusters beyond what most HTC queues offer | -### For UW-Madison researchers +**CHTC's [GPU Lab](https://chtc.cs.wisc.edu/uw-research-computing/gpu-lab) covers more than you might think.** It includes A100s (40 and 80 GB), H100s (80 GB), and H200s (141 GB) — enough VRAM to run inference or fine-tune models up to ~70B parameters on a single GPU with quantization. For many UW researchers, this hardware handles "large model" workloads without needing cloud. 
-UW-Madison offers reduced-overhead cloud billing, NIH STRIDES discounts, Google Cloud research credits (up to `$5,000`), free on-campus GPUs via [CHTC](https://chtc.cs.wisc.edu/), and dedicated support from the [Public Cloud Team](mailto:cloud-services@cio.wisc.edu). See the [UW-Madison Cloud Resources](../uw-madison-cloud-resources.html) page for details. +Cloud becomes the clear choice when you need interconnected multi-GPU nodes (NVLink) for large distributed training, or hardware beyond what the GPU Lab queue offers. -:::::::::::::::::::::::::::::::::::::::::::::::: +### A note on cost (or lack thereof) -Google Cloud Platform (GCP) is one of several clouds that supports this. The rest of this episode explains what GCP offers for ML/AI and how the pieces fit together. +Unlike cloud platforms, **CHTC is free for UW-Madison researchers**. There are no per-hour charges, no billing accounts, and no surprise invoices. The "cost" is shared resources: be a good citizen, request only what you need, and clean up after yourself. We'll cover resource etiquette in [Episode 9](09-Resource-management-best-practices.md). -## What GCP provides for ML/AI +## What CHTC provides for ML/AI -GCP gives you three things that matter for applied ML/AI research: +CHTC gives you three things that matter for applied ML/AI research: -**Flexible compute.** You pick the hardware that fits your workload: +**Flexible compute.** You request the hardware that fits your workload: - **CPUs** for lightweight models, preprocessing, or feature engineering. -- **GPUs** (NVIDIA T4, L4, V100, A100, H100) for training deep learning models. For help choosing, see [Compute for ML](../compute-for-ML.html). -- **TPUs** (Tensor Processing Units) — Google's custom hardware for matrix-heavy workloads. TPUs work best with TensorFlow and JAX; PyTorch support is improving but still less mature. +- **GPUs** (NVIDIA A100, H100, H200, L40, T4) for training deep learning models. 
For help choosing, see [Compute for ML](../compute-for-ML.html). -**Scalable storage.** Google Cloud Storage (GCS) buckets give you a place to store datasets, scripts, and model artifacts that any job or notebook can access. Think of it as a shared filesystem for your project. +**Scalable storage.** CHTC provides multiple storage tiers for different use cases: -**Managed ML/AI services.** Vertex AI is Google's ML/AI platform. It wraps compute, storage, and tooling into a set of services designed for ML/AI workflows — managed notebooks, training jobs, hyperparameter tuning, model hosting, and access to foundation models like Gemini. +- `/home` — small files, submit scripts, code (~20 GB quota). +- `/staging` — larger datasets and outputs transferred to/from jobs. +- **SQUID** — large, read-only datasets shared across many jobs via HTTP. -## How the pieces fit together: Vertex AI +**Containerized environments.** HTCondor runs your jobs inside Docker containers, so you get a fully reproducible software environment (PyTorch, XGBoost, TensorFlow, etc.) without installing anything on the shared machines. -Google Cloud has many products and brand names. Here are the ones you'll use in this workshop and how they relate: +## How the pieces fit together: HTCondor + +Here are the key components you'll use in this workshop: | Term | What it is | |------|-----------| -| **GCP** | Google Cloud Platform — the overall cloud: compute, storage, networking. | -| **Vertex AI** | Google's ML platform — notebooks, training jobs, tuning, model hosting. Everything below lives under this umbrella. | -| **Workbench** | Managed Jupyter notebooks that run on a Compute Engine VM. Your interactive environment. | -| **Training & tuning jobs** | How you run code on Vertex AI hardware. You submit a script and a machine spec; Vertex AI provisions the VM, runs it, and shuts it down. 
The SDK offers several flavors — `CustomTrainingJob` (Ep 4–5), `HyperparameterTuningJob` (Ep 6) — and the CLI equivalent is `gcloud ai custom-jobs` (Ep 8). | -| **Cloud Storage (GCS)** | Object storage for files. Similar to AWS S3. | -| **Compute Engine** | Virtual machines you configure with CPUs, GPUs, or TPUs. Workbench and training jobs run on Compute Engine under the hood. | -| **Gemini** | Google's family of large language models, accessed through the Vertex AI API. | +| **CHTC** | Center for High Throughput Computing — UW-Madison's research computing center. | +| **HTCondor** | The job scheduling system that manages compute resources. You submit jobs; HTCondor finds machines to run them. | +| **Submit node** | The server you SSH into. This is where you write code, prepare data, and submit jobs. It is *not* for heavy computation. | +| **Execute node** | The machine where your job actually runs. HTCondor assigns one automatically based on your resource request. | +| **Submit file** (`.sub`) | A configuration file that tells HTCondor what to run, what resources you need, and where to find your data. | +| **Docker container** | A packaged software environment. You specify a Docker image in your submit file, and HTCondor runs your code inside it. | +| **DAGMan** | HTCondor's workflow manager for multi-step or dependent jobs (used in [Episode 8](08-Advanced-HTCondor-workflows.md)). | For a full list of terms, see the [Glossary](../learners/reference.md). -## The notebook-as-controller pattern - -The central idea of this workshop is simple: you work in a lightweight **Vertex AI Workbench** notebook — a small, cheap VM — and use the **Vertex AI Python SDK** to dispatch work to managed services. The notebook itself does not run heavy compute. Instead, it orchestrates: +## The submit-node pattern -- **Training jobs** (Eps 4–5) — run your script on auto-provisioned GPU hardware, then shut down when complete. 
-- **Hyperparameter tuning jobs** (Ep 6) — search a parameter space across parallel trials and return the best configuration. -- **Cloud Storage** (Ep 3) — shared persistent storage for datasets, model artifacts, logs, and results. -- **Gemini API** (Ep 7) — embeddings and generation for Retrieval-Augmented Generation (RAG) pipelines. +The central idea of this workshop is simple: you work on a **submit node** — a shared server — and use **HTCondor submit files** to dispatch work to execute nodes. The submit node itself does not run heavy compute. Instead, it orchestrates: -All of these are accessed via SDK calls from the notebook. This keeps costs low (the notebook VM stays small) and keeps your work reproducible (each job is a clean, logged run on dedicated hardware). +- **Training jobs** (Eps 4–5) — run your script on a machine with the hardware you requested, then release resources when complete. +- **Hyperparameter tuning** (Ep 6) — search a parameter space across parallel jobs and collect results. +- **Data staging** (Ep 3) — prepare and transfer data between storage tiers and job execution. +- **RAG pipelines** (Ep 7) — run embedding and generation workloads as HTCondor jobs. -![Notebook as controller — overview of workshop architecture](https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/images/diagram4_notebook_as_controller.svg){alt="Architecture diagram showing a Workbench notebook at the center orchestrating four managed services via SDK calls: Training Jobs (Eps 4-5), HP Tuning Jobs (Ep 6), Cloud Storage (Ep 3), and Gemini API (Ep 7)."} +All of these are managed through submit files and command-line tools. This keeps the submit node free for other users and ensures your work is reproducible (each job is a clean, logged run on dedicated hardware). 
::::::::::::::::::::::::::::::::::::: callout -### Console, notebooks, or CLI — your choice - -This workshop uses the **GCP web console** and **Workbench notebooks** for most tasks because they're visual and easy to follow for beginners. But nearly everything we do can also be done from the **`gcloud` command-line tool** — submitting training jobs, managing buckets, checking quotas. [Episode 8](08-CLI-workflows.md) covers the CLI equivalents. If you prefer terminal-based workflows or need to automate jobs in scripts and CI/CD pipelines, that episode shows you how. +### Terminal-based workflow -**One important caveat:** whether you use the console, notebooks, or CLI, resources you create (VMs, training jobs, endpoints) keep running and billing until you explicitly stop them. There's no automatic shutdown. We cover cleanup habits in [Episode 9](09-Resource-management-cleanup.md), but the short version is: always check for running resources before you walk away. +Unlike cloud platforms with web consoles, CHTC workflows are entirely **terminal-based**. You'll SSH into a submit node, write submit files in a text editor, and manage jobs with commands like `condor_submit`, `condor_q`, and `condor_status`. If you're comfortable with the command line, you'll feel right at home. If you're new to it, don't worry — we'll walk through every step. :::::::::::::::::::::::::::::::::::::::::::::::: @@ -140,8 +128,8 @@ This workshop uses the **GCP web console** and **Workbench notebooks** for most Think about how you currently run ML experiments: - What hardware do you use — laptop, HPC cluster, cloud? -- What's the biggest infrastructure pain point in your workflow (GPU access, environment setup, data transfer, cost)? -- What would you most like to offload to a managed service? +- What's the biggest infrastructure pain point in your workflow (GPU access, environment setup, data transfer, runtime limits)? +- What would you most like to offload to shared compute? 
Take 3–5 minutes to discuss with a partner or share in the workshop chat. @@ -149,9 +137,10 @@ Take 3–5 minutes to discuss with a partner or share in the workshop chat. ::::::::::::::::::::::::::::::::::::: keypoints -- Cloud platforms let you rent hardware on demand instead of buying or waiting for shared resources. -- GCP organizes its ML/AI services under Vertex AI — notebooks, training jobs, tuning, and model hosting. -- The notebook-as-controller pattern keeps your notebook cheap while offloading heavy training to dedicated Vertex AI jobs. -- Everything in this workshop can also be done from the `gcloud` CLI ([Episode 8](08-CLI-workflows.md)). +- CHTC provides free, shared compute resources for UW-Madison researchers — no billing required. +- HTCondor schedules your jobs onto available hardware, including GPUs, automatically. +- The submit-node pattern keeps the shared login server free while your jobs run on dedicated execute nodes. +- CHTC's GPU Lab includes A100, H100, and H200 GPUs — sufficient for most research ML/AI workloads. +- Everything in this workshop uses terminal-based workflows with HTCondor submit files. :::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/02-Connecting-to-CHTC.md b/episodes/02-Connecting-to-CHTC.md new file mode 100644 index 00000000..0768dd54 --- /dev/null +++ b/episodes/02-Connecting-to-CHTC.md @@ -0,0 +1,294 @@ +--- +title: "Connecting to CHTC" +teaching: 20 +exercises: 10 +--- + +:::::::::::::::::::::::::::::::::::::: questions + +- How do I connect to CHTC and start working on a submit node? +- What can (and can't) I do on the submit node? +- What tools do I need to know to check the status of HTCondor and my jobs? + +:::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: objectives + +- Log in to a CHTC submit node via SSH. +- Navigate the CHTC filesystem and understand the purpose of the `/home` directory. 
+- Distinguish between the submit node (controller) and execute nodes (workers). +- Run basic HTCondor commands to inspect the pool and job queue. +- Set up a working environment by cloning the workshop repository and downloading data. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## Connecting to a submit node + +All work in this workshop begins on a **submit node** — a shared server that acts as your control plane for launching HTCondor jobs. Think of it the same way you would think of a lightweight controller notebook in a cloud workflow: you use it to prepare code, stage data, and submit jobs, but you never run heavy computation on it directly. + +CHTC provides several submit nodes. For this workshop, connect to one of the access points (your instructor will confirm which one to use): + +```bash +ssh username@ap2002.chtc.wisc.edu +``` + +Replace `username` with your UW-Madison NetID. You will authenticate with your UW-Madison password (and possibly Duo two-factor authentication, depending on the server configuration). + +::::::::::::::::::::::::::::::::::::: callout + +#### First time connecting? + +The first time you SSH into a new server, you'll see a message like: + +``` +The authenticity of host 'ap2002.chtc.wisc.edu' can't be established. +ED25519 key fingerprint is SHA256:... +Are you sure you want to continue connecting (yes/no)? +``` + +Type `yes` and press Enter. This adds the server's fingerprint to your `~/.ssh/known_hosts` file so you won't be asked again. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: callout + +#### Connecting from different operating systems + +- **macOS / Linux:** Open a terminal and use the `ssh` command shown above. +- **Windows:** Use the built-in **Windows Terminal** or **PowerShell** (both include an SSH client on Windows 10+). Alternatively, install [PuTTY](https://www.chiark.greenend.org.uk/~sgtatham/putty/) or [MobaXterm](https://mobaxterm.mobatek.net/). 
+- **Chromebook / tablet:** Use a browser-based SSH client or install an SSH app from your platform's store. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## Navigating the CHTC filesystem + +Once logged in, you land in your **home directory** (`/home/username`). This is your primary workspace on the submit node. + +```bash +pwd +# /home/username + +ls -la +``` + +### Home directory basics + +Your `/home` directory has a quota of approximately **20 GB**. It is designed for: + +- Submit files (`.sub`) and job scripts +- Small code repositories +- Configuration files and logs + +It is **not** designed for large datasets. For bigger files, CHTC provides `/staging` and **SQUID** — we'll cover those in [Episode 3](03-Data-management.md). + +Check your current disk usage and quota: + +```bash +quota -vs +``` + +::::::::::::::::::::::::::::::::::::: callout + +#### CHTC storage tiers at a glance + +| Location | Purpose | Typical quota | Persists between jobs? | +|----------|---------|---------------|----------------------| +| `/home` | Code, submit files, small inputs/outputs | ~20 GB | Yes | +| `/staging` | Large datasets, model checkpoints | ~200 GB+ (by request) | Yes | +| **SQUID** (`/squid`) | Large read-only data shared across many jobs | By request | Yes | +| Job working directory | Temporary scratch space on execute node | Varies | No — cleaned up after job completes | + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## Submit node vs. 
execute node + +This distinction is central to everything we do in this workshop: + +| | Submit node | Execute node | +|--|-------------|-------------| +| **What it is** | The shared server you SSH into | A worker machine assigned by HTCondor | +| **Who uses it** | Many researchers, simultaneously | One job (or a few), temporarily | +| **What to do here** | Edit code, write submit files, submit jobs, check job status | Run your actual computation (training, inference, preprocessing) | +| **What NOT to do** | Run training loops, load large models, GPU-intensive work | Nothing — HTCondor manages these automatically | + +The submit node is a **shared resource**. Running heavy computation on it slows things down for every other user on the same server. HTCondor enforces this by design: you describe what you need in a submit file, and HTCondor finds an execute node with matching resources (CPUs, memory, GPUs) to run your job. + +This is the same "controller" pattern used in cloud workflows — a lightweight orchestrator delegates expensive work to dedicated compute. The difference is that on CHTC, the "controller" is a shared server rather than a personal VM, and the "compute" is managed by HTCondor rather than a cloud API. + +## Basic HTCondor commands + +Before we submit any jobs, let's get familiar with the two most important HTCondor commands for checking the state of the system. + +### `condor_status` — What resources are available? + +```bash +condor_status +``` + +This lists the **slots** in the HTCondor pool — the units of CPU, memory, and GPU that execute nodes offer to jobs (a single execute node can provide many slots). You'll see columns for the machine name, operating system, architecture, state (e.g., `Unclaimed`, `Claimed`), and resource details. + +To see a summary instead of the full list: + +```bash +condor_status -total +``` + +To check what GPU resources are available: + +```bash +condor_status -constraint 'TotalGPUs > 0' -compact +``` + +### `condor_q` — What jobs are in the queue? 
+ +```bash +condor_q +``` + +This shows **your** jobs in the queue. Right now it should be empty — we haven't submitted anything yet. You'll see output like: + +``` +-- Schedd: ap2002.chtc.wisc.edu : <...> +OWNER BATCH_NAME SUBMITTED DONE RUN IDLE TOTAL JOB_IDS + +Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended +``` + +To see jobs from all users (useful for understanding how busy the system is): + +```bash +condor_q -all +``` + +::::::::::::::::::::::::::::::::::::: callout + +#### Other useful HTCondor commands + +You'll use these in later episodes, but here's a preview: + +| Command | Purpose | +|---------|---------| +| `condor_submit job.sub` | Submit a job described in `job.sub` | +| `condor_q -hold` | Show held jobs and the reason they're held | +| `condor_rm <job_id>` | Remove (cancel) a job | +| `condor_history` | Show your completed jobs | +| `condor_status -gpus` | Show GPU availability across the pool | + +:::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 1: Explore the HTCondor pool + +Run `condor_status -total` and answer the following: + +1. How many total slots (machines) are in the pool? +2. How many are currently `Unclaimed` (idle) vs. `Claimed` (running a job)? +3. Based on what you see, is the cluster busy or relatively free right now? + +:::::::::::::::: solution + +The output of `condor_status -total` shows a summary table with rows for each machine state. Look for: + +- **Total**: the total number of slots available. +- **Unclaimed**: slots available to run new jobs. +- **Claimed**: slots currently running someone's job. + +If most slots are `Unclaimed`, the cluster has plenty of capacity. If most are `Claimed`, your jobs may wait in the queue before starting. This is normal on a shared system — HTCondor manages the queue fairly. 
+ +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## Setting up your working environment + +Now let's prepare the files we'll use for the rest of the workshop. We'll clone the workshop repository and verify that everything is in place. + +### Clone the workshop repository + +```bash +cd ~ +git clone https://github.com/qualiaMachine/Intro_GCP_for_ML.git +cd Intro_GCP_for_ML +ls +``` + +This repository contains: + +- **Submit files** (`.sub`) for each episode's HTCondor jobs. +- **Training scripts** (Python) that will run on execute nodes. +- **Sample data** and configuration files. + +### Download the workshop dataset + +Some episodes require a small dataset that isn't stored in the Git repository. Download it into your working directory: + +```bash +cd ~/Intro_GCP_for_ML +wget -q https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/data/penguins.csv -P data/ +ls -lh data/ +``` + +::::::::::::::::::::::::::::::::::::: callout + +#### Keep your home directory tidy + +Your `/home` quota is limited. A few good habits: + +- **Don't store large datasets in `/home`.** Use `/staging` for anything over a few hundred MB (covered in [Episode 3](03-Data-management.md)). +- **Clean up job output files** (`*.log`, `*.out`, `*.err`) after you've reviewed them. +- **Remove old Conda/pip caches** if you install packages locally. These can grow quickly. +- **Check usage regularly** with `quota -vs`. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +### Verify your setup + +Run a quick check to make sure everything is ready: + +```bash +echo "Home directory: $HOME" +echo "Current directory: $(pwd)" +echo "Files in repo:" +ls ~/Intro_GCP_for_ML/ +echo "" +echo "HTCondor status:" +condor_q +``` + +You should see the cloned repository contents and an empty job queue. If `condor_q` returns an error, double-check that you're on a CHTC submit node (not a different server).
+ +:::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 2: Submit node etiquette + +A colleague tells you they ran a deep learning training script directly on the submit node because "it was faster than writing a submit file." What problems could this cause, and what should they do instead? + +:::::::::::::::: solution + +Running heavy computation on the submit node causes several problems: + +- **It slows down the server for everyone.** The submit node is shared by many researchers. A single training job can consume most of the CPU and memory, making the server sluggish for others trying to edit files, submit jobs, or check job status. +- **It won't have GPU access.** Submit nodes typically don't have GPUs attached, so the training would run on CPU only — much slower than using a GPU execute node. +- **It's not reproducible.** Running interactively means there's no submit file to re-run later, no automatic logging, and no record of what resources were used. +- **CHTC may kill the process.** Administrators monitor submit nodes and may terminate long-running or resource-heavy processes without warning. + +**What to do instead:** Write a submit file (`.sub`) that describes the job's resource needs and let HTCondor run it on an appropriate execute node. We'll do exactly this starting in [Episode 4](04-Training-models-on-CHTC.md). + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## What's next + +You're now connected to CHTC, familiar with the filesystem, and have the workshop materials ready. In the next episode, we'll set up data storage and learn how to move data between the submit node, `/staging`, and your HTCondor jobs. + +::::::::::::::::::::::::::::::::::::: keypoints + +- Connect to CHTC via SSH to a submit node (e.g., `ap2002.chtc.wisc.edu`) — this is your controller for the workshop. +- The submit node is for lightweight work only: editing code, writing submit files, and managing jobs.
Heavy computation goes to execute nodes via HTCondor. +- Use `condor_status` to check available resources and `condor_q` to check your job queue. +- Your `/home` directory (~20 GB) holds code and submit files; larger data belongs in `/staging` or SQUID. +- Clone the workshop repository to get submit files and training scripts for the remaining episodes. + +:::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/02-Notebooks-as-controllers.md b/episodes/02-Notebooks-as-controllers.md deleted file mode 100644 index 830ae799..00000000 --- a/episodes/02-Notebooks-as-controllers.md +++ /dev/null @@ -1,210 +0,0 @@ ---- -title: "Notebooks as Controllers" -teaching: 20 -exercises: 10 ---- - -:::::::::::::::::::::::::::::::::::::: questions - -- How do you set up and use Vertex AI Workbench notebooks for machine learning tasks? -- How can you manage compute resources efficiently using a "controller" notebook approach in GCP? - -:::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: objectives - -- Describe how Vertex AI Workbench notebooks fit into ML/AI workflows on GCP. -- Set up a Jupyter-based Workbench Instance as a lightweight controller to manage compute tasks. -- Configure a Workbench Instance with appropriate machine type, labels, and idle shutdown for cost-efficient orchestration. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -## Setting up our notebook environment -Google Cloud Workbench provides JupyterLab-based environments that can be used to orchestrate ML/AI workflows. In this workshop, we will use a **Workbench Instance**—the recommended option going forward, as other Workbench environments are being deprecated. - -> Workbench Instances come with JupyterLab 3 pre-installed and are configured with GPU-enabled ML frameworks (TensorFlow, PyTorch, etc.), making it easy to start experimenting without additional setup. 
Learn more in the [Workbench Instances documentation](https://cloud.google.com/vertex-ai/docs/workbench/instances/introduction). - -## Using the notebook as a controller -The notebook instance functions as a *controller* to manage more resource-intensive tasks. By selecting a modest machine type (e.g., `n2-standard-2`), you can perform lightweight operations locally in the notebook while using the **Vertex AI Python SDK** to launch compute-heavy jobs on larger machines (e.g., GPU-accelerated) when needed. - -This approach minimizes costs while giving you access to scalable infrastructure for demanding tasks like model training, batch prediction, and hyperparameter tuning. - -One practical advantage of Workbench notebooks: **authentication is automatic.** A Workbench VM inherits the permissions of its attached service account, so calls to Cloud Storage, Vertex AI, and the Gemini API work with no extra credential setup — no API keys or login commands needed. If you later run the same code from your laptop or an HPC cluster, you'll need to set up credentials separately (see the [GCP authentication docs](https://cloud.google.com/docs/authentication)). (Prefer working from a terminal? [Episode 8: CLI Workflows](08-CLI-workflows.md) covers how to do everything in this workshop using `gcloud` commands instead of notebooks.) - -We will follow these steps to create our first Workbench Instance: - -### 1. Navigate to Workbench - -- Open the **Google Cloud Console** ([console.cloud.google.com](https://console.cloud.google.com/)) — this is the web dashboard where you manage all GCP resources. Search for "Workbench." -- Click the "Instances" tab (this is the supported path going forward). - -### 2. Create a new Workbench Instance - -#### Initial settings - -- Click **Create New** near the top of the Workbench page -- **Name**: Use the convention `lastname-purpose` (e.g., `doe-workshop`). GCP resource names only allow lowercase letters, numbers, and hyphens. 
We'll use a single instance for training, tuning, RAG, and more, so `workshop` is a good general-purpose label. -- **Region**: Select `us-central1`. When we create a storage bucket in [Episode 3](03-Data-storage-and-access.md), we'll use the same region — keeping compute and storage co-located avoids cross-region transfer charges and keeps data access fast. -- **Zone:** `us-central1-a` (or another zone in `us-central1`, like `-b` or `-c`) - - If capacity or GPU availability is limited in one zone, switch to another zone in the same region. -- **NVIDIA T4 GPU:** Leave unchecked for now - - We will request GPUs for training jobs separately. Attaching here increases idle costs. -- **Apache Spark and BigQuery Kernels:** Leave unchecked - - BigQuery kernels let you run SQL analytics directly in a notebook, but we won't need them in this workshop. Leave unchecked to avoid pulling extra container images. -- **Network in this project:** If you're working in a shared workshop environment, select the network provided by your administrator (shared environments typically do not allow using external or default networks). If using a personal GCP project, the default network is fine. -- **Network / Subnetwork:** Leave as pre-filled. -![Notebook settings (part 1)](https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/images/new-instance-settings1.jpg){alt="Notebook settings (part1)"} - -#### Advanced settings: Details (tagging) - -- **IMPORTANT:** Open the "Advanced options" menu next. 
- - **Labels (required for cost tracking):** Under the Details menu, add the following tags (all lowercase) so that you can track the total cost of your activity on GCP later: - - `name = firstname-lastname` - - `purpose = workshop` - -![Required tags for notebook.](https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/images/new-instance-tags.jpg){alt="Screenshot showing required tags for notebook"} - -#### Advanced Settings: Environment - -Leave environment settings at their defaults for this workshop. Workbench uses JupyterLab 3 by default with NVIDIA GPU drivers, CUDA, and common ML frameworks preinstalled. For future reference, you can optionally select JupyterLab 4, provide a custom Docker image, or specify a post-startup script (`gs://path/to/script.sh`) to auto-configure the instance at boot. - -#### Advanced settings: Machine Type - -- **Machine type**: Select a small machine (e.g., `n2-standard-2`, ~ `$0.07`/hr) to act as the controller. - - This keeps costs low while you delegate heavy lifting to training jobs. - - For guidance on common machine types and their costs, see [Compute for ML](../compute-for-ML.html). For help deciding when you need cloud hardware at all, see [When does model size justify cloud compute?](01-Introduction.md#when-does-model-size-justify-cloud-compute) in Episode 1. - -- **Set idle shutdown**: To save on costs when you aren't doing anything in your notebook, lower the default idle shutdown time to **60 (minutes)**. - -![Enable Idle Shutdown](https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/images/new-instance-idleshutdown.jpg){alt="Set Idle Shutdown"} - -#### Advanced Settings: Disks - -Leave disk settings at their defaults for this workshop. Each Workbench Instance has two disks: a **boot disk** (100 GB — holds the OS and libraries) and a **data disk** (150 GB default — holds your datasets and outputs). Both use Balanced Persistent Disks. 
Keep "Delete to trash" unchecked so deleted files free space immediately. - -**Rule of thumb:** allocate ≈ 2× your dataset size for the data disk, and keep bulk data in Cloud Storage (`gs://`) rather than on local disk — PDs cost ~ `$0.10`/GB/month vs. ~ `$0.02`/GB/month for Cloud Storage. - -::::::::::::::::::::::::::::::::::::: callout - -#### Disk sizing and cost details - -- **Boot disk:** Rarely needs resizing. Increase to 150–200 GB only for large custom environments or multiple frameworks. -- **Data disk:** Use SSD PD only for high-I/O workloads. Disks can be resized anytime without downtime, so start small and expand when needed. -- **Cost comparison:** A 200 GB dataset costs ~ `$24`/month on a PD but only ~ `$5`/month in Cloud Storage. -- **Pricing:** [Persistent Disk pricing](https://cloud.google.com/compute/disks-image-pricing) · [Cloud Storage pricing](https://cloud.google.com/storage/pricing) - -:::::::::::::::::::::::::::::::::::::::::::::::: - - -#### Advanced settings: Networking - External IP Access - -- **Assign External IP address**: Leave this option checked — you need an external IP. - -### Create notebook - -- Click **Create** to create the instance. Provisioning typically takes 3–5 minutes. You'll see the status change from "Provisioning" to "Active" with a green checkmark. While waiting, work through the challenges below. - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 1: Notebook Roles - -Your university provides different compute options: laptops, on-prem HPC, and GCP. - -- What role does a **Workbench Instance notebook** play compared to an HPC login node or a laptop-based JupyterLab? -- Which tasks should stay in the notebook (lightweight control, visualization) versus being launched to larger cloud resources? - -:::::::::::::::: solution - -The notebook serves as a lightweight control plane. - -- Like an HPC login node, it is not meant for heavy computation. 
-- Suitable for small preprocessing, visualization, and orchestrating jobs. -- Resource-intensive tasks (training, tuning, batch jobs) should be submitted to scalable cloud resources (GPU/large VM instances) via the Vertex AI SDK. - -::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 2: Controller Cost Estimate - -Your controller notebook uses an `n2-standard-2` instance (~ `$0.07`/hr — see [Compute for ML](../compute-for-ML.html) for other common machine types and costs). - -- Estimate the monthly cost if you use it 8 hours/day, 5 days/week, with idle shutdown enabled. -- Compare that to leaving it running 24/7 for the same month. - -:::::::::::::::: solution - -- **With idle shutdown:** 8 hrs × 5 days × 4 weeks = 160 hrs → 160 × `$0.07` ≈ **`$11.20`/month** -- **Running 24/7:** 24 hrs × 30 days = 720 hrs → 720 × `$0.07` ≈ **`$50.40`/month** -- Idle shutdown saves you ~ `$39`/month on a single small controller instance. The savings are even larger for bigger machine types. - -::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::: - -### Managing your instance - -You don't have to wait for idle shutdown — you can **manually stop** your instance anytime from the Workbench Instances list by selecting the checkbox and clicking **Stop**. To resume work, click **Start**. You only pay for compute while the instance is running (disk charges continue while stopped). - -To permanently remove an instance, select it and click **Delete**. Full cleanup is covered in Episode 9. - -### Managing training and tuning with the controller notebook -In the following episodes, we will use the **Vertex AI Python SDK (`google-cloud-aiplatform`)** from this notebook to submit compute-heavy tasks on more powerful machines. Examples include: - -- Training a model on a GPU-backed instance. -- Running hyperparameter tuning jobs managed by Vertex AI. 
- -Here's how the notebook, jobs, and storage connect: - -![Training and tuning workflow](https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/images/diagram1_training_and_tuning.svg){alt="Architecture diagram showing how a lightweight Workbench notebook uses the Vertex AI SDK to launch training jobs and HP tuning jobs on powerful GPUs, with all artifacts stored in GCS."} - -This pattern keeps costs low by running your notebook on a modest VM while only incurring charges for larger resources when they are actively in use. - -::::::::::::::::::::::::::::::::::::: callout - -#### You don't need a notebook to use Vertex AI - -We start with **Vertex AI Workbench** notebooks because they give you authenticated access to buckets, training jobs, and other GCP services out of the box — no credential setup required. The Console UI also lets you see and manage running jobs directly, which matters when you're learning: accidentally submitting a duplicate training job is easy to spot and cancel in the Console, harder to notice from a terminal. - -[Episode 8](08-CLI-workflows.md) introduces the **`gcloud` CLI** once these concepts are familiar. **Notebooks are not required** for any of the workflows covered here — everything we do through the Python SDK can also be done from: - -- A **plain Python script** run from your terminal or an HPC scheduler. -- The **`gcloud` CLI** (e.g., `gcloud ai custom-jobs create ...`). -- A **CI/CD pipeline** (GitHub Actions, Cloud Build, etc.). - -The real work happens in the training scripts and SDK calls — the notebook is just a convenient starting point. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: callout - -#### Troubleshooting - -- **VM stuck in "Provisioning" for more than 5 minutes?** Try deleting the instance and re-creating it in a different zone within the same region (e.g., `us-central1-b` instead of `us-central1-a`). 
-- **Instance stopped unexpectedly?** Check the idle shutdown setting — it may have timed out. Restart from the Instances list by clicking **Start**. -- **Can't see the project or get permission errors?** Ensure you're signed into the correct Google account and that IAM permissions have propagated (this can take a few minutes after initial setup). - -:::::::::::::::::::::::::::::::::::::::::::::::: - -### Load pre-filled Jupyter notebooks -Once your instance shows as "Active" (green checkmark), click **Open JupyterLab**. From the Launcher, select **Python 3 (ipykernel)** under Notebook to create a new notebook — we don't need the TensorFlow or PyTorch kernels yet, as those are used in later episodes for training jobs. - -Run the following command to clone the lesson repository. This contains pre-filled notebooks for each episode and the training scripts we'll use later, so you won't need to write boilerplate code from scratch. - -```sh -!git clone https://github.com/qualiaMachine/Intro_GCP_for_ML.git -``` - -Then, navigate to `/Intro_GCP_for_ML/notebooks/03-Data-storage-and-access.ipynb` to begin the next episode. - -::::::::::::::::::::::::::::::::::::: keypoints - -- Use a small Workbench Instance as a controller — delegate heavy training to Vertex AI jobs. -- Workbench VMs inherit service account permissions automatically, simplifying authentication. -- Choose the same region for your Workbench Instance and storage bucket to avoid extra transfer costs. -- Apply labels to all resources for cost tracking, and enable idle auto-stop to avoid surprise charges. 
- -:::::::::::::::::::::::::::::::::::::::::::::::: - diff --git a/episodes/03-Data-management.md b/episodes/03-Data-management.md new file mode 100644 index 00000000..cb7d2ee5 --- /dev/null +++ b/episodes/03-Data-management.md @@ -0,0 +1,474 @@ +--- +title: "Data Management on CHTC" +teaching: 30 +exercises: 15 +--- + +:::::::::::::::::::::::::::::::::::::: questions + +- What storage options are available on CHTC, and which should I use for different types of data? +- How does HTCondor transfer files to and from jobs? +- How do I prepare datasets like the Titanic CSV files for use in HTCondor jobs? +- What are CHTC's data policies, quotas, and best practices? + +:::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: objectives + +- Identify the three main CHTC storage tiers (`/home`, `/staging`, and SQUID) and explain when to use each one. +- Use HTCondor's `transfer_input_files` and `transfer_output_files` directives to move data into and out of jobs. +- Check disk usage and manage files within CHTC quota limits. +- Prepare the Titanic dataset for training jobs submitted through HTCondor. +- Apply best practices for file sizes, storage etiquette, and sensitive data on shared research infrastructure. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +ML/AI projects depend on data, so understanding where to store files and how to get them into your jobs is essential. Unlike cloud platforms such as GCP or AWS, CHTC does not charge for storage — but shared resources come with **quotas** and **community etiquette** expectations that are just as important to follow. + +> #### Consult your institution before handling sensitive data on CHTC +> **Do not store restricted or sensitive data (HIPAA, FERPA, proprietary) on CHTC systems unless explicitly approved by your institution's IT or compliance office.** CHTC general-use systems are not certified for regulated data. 
If you work with sensitive datasets, contact the [CHTC facilitation team](https://chtc.cs.wisc.edu/uw-research-computing/get-help) to discuss options. + +## CHTC storage tiers overview + +CHTC provides three main storage locations. Each is designed for a different purpose and has different size limits. + +| Location | Typical Quota | Purpose | Accessible from jobs? | +|-----------|--------------|---------|----------------------| +| `/home/<username>` | ~20 GB | Code, scripts, submit files, small config files | Yes (via HTCondor file transfer from your submit directory) | +| `/staging/<username>` | ~100+ GB (by request) | Larger datasets transferred to/from jobs | Yes (via `transfer_input_files`) | +| SQUID (`/squid/<username>`) | ~20 GB (by request) | Large, **read-only** data shared across many jobs via HTTP | Yes (via HTTP URL in `transfer_input_files`) | + +::::::::::::::::::::::::::::::::::::: callout + +#### No storage charges, but quotas matter + +Unlike GCS buckets where you pay per GB per month, CHTC storage is free. However, these are **shared filesystems** — exceeding your quota or storing unnecessary files affects every researcher on the cluster. Treat quotas the way you would treat a shared lab refrigerator: take only the space you need, label your things, and clean up when you are done. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +### `/home` — your home directory + +Your home directory at `/home/<username>` is where you land when you log in to a CHTC submit server. It is suitable for: + +- Submit files (`.sub`) +- Executable scripts (`.sh`, `.py`) +- Small configuration and parameter files +- Small output logs + +The default quota is approximately **20 GB**. Do **not** store large datasets here — it is backed by a shared network filesystem and is not designed for high-throughput I/O. + +### `/staging` — for larger data + +The `/staging/<username>` directory is intended for datasets that are too large for `/home` but need to be transferred into jobs.
Typical use cases: + +- Training datasets (CSV, Parquet, image archives) +- Pre-trained model weights +- Large output files produced by jobs + +You must **request** a `/staging` directory by emailing [chtc@cs.wisc.edu](mailto:chtc@cs.wisc.edu) or filling out a quota request. Default allocations start around 100 GB and can be increased for justified research needs. + +Files in `/staging` are referenced in your submit file using `transfer_input_files` and are copied to the job's working directory at runtime. + +### SQUID — large read-only data via HTTP + +SQUID (located at `/squid/<username>` on the submit server) is a web-cache-based system for distributing **large, read-only** files to many jobs simultaneously. Data placed in SQUID is served via HTTP, so it is ideal for: + +- Reference datasets shared across hundreds or thousands of jobs +- Pre-trained model files that every job needs but never modifies + +To use SQUID data in a job, reference it by its HTTP URL: + +``` +transfer_input_files = http://proxy.chtc.wisc.edu/SQUID/<username>/my_large_file.tar.gz +``` + +SQUID is **not** for output or frequently changing files. For complete details, see the [CHTC large data guide](https://chtc.cs.wisc.edu/uw-research-computing/file-avail-largedata). + +::::::::::::::::::::::::::::::::::::: callout + +#### Which storage tier should I use? + +A quick rule of thumb based on file size: + +- **< 100 MB per file** and < 500 MB total per job: `/home` is fine. +- **100 MB – a few GB per file**: use `/staging` with `transfer_input_files`. +- **> 1 GB read-only data shared across many jobs**: consider SQUID. +- **Very large data (> 10 GB per file)**: contact the CHTC facilitation team for guidance. You may need to split the data or use special transfer mechanisms. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## Checking your disk usage + +Before adding new files, check how much space you are using.
Log in to your CHTC submit server and run: + +```bash +# Check home directory usage +du -sh /home/<username> +``` + +```bash +# Check quota (if your system supports it) +quota -s +``` + +```bash +# Check staging usage (if you have a staging directory) +du -sh /staging/<username> +``` + +To find the largest files and directories: + +```bash +# Top 10 largest items in your home directory +du -h /home/<username> | sort -rh | head -10 +``` + +::::::::::::::::::::::::::::::::::::: callout + +#### Keep an eye on your usage + +Run `du -sh` regularly, especially after jobs complete. Output files can accumulate quickly if you run many jobs. Remove or move results you no longer need on the submit server. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## How HTCondor file transfer works + +HTCondor jobs run on **execute machines** that are separate from the submit server. Your job does not have direct access to `/home` or `/staging` on the submit server while it runs. Instead, HTCondor copies files back and forth using its **file transfer mechanism**. + +### Sending files to a job: `transfer_input_files` + +In your submit file, list the files your job needs: + +``` +executable = train_model.sh +arguments = titanic_train.csv + +transfer_input_files = /staging/<username>/titanic_train.csv, train_model.py + +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +log = job_$(Cluster).log +output = job_$(Cluster).out +error = job_$(Cluster).err + +request_cpus = 1 +request_memory = 2GB +request_disk = 1GB + +queue +``` + +Key points: + +- `transfer_input_files` accepts a comma-separated list of file paths (absolute or relative to the submit directory) and URLs. +- All listed files are placed in the job's **top-level working directory** on the execute machine, regardless of their original directory structure. +- The `executable` script is transferred automatically — you do not need to list it again.
+ +### Getting files back: `transfer_output_files` + +By default, HTCondor transfers **all new and modified files** from the job's working directory back to the submit directory when the job finishes. If you only want specific outputs returned, use: + +``` +transfer_output_files = model_output.pkl, metrics.csv +``` + +This is good practice — it avoids transferring temporary files or large intermediate results you do not need. + +::::::::::::::::::::::::::::::::::::: callout + +#### Avoid transferring unnecessary files + +If your job creates large temporary files (e.g., extracted archives, intermediate checkpoints), either delete them in your script before the job exits or use `transfer_output_files` to specify only what you need. Transferring unnecessary data wastes network bandwidth and fills up your submit directory. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## Preparing the Titanic dataset for HTCondor jobs + +The Titanic dataset is small (under 100 KB total for both CSVs), so it fits comfortably in `/home`. For this workshop, we will keep it simple and transfer the files directly from the submit directory. + +### 1. Download the data + +On the CHTC submit server: + +```bash +# Navigate to your home directory +cd /home/<username> + +# Create a working directory for this workshop +mkdir -p ml-workshop && cd ml-workshop + +# Download the dataset +wget https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/data/data.zip + +# Unzip +unzip data.zip +``` + +Verify the files are present: + +```bash +ls -lh *.csv +``` + +You should see `titanic_train.csv` and `titanic_test.csv`, each well under 1 MB. + +### 2.
Reference the data in a submit file + +Create a minimal submit file to verify that file transfer works: + +``` +# test_transfer.sub +executable = test_transfer.sh + +transfer_input_files = titanic_train.csv, titanic_test.csv + +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +log = test_$(Cluster).log +output = test_$(Cluster).out +error = test_$(Cluster).err + +request_cpus = 1 +request_memory = 512MB +request_disk = 100MB + +queue +``` + +Create the corresponding script: + +```bash +#!/bin/bash +# test_transfer.sh — verify that input files arrived +echo "=== Files in working directory ===" +ls -lh + +echo "" +echo "=== First 5 lines of titanic_train.csv ===" +head -5 titanic_train.csv + +echo "" +echo "=== Row counts ===" +echo "Train rows: $(wc -l < titanic_train.csv)" +echo "Test rows: $(wc -l < titanic_test.csv)" +``` + +Make the script executable and submit: + +```bash +chmod +x test_transfer.sh +condor_submit test_transfer.sub +``` + +Check the status of your job: + +```bash +condor_q +``` + +Once the job completes, inspect the output: + +```bash +cat test_*.out +``` + +You should see the file listing, the first few rows of the training data, and the row counts, confirming that HTCondor successfully transferred both CSV files into the job. + +:::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 1: Transfer files from `/staging` + +Suppose you have a larger dataset (e.g., 500 MB of image files packaged as `images.tar.gz`) stored in `/staging/<username>/`. Write a submit file snippet that transfers this archive into a job and extracts it. What would the executable script look like?
+ +:::::::::::::::: solution + +**Submit file snippet:** + +``` +transfer_input_files = /staging/<username>/images.tar.gz, process_images.py +``` + +**Executable script (`process_images.sh`):** + +```bash +#!/bin/bash +# Extract the archive +tar -xzf images.tar.gz + +# Run processing +python3 process_images.py + +# Clean up extracted files before job exits to avoid transferring them back +rm -rf images/ +rm images.tar.gz +``` + +Key points: +- The archive is referenced with its full path in `/staging`. +- The script removes extracted files before exiting so that only genuine output files are transferred back. +- If you only need specific outputs, also add `transfer_output_files = results.csv` to the submit file. + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 2: Check your disk usage + +Log in to the CHTC submit server and determine: + +1. How much space is your home directory currently using? +2. What are the three largest files or directories in your home directory? +3. If you have a `/staging` directory, how much space is it using? + +:::::::::::::::: solution + +```bash +# 1. Total home usage +du -sh /home/<username> + +# 2. Three largest items (the first line of output is the directory's own total) +du -h /home/<username> --max-depth=1 | sort -rh | head -4 + +# 3. Staging usage (if applicable) +du -sh /staging/<username> +``` + +If `du` takes a long time on your home directory, that itself may be a sign you have many files and should consider cleanup.
+ +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## Best practices for data management on CHTC + +### What goes where + +| File type | Recommended location | Example | +|-----------|---------------------|---------| +| Submit files, scripts | `/home` | `train_model.sub`, `train_model.sh` | +| Small datasets (< 100 MB total) | `/home` (submit directory) | `titanic_train.csv` | +| Medium datasets (100 MB – few GB) | `/staging` | `image_dataset.tar.gz` | +| Large read-only reference data | SQUID | `pretrained_bert_model.tar.gz` | +| Job output logs | `/home` (automatically returned) | `job_12345.out` | +| Large output files | `/staging` (move after job completes) | `trained_model_weights.h5` | + +### File size guidelines for `transfer_input_files` + +- Individual files should ideally be **under a few GB**. Very large single files slow down transfers and can cause jobs to be held. +- If you have many small files (thousands of images, for example), **tar and compress** them into a single archive before transferring. HTCondor handles one large file much more efficiently than thousands of small ones. + +```bash +# Package many small files into a single archive +tar -czf training_images.tar.gz images/ + +# In your job script, extract them +tar -xzf training_images.tar.gz +``` + +### Clean up after yourself + +- Remove completed job logs you no longer need from `/home`. +- Delete intermediate data in `/staging` once you have your final results. +- Do not leave large files in SQUID indefinitely — remove them when your analysis campaign is complete. + +::::::::::::::::::::::::::::::::::::: callout + +#### CHTC data policies + +- **No backups**: CHTC storage (including `/home`, `/staging`, and SQUID) is **not backed up**. Keep copies of irreplaceable data elsewhere (e.g., your department server, institutional storage, or a version control system for code). 
+- **Quotas are enforced**: Exceeding your quota can prevent you from submitting new jobs or receiving output files. Monitor your usage regularly. +- **Data retention**: Inactive data may be flagged for removal. If CHTC staff contact you about storage usage, respond promptly. +- **No guaranteed performance**: These are shared filesystems. Large transfers during peak times affect other users. Schedule bulk transfers during off-peak hours when possible. + +For the latest policies, see the [CHTC data storage guide](https://chtc.cs.wisc.edu/uw-research-computing/file-avail-largedata). + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## Sensitive data considerations + +CHTC general-purpose systems (`ap2002.chtc.wisc.edu`, etc.) are **not** designed for regulated or sensitive data. Specifically: + +- Execute machines are shared among many users — your job files are not encrypted at rest on the execute node. +- Data in `/home`, `/staging`, and SQUID may be readable by system administrators. +- Network transfers within HTCondor are not encrypted by default. + +If your research involves sensitive data (PHI, student records, proprietary datasets), contact the [CHTC facilitation team](https://chtc.cs.wisc.edu/uw-research-computing/get-help) **before** uploading any data. They can advise on options such as dedicated secure submit nodes or restricted pools. 
+ +## Comparison with cloud storage + +If you have used cloud platforms before, the table below highlights the key differences: + +| Feature | GCS (Google Cloud) | CHTC Storage | +|---------|-------------------|--------------| +| Cost | ~$0.02/GB/month + egress fees | Free (quota-based) | +| Access method | `gs://` URIs, APIs | File paths, HTCondor transfer | +| Scalability | Virtually unlimited (pay-as-you-go) | Limited by quotas (request increases) | +| Data durability | Highly redundant, 99.999999999% | No backups — user responsibility | +| Access control | IAM roles and policies | Unix file permissions | +| Sharing with jobs | SDK reads from bucket | `transfer_input_files` copies to job | +| Sensitive data | Configurable (VPC-SC, CMEK) | Not certified for regulated data (general systems) | + +The biggest practical difference: on CHTC, your data must be **explicitly transferred** to each job. There is no shared filesystem that jobs can read from directly (unlike a GCS bucket that any authorized VM can access). This means you need to plan your data flow carefully — but it also means you never get a surprise bill. + +:::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 3: Plan storage for a real project + +You are starting a new image classification project with the following data: + +- **Training images**: 15,000 JPEG files totaling 2.3 GB +- **Validation images**: 3,000 JPEG files totaling 450 MB +- **Pre-trained model weights**: 800 MB (read-only, used by all jobs) +- **Python training script**: 12 KB +- **Output**: each job produces a ~50 MB model file + +Where would you store each component, and how would you structure your `transfer_input_files`? + +:::::::::::::::: solution + +1. 
**Training and validation images**: Package each set into a tar archive and store in `/staging`: + ```bash + tar -czf train_images.tar.gz train_images/ + tar -czf val_images.tar.gz val_images/ + # Move to staging + mv train_images.tar.gz /staging/<username>/ + mv val_images.tar.gz /staging/<username>/ + ``` + +2. **Pre-trained model weights**: Since these are read-only and potentially shared across many jobs, SQUID is a good choice: + ```bash + cp pretrained_weights.h5 /squid/<username>/ + ``` + +3. **Python script**: Small, stays in `/home` in your submit directory. + +4. **Submit file:** + ``` + transfer_input_files = /staging/<username>/train_images.tar.gz, \ + /staging/<username>/val_images.tar.gz, \ + http://proxy.chtc.wisc.edu/SQUID/<username>/pretrained_weights.h5, \ + train_model.py + + transfer_output_files = trained_model.pkl + ``` + +5. **Output**: The `transfer_output_files` directive ensures only the 50 MB model file is returned, not the extracted images or temporary files. + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: keypoints + +- CHTC provides three storage tiers: `/home` (small files, code), `/staging` (larger datasets), and SQUID (large read-only data via HTTP). +- HTCondor copies files to and from jobs — use `transfer_input_files` and `transfer_output_files` in your submit file to control what is transferred. +- Storage on CHTC is free but quota-limited. Monitor your usage with `du -sh` and clean up after jobs complete. +- Package many small files into tar archives before transferring to improve efficiency. +- CHTC general-purpose systems are not certified for sensitive or regulated data — consult the facilitation team if your data has restrictions. +- Unlike cloud storage, there are no surprise bills — but there are also no backups, so keep copies of irreplaceable data elsewhere.
+ +:::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/03-Data-storage-and-access.md b/episodes/03-Data-storage-and-access.md deleted file mode 100644 index f80e118f..00000000 --- a/episodes/03-Data-storage-and-access.md +++ /dev/null @@ -1,430 +0,0 @@ ---- -title: "Data Storage and Access" -teaching: 35 -exercises: 15 ---- - -:::::::::::::::::::::::::::::::::::::: questions - -- How can I store and manage data effectively in GCP for Vertex AI workflows? -- What are the advantages of Google Cloud Storage (GCS) compared to local or VM storage for machine learning projects? -- How can I load data from GCS into a Vertex AI Workbench notebook? - -:::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: objectives - -- Explain data storage options in GCP for machine learning projects. -- Set up a GCS bucket and upload data. -- Read data directly from a GCS bucket into memory in a Vertex AI notebook. -- Monitor storage usage and estimate costs. -- Upload new files from the Vertex AI environment back to the GCS bucket. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -ML/AI projects rely on data, making efficient storage and management essential. Google Cloud offers several storage options, but the most common for ML/AI workflows are **Virtual Machine (VM) disks** and **Google Cloud Storage (GCS) buckets**. - -> #### Consult your institution's IT before handling sensitive data in GCP -> As with AWS, **do not upload restricted or sensitive data to GCP services unless explicitly approved by your institution's IT or cloud security team**. For regulated datasets (HIPAA, FERPA, proprietary), work with your institution to ensure encryption, restricted access, and compliance with policies. - -## Options for storage: VM Disks or GCS - -### What is a VM disk? -A VM disk is the storage volume attached to a Compute Engine VM or a Vertex AI Workbench notebook. 
It can store datasets and intermediate results, but it is tied to the lifecycle of the VM. - -#### When to store data directly on a VM disk - -- Useful for small, temporary datasets processed interactively. -- Data persists if the VM is stopped, but storage costs continue as long as the disk exists. -- Not ideal for collaboration, scaling, or long-term dataset storage. - -::::::::::::::::::::::::::::::::::::: callout - -#### Limitations of VM disk storage - -- **Scalability**: Limited by disk size quota. -- **Sharing**: Harder to share across projects or team members. -- **Cost**: More expensive per GB compared to GCS for long-term storage. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -### What is a GCS bucket? -For most ML/AI workflows in GCP, **Google Cloud Storage (GCS) buckets** are recommended. A GCS bucket is a container in Google's object storage service where you can store an essentially unlimited number of files. Data in GCS can be accessed from Vertex AI training jobs, Workbench notebooks, and other GCP services using a *GCS URI* (e.g., `gs://your-bucket-name/your-file.csv`). Think of GCS URIs as cloud file paths — you'll use them throughout the workshop to reference data in training scripts, notebooks, and SDK calls. - -::::::::::::::::::::::::::::::::::::: callout - -#### Benefits of using GCS (recommended for ML/AI workflows) - -- **Separation of storage and compute**: Data remains available even if VMs or notebooks are deleted. -- **Easy sharing**: Buckets can be accessed by collaborators with the right IAM roles. -- **Integration with Vertex AI and BigQuery**: Read and write data directly using other GCP tools. -- **Scalability**: Handles datasets of any size without disk limits. -- **Cost efficiency**: Lower cost than persistent disks (VM storage) for long-term storage. -- **Data persistence**: Durable and highly available across regions. 
-- **Filesystem mounting**: GCS buckets can be mounted as local directories using [Cloud Storage FUSE](https://cloud.google.com/storage/docs/cloud-storage-fuse/overview), making them accessible like regular filesystems for tools that expect local file paths. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -## Creating a GCS bucket - -### 1. Sign in to Google Cloud Console - -- Go to [console.cloud.google.com](https://console.cloud.google.com/) and log in with your credentials. -- Select your project from the project dropdown at the top of the page. If you're using the shared workshop project, the instructor will provide the project name. - -### 2. Navigate to Cloud Storage - -- In the search bar, type **Storage**. -- Click **Cloud Storage > Buckets**. - -### 3. Create a new bucket - -- Click **Create bucket** and configure the following settings: - -- **Bucket name**: Enter a globally unique name using the convention `lastname-dataname` (e.g., `doe-titanic`). -- **Labels**: Add cost-tracking labels (same keys you used for the Workbench Instance in [Episode 2](02-Notebooks-as-controllers.md), plus a `dataset` tag): - - `name = firstname-lastname` - - `purpose = workshop` - - `dataset = titanic` - - In shared accounts, labels are *mandatory*. -- **Location**: Choose **Region** → `us-central1` (same region as your compute to avoid egress charges). -- **Storage class**: **Standard** (best for active ML/AI workflows). -- **Access control**: **Uniform** (simpler IAM-based permissions). -- **Protection**: Leave default **soft delete** enabled; skip versioning and retention policies. - -Click **Create** if everything looks good. - -### 4. Upload files to the bucket - -- If you haven't yet, download the data for this workshop (Right-click → Save as): - [data.zip](https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/data/data.zip) - - Extract the zip folder contents (Right-click → Extract all on Windows; double-click on macOS). 
- - The zip contains the **Titanic dataset** — passenger information (age, class, fare, etc.) with a survival label. This is a classic binary classification task we'll use for training in later episodes. -- In the bucket dashboard, click **Upload Files**. -- Select your Titanic CSVs (`titanic_train.csv` and `titanic_test.csv`) and upload. - -**Note the GCS URI for your data** After uploading, click on a file and find its **gs:// URI** (e.g., `gs://doe-titanic/titanic_test.csv`). This URI will be used to access the data in your notebook. - -## Adjust bucket permissions - -Your bucket exists, but your notebooks and training jobs don't automatically have permission to use it. GCP follows the **principle of least privilege** — services only get the access you explicitly grant. In this section we'll find the service account that Vertex AI uses and give it the right roles on your bucket. - -#### Check your project ID - -First, confirm which project your notebook is connected to. Run this cell in your Workbench notebook: - -```python -from google.cloud import storage -client = storage.Client() -print(client.project) -``` - -Copy the output — you'll paste it into Cloud Shell commands below. - -::::::::::::::::::::::::::::::::::::: callout - -#### These commands run in Cloud Shell, not in a notebook - -Open [**Cloud Shell**](https://shell.cloud.google.com/?show=terminal) — a browser-based terminal built into the Google Cloud Console (click the **>\_** icon in the top-right toolbar). Copy the commands below and paste them into that terminal. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -#### Set your project - -If Cloud Shell doesn't already know your project, set it first: - -```sh -gcloud config set project YOUR_PROJECT_ID -``` - -Replace `YOUR_PROJECT_ID` with the project ID you copied above. For the shared MLM25 workshop the project ID is **`doit-rci-mlm25-4626`**. 
- -#### Find your service account - -When you create a GCP project, Google automatically provisions a **Compute Engine default service account**. This is the identity that Vertex AI Workbench notebooks and training jobs use when they call other GCP services (like Cloud Storage). By default this account may not have access to your bucket, so we need to grant it the right IAM roles explicitly. - -First, look up the service account email: - -```sh -gcloud iam service-accounts list --filter="displayName:Compute Engine default service account" --format="value(email)" -``` - -This will return an email like `123456789-compute@developer.gserviceaccount.com`. Copy it — you'll paste it into the commands below. - - - -#### Grant permissions - -Now we give that service account the ability to read from and write to your bucket. Without these roles, your notebooks would get "Access Denied" errors when trying to load training data or save model artifacts. - -Replace `YOUR_BUCKET_NAME` and `YOUR_SERVICE_ACCOUNT`, then run: - -```sh -# objectViewer — lets notebooks READ data (e.g., load CSVs for training) -gcloud storage buckets add-iam-policy-binding gs://YOUR_BUCKET_NAME \ - --member="serviceAccount:YOUR_SERVICE_ACCOUNT" \ - --role="roles/storage.objectViewer" - -# objectCreator — lets training jobs WRITE outputs (e.g., saved models, logs) -gcloud storage buckets add-iam-policy-binding gs://YOUR_BUCKET_NAME \ - --member="serviceAccount:YOUR_SERVICE_ACCOUNT" \ - --role="roles/storage.objectCreator" - -# objectAdmin — adds OVERWRITE and DELETE (only needed if you want to -# re-run jobs that replace existing files or clean up old artifacts) -gcloud storage buckets add-iam-policy-binding gs://YOUR_BUCKET_NAME \ - --member="serviceAccount:YOUR_SERVICE_ACCOUNT" \ - --role="roles/storage.objectAdmin" -``` - - - -::::::::::::::::::::::::::::::::::::: callout - -#### `gcloud storage` vs. `gsutil` -Older tutorials often reference `gsutil` for Cloud Storage operations. 
Google now recommends `gcloud storage` as the primary CLI. Both work, but `gcloud storage` is actively maintained and consistent with the rest of the `gcloud` CLI. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -## Data transfer & storage costs - -GCS costs are based on three things: **storage class** (how you store data), **data transfer** (moving data in or out of GCP), and **operations** (API requests). Operations are the individual actions your code performs against Cloud Storage — every time a notebook reads a file or a training job writes a model, that's an API request. - -- **Standard storage**: ~ `$0.02` per GB per month in `us-central1`. -- **Uploading data (ingress)**: Free. -- **Downloading data out of GCP (egress)**: ~ `$0.12` per GB. -- **Cross-region access**: ~ `$0.01`–`$0.02` per GB within North America. -- **GET requests** (reading/downloading objects): ~ `$0.004` per 10,000 requests. -- **PUT/POST requests** (creating/uploading objects): ~ `$0.05` per 10,000 requests. -- **Deleting data**: Free (but Nearline/Coldline/Archive early-deletion fees apply). - -***For detailed pricing, see [GCS Pricing Information](https://cloud.google.com/storage/pricing).*** - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 1: Estimating Storage Costs - -**1. Estimate the total cost of storing 1 GB in GCS Standard storage (us-central1) for one month assuming:** -- Dataset read from the bucket 100 times within GCP (e.g., each training or tuning run fetches the data via a GET request — this stays inside Google's network, so no egress charge) -- Data is downloaded once out of GCP to your laptop at the end of the project (this *does* incur an egress charge) - -**2. Repeat the above calculation for datasets of 10 GB, 100 GB, and 1 TB (1024 GB).** - -**Hints**: Storage `$0.02`/GB/month, Egress `$0.12`/GB, GET requests negligible at this scale. - -:::::::::::::::: solution - -1. **1 GB**: Storage `$0.02` + Egress `$0.12` = **`$0.14`** -2. 
**10 GB**: `$0.20` + `$1.20` = **`$1.40`** -3. **100 GB**: `$2.00` + `$12.00` = **`$14.00`** -4. **1 TB**: `$20.48` + `$122.88` = **`$143.36`** - -::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::: - - -## Accessing data from your notebook - -Now that our bucket is set up, let's use it from the Workbench notebook you created in the previous episode. - -If you haven't already cloned the repository, open JupyterLab from your Workbench Instance and run `!git clone https://github.com/qualiaMachine/Intro_GCP_for_ML.git`. Then navigate to `/Intro_GCP_for_ML/notebooks/03-Data-storage-and-access.ipynb`. - -### Set up GCP environment -If you haven't already, initialize the storage client (same code from the permissions section earlier). The `storage.Client()` call creates a connection using the credentials already attached to your Workbench VM. - -```python -from google.cloud import storage -client = storage.Client() -print(client.project) -``` - -### Reading data directly into memory - -The code below downloads a CSV from your bucket and loads it into a pandas DataFrame. The `blob.download_as_bytes()` call pulls the file contents as raw bytes, and `io.BytesIO` wraps those bytes in a file-like object that `pd.read_csv` can read — no temporary file on disk needed. - -```python -import pandas as pd -import io - -bucket_name = "doe-titanic" # ADJUST to your bucket's name - -bucket = client.bucket(bucket_name) -blob = bucket.blob("titanic_train.csv") -train_data = pd.read_csv(io.BytesIO(blob.download_as_bytes())) -print(train_data.shape) -train_data.head() -``` - -The Titanic dataset contains passenger information (age, class, fare, etc.) and a binary survival label — we'll train a classifier on this data in Episode 4. 
- -```python -train_data.info() -train_data.describe() -``` - -::::::::::::::::::::::::::::::::::::: callout - -### Alternative: reading directly with pandas - -Vertex AI Workbench comes with `gcsfs` pre-installed, which lets pandas read GCS URIs directly — no `BytesIO` conversion needed: - -```python -train_data = pd.read_csv("gs://doe-titanic/titanic_train.csv") # ADJUST bucket name -``` - -This is convenient for quick exploration. We use the `storage.Client` approach above because it gives you more control (listing blobs, checking sizes, uploading), which you'll need in the sections that follow. - -::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: callout - -### Common errors - -- **`Forbidden (403)`** — Your service account lacks permission. Revisit the **Adjust bucket permissions** section above. -- **`NotFound (404)`** — The bucket name or file path is wrong. Double-check `bucket_name` and the blob path with `client.list_blobs(bucket_name)`. -- **`DefaultCredentialsError`** — The notebook cannot find credentials. Make sure you are running on a Vertex AI Workbench Instance (not a local machine). - -::::::::::::::::::::::::::::::::::::: - -## Monitoring storage usage and costs - -It's good practice to periodically check how much storage your bucket is using. The code below sums up all object sizes. 
- -```python -total_size_bytes = 0 -bucket = client.bucket(bucket_name) - -for blob in client.list_blobs(bucket_name): - total_size_bytes += blob.size - -total_size_mb = total_size_bytes / (1024**2) -print(f"Total size of bucket '{bucket_name}': {total_size_mb:.2f} MB") -``` - -```python -storage_price_per_gb = 0.02 # $/GB/month for Standard storage -egress_price_per_gb = 0.12 # $/GB for internet egress (same-region transfers are free) -total_size_gb = total_size_bytes / (1024**3) - -monthly_storage = total_size_gb * storage_price_per_gb -egress_cost = total_size_gb * egress_price_per_gb - -print(f"Bucket size: {total_size_gb:.4f} GB") -print(f"Estimated monthly storage cost: ${monthly_storage:.4f}") -print(f"Estimated annual storage cost: ${monthly_storage*12:.4f}") -print(f"One-time full download (egress) cost: ${egress_cost:.4f}") -``` - -## Writing output files to GCS - -```python -# Create a sample file locally on the notebook VM -file_path = "/home/jupyter/Notes.txt" -with open(file_path, "w") as f: - f.write("This is a test note for GCS.") -``` - -```python -bucket = client.bucket(bucket_name) -blob = bucket.blob("docs/Notes.txt") -blob.upload_from_filename(file_path) -print("File uploaded successfully.") -``` - -List bucket contents: - -```python -for blob in client.list_blobs(bucket_name): - print(blob.name) -``` - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 2: Read and explore the test dataset - -Read `titanic_test.csv` from your GCS bucket and display its shape. How does the test set compare to the training set in size and columns? 
- -:::::::::::::::: solution - -```python -blob = client.bucket(bucket_name).blob("titanic_test.csv") -test_data = pd.read_csv(io.BytesIO(blob.download_as_bytes())) -print("Test shape:", test_data.shape) -print("Train shape:", train_data.shape) -print("Same columns?", list(test_data.columns) == list(train_data.columns)) -test_data.head() -``` - -Both datasets share the same 12 columns (including `Survived`). The test set is a smaller held-out subset (179 rows vs 712 in training) — roughly an 80/20 split used for final evaluation after the model is trained. - -::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 3: Upload a summary CSV to GCS - -Using `train_data`, compute the survival rate by passenger class (`Pclass`) and upload the result as `results/survival_by_class.csv` to your bucket. - -:::::::::::::::: solution - -```python -summary = train_data.groupby("Pclass")["Survived"].mean().reset_index() -summary.columns = ["Pclass", "SurvivalRate"] -print(summary) - -# Save locally then upload -summary.to_csv("/home/jupyter/survival_by_class.csv", index=False) -blob = client.bucket(bucket_name).blob("results/survival_by_class.csv") -blob.upload_from_filename("/home/jupyter/survival_by_class.csv") -print("Summary uploaded to GCS.") -``` - -::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::: - - -## Removing unused data (complete *after* the workshop) - -After you are done using your data, remove unused files/buckets to stop costs. - -You can delete files programmatically. 
Let's clean up the notes file we uploaded earlier: - -```python -blob = client.bucket(bucket_name).blob("docs/Notes.txt") -blob.delete() -print("docs/Notes.txt deleted.") -``` - -Verify it's gone: - -```python -for blob in client.list_blobs(bucket_name): - print(blob.name) -``` - -For larger clean-up tasks, use the [Cloud Console](https://console.cloud.google.com/storage/browser): - -- **Delete files only** – In your bucket, select the files you want to remove and click **Delete**. -- **Delete the bucket entirely** – In **Cloud Storage > Buckets**, select your bucket and click **Delete**. - -For a detailed walkthrough of cleaning up all workshop resources, see [Episode 9: Resource Management and Cleanup](09-Resource-management-cleanup.md). - -::::::::::::::::::::::::::::::::::::: keypoints - -- Use GCS for scalable, cost-effective, and persistent storage in GCP. -- Persistent disks are suitable only for small, temporary datasets. -- Load data from GCS into memory with `storage.Client` or directly via `pd.read_csv("gs://...")`. -- Periodically check storage usage and estimate costs to manage your GCS budget. -- Track your storage, transfer, and request costs to manage expenses. -- Regularly delete unused data or buckets to avoid ongoing costs. - -:::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/04-Training-models-in-VertexAI.md b/episodes/04-Training-models-in-VertexAI.md deleted file mode 100644 index d22995c4..00000000 --- a/episodes/04-Training-models-in-VertexAI.md +++ /dev/null @@ -1,524 +0,0 @@ ---- -title: "Training Models in Vertex AI: Intro" -teaching: 25 -exercises: 15 ---- - -:::::::::::::::::::::::::::::::::::::: questions - -- What are the differences between training locally in a Vertex AI notebook and using Vertex AI-managed training jobs? -- How do custom training jobs in Vertex AI streamline the training process for various frameworks? -- How does Vertex AI handle scaling across CPUs, GPUs, and TPUs? 
- -:::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: objectives - -- Understand the difference between local training in a Vertex AI Workbench notebook and submitting managed training jobs. -- Learn to configure and use Vertex AI custom training jobs for different frameworks (e.g., XGBoost, PyTorch, SKLearn). -- Understand scaling options in Vertex AI, including when to use CPUs, GPUs, or TPUs. -- Compare performance, cost, and setup between custom scripts and pre-built containers in Vertex AI. -- Conduct training with data stored in GCS and monitor training job status using the Google Cloud Console. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: callout - -### Cost awareness: training jobs - -Training jobs bill per VM-hour while the job is running. An `n1-standard-4` (CPU) costs ~ `$0.19`/hr; adding a T4 GPU brings the total to ~ `$0.54`/hr. Jobs automatically stop (and stop billing) when the script finishes. For a complete cost reference, see the [Compute for ML](../compute-for-ML.html) page and the cost table in [Episode 9](09-Resource-management-cleanup.md). - -:::::::::::::::::::::::::::::::::::::::::::::::: - -Here's the architecture we introduced in [Episode 2](02-Notebooks-as-controllers.md) — your lightweight notebook orchestrates training jobs that run on separate, more powerful VMs, with all artifacts stored in GCS: - -![Training and tuning workflow](https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/images/diagram1_training_and_tuning.svg){alt="Architecture diagram showing how a lightweight Workbench notebook uses the Vertex AI SDK to launch training jobs and HP tuning jobs on powerful GPUs, with all artifacts stored in GCS."} - -## Initial setup - -#### 1. Open pre-filled notebook -Navigate to `/Intro_GCP_for_ML/notebooks/04-Training-models-in-VertexAI.ipynb` to begin this notebook. - -#### 2. 
CD to instance home directory -To ensure we're all in the same starting spot, change directory to your Jupyter home directory. - -```python -%cd /home/jupyter/ -``` - -#### 3. Set environment variables -This code initializes the Vertex AI environment by importing the Python SDK, setting the project, region, and defining a GCS bucket for input/output data. - -- `PROJECT_ID`: Identifies your GCP project. -- `REGION`: Determines where training jobs run (choose a region close to your data). - -```python -from google.cloud import storage -client = storage.Client() -PROJECT_ID = client.project -REGION = "us-central1" -BUCKET_NAME = "doe-titanic" # ADJUST to your bucket's name -LAST_NAME = "DOE" # ADJUST to your last name or name -print(f"project = {PROJECT_ID}\nregion = {REGION}\nbucket = {BUCKET_NAME}") -``` - -::::::::::::::::::::::::::::::::::::: callout - -### How does `storage.Client()` know your project? - -When you call `storage.Client()` without arguments, the library automatically discovers your credentials and project ID. This works because Vertex AI Workbench VMs run on Google Compute Engine, which provides a **metadata server** at a known internal address. The client library queries this server to retrieve the project ID and a service-account token — no keys or config files needed. If you ran the same code on your laptop, you would need to authenticate first with `gcloud auth application-default login` (see [Episode 8](08-CLI-workflows.md) for details). - -::::::::::::::::::::::::::::::::::::: - -## Testing train_xgboost.py locally in the notebook - -Before submitting a managed training job to Vertex AI, let's first examine and test the training script on our notebook VM. This ensures the code runs without errors before we spend money on cloud compute. 
- -::::::::::::::::::::::::::::::::::::: callout - -### One script, two environments - -A key design goal of `train_xgboost.py` is that the **same script runs unchanged** on your laptop, inside a Workbench notebook, and as a Vertex AI managed training job. Two patterns make this possible: - -1. **GCS-aware I/O helpers** (`read_csv_any`, `save_model_any`): These functions check whether a path starts with `gs://`. If it does, they use the `google-cloud-storage` client to read or write. If not, they use plain local file I/O. This means you can pass `--train ./titanic_train.csv` for a local test and `--train=gs://my-bucket/titanic_train.csv` for a cloud job without changing any code. - -2. **`AIP_MODEL_DIR` environment variable**: When Vertex AI runs a CustomTrainingJob with `base_output_dir` set, it injects `AIP_MODEL_DIR` (a `gs://` path) into the container. The script reads this variable to decide where to save the model. Locally, the variable is unset, so it falls back to the current directory (`.`). - -This "write once, run anywhere" approach means you can **debug locally first** (fast, free) and then submit the exact same script to Vertex AI (scalable, managed) with confidence. - -::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::::: challenge - -### Understanding the XGBoost Training Script - -Take a moment to review `Intro_GCP_for_ML/scripts/train_xgboost.py`. This is a standard XGBoost training script — it handles preprocessing, training, and saving a model. What makes it cloud-ready is that it also supports GCS (`gs://`) paths and adapts to Vertex AI conventions (e.g., `AIP_MODEL_DIR`), so the same script runs locally or as a managed training job without changes. - -Try answering the following questions: - -1. **Data preprocessing**: What transformations are applied to the dataset before training? -2. **Training function**: What does the `train_model()` function do? Why print the training time? -3. 
**Command-line arguments**: What is the purpose of `argparse` in this script? How would you change the number of training rounds? -4. **Handling local vs. GCP runs**: How does the script let you run the same code locally, in Workbench, or as a Vertex AI job? Which environment variable controls where the model artifact is written? -5. **Training and saving the model**: What format is the dataset converted to before training, and why? How does the script save to a local path vs. a `gs://` destination? - -After reviewing, discuss any questions or observations with your group. - -::::::::::::::::::::::::::::::::::::::: solution - -### Solution - -1. **Data preprocessing**: The script fills missing values (`Age` with median, `Embarked` with mode), maps categorical fields to numeric (`Sex` → {male:1, female:0}, `Embarked` → {S:0, C:1, Q:2}), and drops non-predictive columns (`PassengerId`, `Name`, `Ticket`, `Cabin`). -2. **Training function**: `train_model()` constructs and fits an XGBoost model with the provided parameters and prints wall-clock training time. Timing helps compare runs and make sensible scaling choices. -3. **Command-line arguments**: `argparse` lets you set hyperparameters and file paths without editing code (e.g., `--max_depth`, `--eta`, `--num_round`, `--train`). To change rounds: `python train_xgboost.py --num_round 200` -4. **Handling local vs. GCP runs**: - - **Input**: You pass `--train` as either a local path (`train.csv`) or a GCS URI (`gs://bucket/path.csv`). The script automatically detects `gs://` and reads the file directly from Cloud Storage using the Python client. - - **Output**: If the environment variable `AIP_MODEL_DIR` is set (as it is in Vertex AI CustomJobs), the trained model is written there—often a `gs://` path. Otherwise, the model is saved in the current working directory, which works seamlessly in both local and Workbench environments. -5. 
**Training and saving the model**: - The training data is converted into an **XGBoost `DMatrix`**, an optimized format that speeds up training and reduces memory use. The trained model is serialized with `joblib`. When saving locally, the file is written directly to disk. If saving to a Cloud Storage path (`gs://...`), the model is first saved to a temporary file and then uploaded to the specified bucket. - -::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::::::::::::::: - - -Before scaling training jobs onto managed resources, it's essential to test your training script locally. This prevents wasting GPU/TPU time on bugs or misconfigured code. Skipping these checks can lead to silent data bugs, runtime blowups at scale, inefficient experiments, or broken model artifacts. - -### Sanity checks before scaling - -- **Reproducibility** – Do you get the same result each time? If not, set seeds controlling randomness. -- **Data loads correctly** – Dataset loads without errors, expected columns exist, missing values handled. -- **Overfitting check** – Train on a tiny dataset (e.g., 100 rows). If it doesn't overfit, something is off. -- **Loss behavior** – Verify training loss decreases and doesn't diverge. -- **Runtime estimate** – Get a rough sense of training time on small data before committing to large compute. -- **Memory estimate** – Check approximate memory use to choose the right machine type. -- **Save & reload** – Ensure model saves, reloads, and infers without errors. - - -## Download data into notebook environment -Sometimes it's helpful to keep a copy of data in your notebook VM for quick iteration, even though **GCS is the preferred storage location**. For example, downloading locally lets you test your training script without any GCS dependencies, making debugging faster. Once you've verified everything works, the actual Vertex AI job will read directly from GCS. 
- -```python -bucket = client.bucket(BUCKET_NAME) - -blob = bucket.blob("titanic_train.csv") -blob.download_to_filename("/home/jupyter/titanic_train.csv") - -print("Downloaded titanic_train.csv") -``` - -## Local test run of train_xgboost.py - -Running a quick test on the Workbench notebook VM is cheap — it's a lightweight machine that costs only ~$0.19/hr. The real cost comes later when you launch managed training jobs with larger machines or GPUs. Think of your notebook as a low-cost controller: use it to catch bugs and verify logic before spending on cloud compute. - -As you gain confidence, you can skip the notebook VM entirely and run these tests on your own laptop or lab machine — then submit jobs to Vertex AI via the `gcloud` CLI or Python SDK from anywhere (see [Episode 8](08-CLI-workflows.md)). That eliminates the VM cost altogether. - -- For large datasets, use a small representative sample of the total dataset when testing locally (i.e., just to verify that code is working and model overfits nearly perfectly after training enough epochs) -- For larger models, use smaller model equivalents (e.g., 100M vs 7B params) when testing locally - -```python -# Pin the same XGBoost version used by the Vertex AI prebuilt container -# (xgboost-cpu.2-1) so local and cloud results are identical. 
-!pip install xgboost==2.1.0 -``` - -```python -# Training configuration parameters for XGBoost -MAX_DEPTH = 3 # maximum depth of each decision tree (controls model complexity) -ETA = 0.1 # learning rate (how much each tree contributes to the overall model) -SUBSAMPLE = 0.8 # fraction of training samples used per boosting round (prevents overfitting) -COLSAMPLE = 0.8 # fraction of features (columns) sampled per tree (adds randomness and diversity) -NUM_ROUND = 100 # number of boosting iterations (trees) to train - -import time as t -start = t.time() - -# Run the custom training script with hyperparameters defined above -!python Intro_GCP_for_ML/scripts/train_xgboost.py \ - --max_depth $MAX_DEPTH \ - --eta $ETA \ - --subsample $SUBSAMPLE \ - --colsample_bytree $COLSAMPLE \ - --num_round $NUM_ROUND \ - --train titanic_train.csv - -print(f"Total local runtime: {t.time() - start:.2f} seconds") - -``` - -Training on this small dataset should take <1 minute. Log runtime as a baseline. You should see the following output file: - -- `xgboost-model` — Serialized XGBoost model (Booster) via joblib; load with `joblib.load()` for reuse. - -## Evaluate the trained model on validation data - -Now that we've trained and saved an XGBoost model, we want to do the most important sanity check: -**Does this model make reasonable predictions on unseen data?** - -This step: -1. Loads the serialized model artifact that was written by `train_xgboost.py` -2. Loads a test set of Titanic passenger data -3. Applies the same preprocessing as training -4. Generates predictions -5. Computes simple accuracy - -First, we'll download the test data - -```python -blob = bucket.blob("titanic_test.csv") -blob.download_to_filename("titanic_test.csv") - -print("Downloaded titanic_test.csv") -``` - -Then, we apply the same preprocessing function used by our training script before applying the model to our data. - -> **Note:** The `import` below treats the repo as a Python package. 
This works because we cloned the repo into `/home/jupyter/` and the directory contains an `__init__.py`. If you get an `ImportError`, make sure your working directory is `/home/jupyter/` (run `%cd /home/jupyter/` first). - -> **Note on test data:** The training script internally splits its input data 80/20 for training and validation. The `titanic_test.csv` file we use here is a **separate, held-out test set** that was never seen during training — not even by the internal validation split. This gives us an unbiased measure of model performance. - -```python -import pandas as pd -import xgboost as xgb -import joblib -from sklearn.metrics import accuracy_score -from Intro_GCP_for_ML.scripts.train_xgboost import preprocess_data # reuse same preprocessing - -# Load test data -test_df = pd.read_csv("titanic_test.csv") - -# Apply same preprocessing from training -X_test, y_test = preprocess_data(test_df) - -# Load trained model from local file -model = joblib.load("xgboost-model") - -# Predict on test data -dtest = xgb.DMatrix(X_test) -y_pred = model.predict(dtest) -y_pred_binary = (y_pred > 0.5).astype(int) - -# Compute accuracy -acc = accuracy_score(y_test, y_pred_binary) -print(f"Test accuracy: {acc:.3f}") -``` - -You should see test accuracy in the range of **0.78–0.82**. If accuracy is significantly lower, double-check that the test data downloaded correctly and that the preprocessing matches the training script. - -::::::::::::::::::::::::::::::::::::::: challenge - -### Experiment with hyperparameters - -Try changing `NUM_ROUND` to `200` and re-running the local training and evaluation cells above. Does accuracy improve? How does the runtime change? Then try `MAX_DEPTH = 6`. What happens to accuracy — does the model improve, or does it start overfitting? - -::::::::::::::::::::::::::::::::::::::: solution - -### Solution - -Increasing `NUM_ROUND` from 100 to 200 may marginally improve accuracy but roughly doubles runtime. 
Increasing `MAX_DEPTH` from 3 to 6 lets trees capture more complex patterns but can lead to overfitting on a small dataset like Titanic — you may see training accuracy increase while test accuracy stays flat or drops. This is why testing hyperparameters locally before scaling is important. - -::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::::::::::::::: - -## Training via Vertex AI custom training job -Unlike "local" training using our notebook's VM, this next approach launches a **managed training job** that runs on scalable compute. Vertex AI handles provisioning, scaling, logging, and saving outputs to GCS. - -### Which machine type to start with? -Start with a small CPU machine like `n1-standard-4`. Only scale up to GPUs/TPUs once you've verified your script. See [Compute for ML](https://qualiamachine.github.io/Intro_GCP_for_ML/compute-for-ML.html) for guidance. - -```python -MACHINE = 'n1-standard-4' -``` - -### Creating a custom training job with the SDK - -> **Reminder:** We're using the Python SDK from a notebook here, but the same `aiplatform.CustomTrainingJob` calls work identically in a standalone `.py` script, a shell session, or a CI pipeline. You can also submit jobs entirely from the command line with `gcloud ai custom-jobs create`. See the callout in Episode 2 for more details. - -We'll first initialize the Vertex AI platform with our environment variables. We'll also set a `RUN_ID` and `ARTIFACT_DIR` to help store outputs. 
- -```python -from google.cloud import aiplatform -import datetime as dt -RUN_ID = dt.datetime.now().strftime("%Y%m%d-%H%M%S") -ARTIFACT_DIR = f"gs://{BUCKET_NAME}/artifacts/xgb/{RUN_ID}/" # everything will live beside this -print(f"project = {PROJECT_ID}\nregion = {REGION}\nbucket = {BUCKET_NAME}\nartifact_dir = {ARTIFACT_DIR}") - -# Staging bucket is only for the SDK's temp code tarball (aiplatform-*.tar.gz) -aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=f"gs://{BUCKET_NAME}/.vertex_staging") -``` - -> ### What does `aiplatform.init()` do? -> -> `aiplatform.init()` sets **session-wide defaults** for the Vertex AI Python SDK. Every SDK call you make afterward (creating jobs, uploading models, querying metadata, etc.) will inherit these values so you don't have to repeat them each time. The three arguments we pass here are: -> -> | Argument | Purpose | -> |---|---| -> | `project` | The Google Cloud project that owns (and is billed for) all Vertex AI resources you create. | -> | `location` | The region where jobs run and artifacts are stored (e.g., `us-central1`). Must match the region of any buckets or endpoints you reference. | -> | `staging_bucket` | A Cloud Storage path where the SDK **automatically packages and uploads your training code** as a tarball (e.g., `aiplatform-2025-01-15-…-.tar.gz`). The training VM downloads this tarball at startup to run your script. We point it to a `.vertex_staging` subfolder to keep these temporary archives separate from your real data and model artifacts. | -> -> You only need to call `aiplatform.init()` once per notebook or script session. If you ever need to override a default for a single call (e.g., run a job in a different region), you can pass the argument directly to that method and it will take precedence. 
- -A [`CustomTrainingJob`](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.CustomTrainingJob) is the Vertex AI SDK object that ties together three things: **your training script**, a **container image** to run it in, and **metadata** such as a display name. Think of it as a reusable job definition — it doesn't start any compute by itself. Only when you call `job.run()` (next step) does Vertex AI actually provision a VM, ship your code to it, and execute the script. - -The code below creates a `CustomTrainingJob` that points to `train_xgboost.py`, uses Google's prebuilt XGBoost training container (which already includes common dependencies like `google-cloud-storage`), and sets a `display_name` for tracking the job in the Vertex AI console. - -> **Tip:** If your script needs packages not included in the prebuilt container, you can pass a `requirements` list to `CustomTrainingJob` (e.g., `requirements=["scikit-learn>=1.3"]`). - -#### Prebuilt containers for training -Vertex AI provides prebuilt Docker container images for model training. These containers are organized by machine learning frameworks and framework versions and include common dependencies that you might want to use in your training code. To learn more about prebuilt training containers, see [Prebuilt containers for custom training](https://docs.cloud.google.com/vertex-ai/docs/training/pre-built-containers). - -```python - -job = aiplatform.CustomTrainingJob( - display_name=f"{LAST_NAME}_xgb_{RUN_ID}", - script_path="Intro_GCP_for_ML/scripts/train_xgboost.py", - container_uri="us-docker.pkg.dev/vertex-ai/training/xgboost-cpu.2-1:latest", -) -``` - -> **Version alignment:** Notice that the container tag `xgboost-cpu.2-1` matches the `xgboost==2.1.0` we installed locally. This is intentional — pinning the same library version in both environments ensures that local and cloud training produce identical results given the same data and random seed. 
- -Finally, this next block launches the custom training job on Vertex AI using the configuration defined earlier. **We won't be charged for our selected `MACHINE` until we run the below code using `job.run()`.** For an `n1-standard-4` running 2–5 minutes, expect a cost of roughly **`$0.01`–`$0.02`** — negligible, but good to be aware of as you scale to larger machines. This marks the point when our script actually begins executing remotely on the Vertex training infrastructure. Once `job.run()` is called, Vertex handles packaging your training script, transferring it to the managed training environment, provisioning the requested compute instance, and monitoring the run. The job's status and logs can be viewed directly in the Vertex AI Console under Training → Custom jobs. - -If you need to cancel or modify a job mid-run, you can do so from the console or via the SDK by calling job.cancel(). When the job completes, Vertex automatically tears down the compute resources so you only pay for the active training time. - -- The `args` list passes command-line parameters directly into your training script, including hyperparameters and the path to the training data in GCS. -- `replica_count=1` means we run a single training worker. Increase this for distributed training across multiple machines (e.g., data-parallel training with large datasets). -- `base_output_dir` specifies where all outputs (model, metrics, logs) will be written in Cloud Storage. -- `machine_type` controls the compute resources used for training. -- When `sync=True`, the notebook waits until the job finishes before continuing, making it easier to inspect results immediately after training. 
- -```python -job.run( - args=[ - f"--train=gs://{BUCKET_NAME}/titanic_train.csv", - f"--max_depth={MAX_DEPTH}", - f"--eta={ETA}", - f"--subsample={SUBSAMPLE}", - f"--colsample_bytree={COLSAMPLE}", - f"--num_round={NUM_ROUND}", - ], - replica_count=1, - machine_type=MACHINE, # MACHINE variable defined above; adjust to something more powerful when needed - base_output_dir=ARTIFACT_DIR, # sets AIP_MODEL_DIR for your script - sync=True, -) - -print("Model + logs folder:", ARTIFACT_DIR) -``` - -This launches a managed training job with Vertex AI. It should take 2-5 minutes for the training job to complete. - -### Understanding the training output message - -After your job finishes, you may see a message like: `Training did not produce a Managed Model returning None.` This is expected when running a `CustomTrainingJob` without specifying deployment parameters. Vertex AI supports two modes: - -- **CustomTrainingJob (research/development)** – You control training and save models/logs to Cloud Storage via `AIP_MODEL_DIR`. This is ideal for experimentation and cost control. -- **CustomTrainingJob with model registration (for deployment)** – You include `model_serving_container_image_uri` and `model_display_name`, and Vertex automatically registers a *Managed Model* in the Model Registry for deployment to an endpoint. - -In our setup, we're intentionally using the simpler **CustomTrainingJob** path without model registration. Your trained model is safely stored under your specified artifact directory (e.g., `gs://{BUCKET_NAME}/artifacts/xgb/{RUN_ID}/`), and you can later register or deploy it manually when ready. - - -## Monitoring training jobs in the Console - -> **Why do I see both a Training Pipeline and a Custom Job?** -> Under the hood, `CustomTrainingJob.run()` creates a **TrainingPipeline** resource, which in turn launches a **CustomJob** to do the actual compute work. 
This is normal — the pipeline is a thin wrapper that manages job lifecycle and (optionally) model registration. You can monitor progress from either view, but **Custom Jobs** shows the most useful details (logs, machine type, status). - -1. Go to the [Google Cloud Console](https://console.cloud.google.com/vertex-ai/training/custom-jobs). -2. Navigate to **Vertex AI > Training > Custom Jobs**. -3. Click on your job name to see status, logs, and output model artifacts. -4. Cancel jobs from the console if needed (be careful not to stop jobs you don't own in shared projects). - -#### Visit the console to verify it's running. - -Navigate to **Vertex AI > Training > Custom Jobs** in the [Google Cloud Console](https://console.cloud.google.com/vertex-ai/training/custom-jobs) to view your running or completed jobs. - -### If your job fails - -Job failures are common when first getting started. Here's how to debug: - -1. **Check the logs first.** In the Console, click your job name → **Logs** tab. The error message is usually near the bottom. -2. **Common failure modes:** - - **Quota exceeded** — Your project may not have enough quota for the requested machine type. Check **IAM & Admin > Quotas**. - - **Script error** — A bug in your training script. The traceback will appear in the logs. Fix the bug and re-run locally before resubmitting. - - **Wrong container** — Mismatched framework version or CPU/GPU container. Verify your `container_uri`. - - **Permission denied on GCS** — The training service account can't access your bucket. Check bucket permissions. -3. **Re-test locally** with the same arguments before resubmitting to avoid burning compute time on the same error. - -## Training artifacts - -After the training run completes, we can manually view our bucket using the [Google Cloud Console](https://console.cloud.google.com/storage/browser) or run the below code. 
- -```python -total_size_bytes = 0 - -for blob in client.list_blobs(BUCKET_NAME): - total_size_bytes += blob.size - print(blob.name) - -total_size_mb = total_size_bytes / (1024**2) -print(f"Total size of bucket '{BUCKET_NAME}': {total_size_mb:.2f} MB") -``` - -#### Training Artifacts → `ARTIFACT_DIR` -This is your *intended output location*, set via `base_output_dir`. -It contains everything your training script explicitly writes. In our case, this includes: - -- **`{BUCKET_NAME}/artifacts/xgb/{RUN_ID}/model/xgboost-model`** — Serialized XGBoost model (Booster) saved via `joblib`; reload later with `joblib.load()` for reuse or deployment. - - -#### System-Generated Staging Files - -You'll also notice files under `.vertex_staging/` — one timestamped tarball per job submission: - -```text -.vertex_staging/aiplatform-2026-03-04-05:51:20.248-aiplatform_custom_trainer_script-0.1.tar.gz -.vertex_staging/aiplatform-2026-03-04-05:53:28.009-aiplatform_custom_trainer_script-0.1.tar.gz -... -``` - -Each time you call `job.run(...)`, the SDK packages your training script into a `.tar.gz`, uploads it here, and the training VM downloads it at startup. These accumulate quickly — the example above shows 19 archives from a single day of iteration. They are safe to delete once the job finishes, and you can automate cleanup with [Object Lifecycle Management](https://cloud.google.com/storage/docs/lifecycle) rules (e.g., auto-delete objects in `.vertex_staging/` after 7 days). - -To delete all staging files now, run: - -```python -!gsutil -m rm -r gs://{BUCKET_NAME}/.vertex_staging/ -``` - -This won't affect your model artifacts under `artifacts/`. - -## Evaluate the trained model stored on GCS - -Now let's compare the model produced by our Vertex AI job to the one we trained locally. This time, instead of loading from the local disk, we'll load both the test data and model artifact directly from GCS into memory — the recommended approach for production workflows. 
- -```python -import io - -# Load test data directly from GCS into memory -bucket = client.bucket(BUCKET_NAME) -blob = bucket.blob("titanic_test.csv") -test_df = pd.read_csv(io.BytesIO(blob.download_as_bytes())) - -# Apply same preprocessing logic used during training -X_test, y_test = preprocess_data(test_df) - -# Load the model artifact from GCS -MODEL_BLOB_PATH = f"artifacts/xgb/{RUN_ID}/model/xgboost-model" -model_blob = bucket.blob(MODEL_BLOB_PATH) -model_bytes = model_blob.download_as_bytes() -model = joblib.load(io.BytesIO(model_bytes)) - -# Run predictions and compute accuracy -dtest = xgb.DMatrix(X_test) -y_pred_prob = model.predict(dtest) -y_pred = (y_pred_prob >= 0.5).astype(int) - -acc = accuracy_score(y_test, y_pred) -print(f"Test accuracy (model from Vertex job): {acc:.3f}") -``` - -::::::::::::::::::::::::::::::::::::::: challenge - -### Compare local vs. Vertex AI accuracy - -Compare the test accuracy from your local training run with the accuracy from the Vertex AI job. Are they the same? Why or why not? - -::::::::::::::::::::::::::::::::::::::: solution - -### Solution - -The two accuracy values should be **very close** (within ~1–2 percentage points) but may not be byte-for-byte identical, even though both runs use the same script, hyperparameters, data, and random seed (`seed=42`). - -Why? The `subsample=0.8` and `colsample_bytree=0.8` settings randomly sample rows and columns each boosting round. A seed guarantees determinism only within the **exact same** library version, NumPy build, and BLAS/LAPACK backend. The Workbench notebook VM and the prebuilt training container ship different underlying numerical libraries (e.g., OpenBLAS vs. MKL), so even with identical XGBoost versions the random sampling sequence can diverge slightly — producing a different model and therefore a small accuracy difference. 
- -If you want exact reproducibility, set `subsample=1.0` and `colsample_bytree=1.0` (no random sampling) or accept that minor variation across environments is normal and expected in practice. - -::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::::: challenge - -### Explore job logs in the Console - -Navigate to **Vertex AI > Training > Custom Jobs** in the [Google Cloud Console](https://console.cloud.google.com/vertex-ai/training/custom-jobs). Find your most recent job and click on it. Can you locate: - -1. The **Logs** tab showing your script's `print()` output? -2. The training time printed by `train_model()`? -3. The output artifact path? - -::::::::::::::::::::::::::::::::::::::: solution - -### Solution - -1. Click your job name, then select the **Logs** tab (or **View logs** link). Your script's `print()` statements — including train/val sizes, training time, and model save path — appear in the log stream. -2. Look for the line `Training time: X.XX seconds` in the logs. This comes from the `train_model()` function in `train_xgboost.py`. -3. The artifact path is shown in the log line `Model saved to gs://...` and also appears in the job details panel under output configuration. - -::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::::::::::::::: - -### Looking ahead: when training takes too long - -The Titanic dataset is tiny, so our job finishes in minutes. In your real work, you'll encounter datasets and models where a single training run takes hours or days. When that happens, Vertex AI gives you two main levers: - -**Option 1: Upgrade to more powerful machine types** -- Use a larger machine or add GPUs (e.g., T4, V100, A100). This is the simplest approach and works well for datasets under ~10 GB. - -**Option 2: Use distributed training with multiple replicas** -- Split the dataset across replicas with synchronized gradient updates. 
This becomes worthwhile when datasets exceed 10–50 GB or single-machine training takes more than 10 hours. - -We'll explore both options hands-on in the next episode when we train a PyTorch neural network with GPU acceleration. - -::::::::::::::::::::::::::::::::::::: keypoints - -- **Environment initialization**: Use `aiplatform.init()` to set defaults for project, region, and bucket. -- **Local vs managed training**: Test locally before scaling into managed jobs. -- **Custom jobs**: Vertex AI lets you run scripts as managed training jobs using pre-built or custom containers. -- **Scaling**: Start small, then scale up to GPUs or distributed jobs as dataset/model size grows. -- **Monitoring**: Track job logs and artifacts in the Vertex AI Console. - -:::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/04-Training-models-on-CHTC.md b/episodes/04-Training-models-on-CHTC.md new file mode 100644 index 00000000..0fdde51a --- /dev/null +++ b/episodes/04-Training-models-on-CHTC.md @@ -0,0 +1,451 @@ +--- +title: "Training Models on CHTC with HTCondor" +teaching: 25 +exercises: 15 +--- + +:::::::::::::::::::::::::::::::::::::: questions + +- How do I write an HTCondor submit file to run a training job on CHTC? +- How do I use Docker containers to manage my software environment on CHTC? +- How do I monitor my jobs and troubleshoot failures? + +:::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: objectives + +- Write an HTCondor submit file that runs an XGBoost training script inside a Docker container. +- Test a training script locally on the submit node before submitting to the cluster. +- Submit, monitor, and inspect the results of an HTCondor training job. +- Interpret HTCondor job states (Idle, Running, Held, Completed) and use log files for debugging. +- Request appropriate compute resources (CPUs, memory, disk) for a training job. 
+ +:::::::::::::::::::::::::::::::::::::::::::::::: + +In [Episode 1](01-Introduction.md) we introduced the "submit node as controller" pattern — your submit node is a lightweight machine where you write code, prepare data, and launch jobs. The actual training happens on powerful execute nodes managed by HTCondor. In this episode we put that pattern into practice by submitting an XGBoost training job to CHTC. + +## The training script: train_xgboost.py + +We will use the same `train_xgboost.py` script introduced earlier in this workshop. It trains an XGBoost classifier on the Titanic dataset, accepts hyperparameters via command-line arguments, and saves a serialized model artifact. The key design feature is that the script is **self-contained** — it reads a local CSV file, trains the model, and writes output files to the current working directory. This makes it ideal for HTCondor, where each job runs in its own isolated scratch directory on an execute node. + +The script accepts the following arguments: + +- `--train` — path to the training CSV file +- `--max_depth` — maximum tree depth (controls model complexity) +- `--eta` — learning rate +- `--subsample` — fraction of rows sampled per boosting round +- `--colsample_bytree` — fraction of features sampled per tree +- `--num_round` — number of boosting iterations + +::::::::::::::::::::::::::::::::::::::: challenge + +### Understanding the training script + +Review `scripts/train_xgboost.py` and answer the following: + +1. What preprocessing steps does the script apply before training? +2. What output file(s) does the script produce? +3. How would you change the number of boosting rounds from the command line? + +::::::::::::::::::::::::::::::::::::::: solution + +### Solution + +1. The script fills missing values (`Age` with median, `Embarked` with mode), maps categorical fields to numeric values (`Sex` to 0/1, `Embarked` to 0/1/2), and drops non-predictive columns (`PassengerId`, `Name`, `Ticket`, `Cabin`). +2. 
It produces a file called `xgboost-model` — a serialized XGBoost Booster object saved with `joblib`. +3. Pass `--num_round 200` (or any integer) on the command line. + +::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::::::::::::: + +## Testing locally on the submit node + +Before submitting to the cluster, always test your script on the submit node with a quick run. This catches bugs, missing dependencies, and data issues before you wait in the job queue. + +::::::::::::::::::::::::::::::::::::: callout + +### Keep local tests small + +The submit node is a shared resource — do not run heavy computations on it. For local testing, use a small dataset or a small number of training rounds (e.g., `--num_round 5`). The goal is to verify that the script runs without errors, not to produce a good model. + +::::::::::::::::::::::::::::::::::::: + +```bash +$ python3 train_xgboost.py \ + --train titanic_train.csv \ + --max_depth 3 \ + --eta 0.1 \ + --num_round 5 +``` + +If the script finishes without errors and produces an `xgboost-model` file, you are ready to submit to HTCondor. Remove the test output before submitting: + +```bash +$ rm -f xgboost-model +``` + +## Writing an HTCondor submit file + +An HTCondor submit file (`.sub`) tells HTCondor everything it needs to run your job: what to execute, which files to transfer, what resources to request, and where to write logs. 
Here is a complete submit file for our XGBoost training job: + +``` +# train_xgboost.sub — HTCondor submit file for XGBoost training + +universe = docker +docker_image = python:3.10 + +executable = train_xgboost.py +arguments = --train titanic_train.csv --max_depth 3 --eta 0.1 --num_round 100 + +transfer_input_files = train_xgboost.py, titanic_train.csv +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +log = job_$(Cluster).log +output = job_$(Cluster).out +error = job_$(Cluster).err + +request_cpus = 1 +request_memory = 2GB +request_disk = 1GB + +queue 1 +``` + +Let's walk through each section. + +### Container environment + +``` +universe = docker +docker_image = python:3.10 +``` + +These two lines tell HTCondor to run your job inside a Docker container. The `python:3.10` image from Docker Hub provides a clean Python environment, but it does not include third-party packages. Because `train_xgboost.py` imports packages such as `xgboost`, `pandas`, and `scikit-learn`, the job will fail with a `ModuleNotFoundError` unless you provide them, so you have two options: + +1. **Add a `pip install` step** — make your executable a wrapper shell script that installs packages before running the Python script. +2. **Build a custom Docker image** — create an image with all dependencies pre-installed and push it to Docker Hub. This is the recommended approach for reproducibility and faster job startup. + +::::::::::::::::::::::::::::::::::::: callout + +### Using a custom Docker image + +For production workflows, build a Docker image that includes all your dependencies. For example, if you have a `Dockerfile`: + +```dockerfile +FROM python:3.10 +RUN pip install xgboost==2.1.0 pandas scikit-learn joblib +``` + +Build and push it to Docker Hub: + +```bash +$ docker build -t yourusername/xgboost-train:v1 . +$ docker push yourusername/xgboost-train:v1 +``` + +Then reference it in your submit file: + +``` +universe = docker +docker_image = yourusername/xgboost-train:v1 +``` + +This avoids installing packages every time a job runs, which saves time and ensures consistent environments.
+ +::::::::::::::::::::::::::::::::::::: + +### Executable and arguments + +``` +executable = train_xgboost.py +arguments = --train titanic_train.csv --max_depth 3 --eta 0.1 --num_round 100 +``` + +The `executable` is the script HTCondor will run. HTCondor launches this file directly rather than passing it to `python`, so the script must begin with a shebang line (e.g., `#!/usr/bin/env python3`). The `arguments` line passes command-line arguments to the executable, just as you would type them in a shell. Note that the file paths in `arguments` are relative to the job's working directory on the execute node — HTCondor creates a temporary scratch directory for each job and places transferred files there. + +### File transfer + +``` +transfer_input_files = train_xgboost.py, titanic_train.csv +should_transfer_files = YES +when_to_transfer_output = ON_EXIT +``` + +HTCondor copies the listed input files from your submit directory to the execute node before the job starts. When the job finishes (`ON_EXIT`), any new or changed files in the top level of the working directory (not inside subdirectories) are transferred back to the submit directory. This is how you get your trained model (`xgboost-model`) back. + +### Log, output, and error files + +``` +log = job_$(Cluster).log +output = job_$(Cluster).out +error = job_$(Cluster).err +``` + +HTCondor writes three files for each job: + +| File | Contents | +|------|----------| +| `.log` | HTCondor system events: job submitted, started, finished, evicted, resource usage | +| `.out` | Everything your script writes to **stdout** (`print()` statements) | +| `.err` | Everything your script writes to **stderr** (warnings, errors, tracebacks) | + +The `$(Cluster)` macro is replaced with the job's cluster ID (a unique number), so each submission produces uniquely named files. + +### Resource requests + +``` +request_cpus = 1 +request_memory = 2GB +request_disk = 1GB +``` + +These tell HTCondor what your job needs. HTCondor uses these to match your job to a machine with sufficient resources. + +::::::::::::::::::::::::::::::::::::: callout + +### How to choose resource requests + +- **Start small.** Request only what you need.
Over-requesting wastes shared resources and can make your job wait longer in the queue. +- **Check actual usage.** After a job completes, look at the `.log` file for a "Partitionable Resources" summary that shows how much memory and disk the job actually used. Use this to refine future requests. +- **Common starting points for ML jobs:** + - Small tabular data (XGBoost, sklearn): 1 CPU, 2 GB memory, 1 GB disk + - Medium neural networks: 1 CPU + 1 GPU, 8 GB memory, 5 GB disk + - Large models (fine-tuning LLMs): 1 CPU + 1 GPU (A100/H100), 32–64 GB memory, 20+ GB disk + +::::::::::::::::::::::::::::::::::::: + +### The queue command + +``` +queue 1 +``` + +This tells HTCondor to submit one instance of the job. In [Episode 6](06-Hyperparameter-tuning.md), we will use `queue` with multiple arguments to submit many jobs at once for hyperparameter sweeps. + +## Submitting the job + +Once your submit file is ready, submit it with `condor_submit`: + +```bash +$ condor_submit train_xgboost.sub +``` + +You should see output like: + +``` +Submitting job(s). +1 job(s) submitted to cluster 1234567. +``` + +The **cluster ID** (here `1234567`) is the unique identifier for your submission. You will use it to monitor the job and find your output files. + +## Monitoring your job + +### condor_q — check the job queue + +```bash +$ condor_q +``` + +This shows all your jobs currently in the queue. A typical output looks like: + +``` +OWNER BATCH_NAME SUBMITTED DONE RUN IDLE TOTAL JOB_IDS +user job_1234567 3/25 14:02 - 1 - 1 1234567.0 +``` + +### condor_watch_q — live updates + +```bash +$ condor_watch_q +``` + +This provides a live-updating view of your jobs, similar to `watch condor_q`. Press `Ctrl+C` to exit. + +### Understanding job states + +| State | Meaning | +|-------|---------| +| **Idle (I)** | Job is waiting for a matching execute node. This is normal — it may take seconds to minutes depending on cluster load and your resource requests. 
| +| **Running (R)** | Job is actively executing on an execute node. | +| **Held (H)** | Something went wrong and HTCondor has paused the job. Check the hold reason with `condor_q -hold`. Common causes include requesting more resources than available, Docker image pull failures, or file transfer errors. | +| **Completed (C)** | Job finished. Check `.out`, `.err`, and `.log` files for results. | + +::::::::::::::::::::::::::::::::::::: callout + +### What to do when a job is held + +A held job will not run until you fix the problem. To see why a job is held: + +```bash +$ condor_q -hold +``` + +Common hold reasons and fixes: + +- **"Docker image not found"** — Check for typos in `docker_image`. Verify the image exists on Docker Hub. +- **"Failed to transfer input files"** — Make sure all files listed in `transfer_input_files` exist in your submit directory. +- **"Memory limit exceeded"** — Your job used more memory than requested. Increase `request_memory` and resubmit. + +After fixing the issue, you can release the held job: + +```bash +$ condor_release <cluster_id> +``` + +Or remove it and resubmit: + +```bash +$ condor_rm <cluster_id> +``` + +::::::::::::::::::::::::::::::::::::: + +## Checking results after completion + +Once your job disappears from `condor_q` (meaning it has completed), check the output files in your submit directory: + +```bash +$ ls job_1234567.* +job_1234567.log job_1234567.out job_1234567.err +``` + +### Inspect stdout + +```bash +$ cat job_1234567.out +``` + +This should show the output from your training script, including dataset sizes, training time, and where the model was saved. + +### Check for errors + +```bash +$ cat job_1234567.err +``` + +If the job succeeded, this file is typically empty or contains only minor warnings. If the job failed, the Python traceback will appear here. + +### Check the HTCondor log + +```bash +$ cat job_1234567.log +``` + +The log file contains system-level information about your job's lifecycle.
At the end of a completed job, you will see a resource usage summary: + +``` +Partitionable Resources : Usage Request Allocated + Cpus : 1 1 1 + Disk (KB) : 150000 1048576 4110820 + Memory (MB) : 450 2048 2048 +``` + +This tells you how much of each resource your job actually used compared to what you requested. Use this to right-size future requests — if you requested 2 GB of memory but only used 450 MB, you can safely reduce `request_memory` to `1GB` next time. + +### Check for the model artifact + +```bash +$ ls -lh xgboost-model +``` + +If this file exists, your training job completed successfully and transferred the model back to the submit node. + +::::::::::::::::::::::::::::::::::::::: challenge + +### Submit and monitor a training job + +1. Create a file called `train_xgboost.sub` with the submit file contents shown above. +2. Make sure `train_xgboost.py` and `titanic_train.csv` are in the same directory. +3. Submit the job with `condor_submit train_xgboost.sub`. +4. Monitor it with `condor_q` and `condor_watch_q`. +5. After the job completes, examine the `.out`, `.err`, and `.log` files. +6. Verify that the `xgboost-model` file was transferred back. + +**Bonus:** Look at the resource usage summary in the `.log` file. How much memory did the job actually use? Could you reduce `request_memory` for future runs? + +::::::::::::::::::::::::::::::::::::::: solution + +### Solution + +```bash +$ condor_submit train_xgboost.sub +Submitting job(s). +1 job(s) submitted to cluster 1234567. + +$ condor_q +OWNER BATCH_NAME SUBMITTED DONE RUN IDLE TOTAL JOB_IDS +user job_1234567 3/25 14:02 - 1 - 1 1234567.0 + +$ cat job_1234567.out +# (training output: dataset size, training time, model saved message) + +$ cat job_1234567.err +# (should be empty or contain minor warnings) + +$ ls -lh xgboost-model +-rw-r--r-- 1 user user 48K Mar 25 14:05 xgboost-model +``` + +For the bonus: check the `Partitionable Resources` section of the `.log` file. 
The Titanic dataset is small, so memory usage will likely be well under 1 GB. You could safely reduce `request_memory` to `1GB` for this job, though 2 GB provides a comfortable margin. + +::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::: challenge + +### Diagnose a held job + +Suppose you submit a job and see this in `condor_q`: + +``` +OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS +user job_1234568 3/25 14:10 - - - 1 1 1234568.0 +``` + +The job is held. What steps would you take to diagnose and fix the problem? + +::::::::::::::::::::::::::::::::::::::: solution + +### Solution + +1. Run `condor_q -hold` to see the hold reason. For example: + ``` + 1234568.0: Error from slot: Failed to pull Docker image 'python:3.1' + ``` +2. The hold reason tells you the Docker image name is wrong — `python:3.1` does not exist (the correct tag is `python:3.10`). +3. Fix the `docker_image` line in your submit file. +4. Remove the held job with `condor_rm 1234568`. +5. Resubmit with `condor_submit train_xgboost.sub`. + +The general debugging workflow is: **check the hold reason** with `condor_q -hold`, **fix the underlying issue** in your submit file or script, **remove the broken job**, and **resubmit**. 
+ +::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::::::::::::: + +## Summary of useful HTCondor commands + +| Command | Purpose | +|---------|---------| +| `condor_submit file.sub` | Submit a job | +| `condor_q` | Check your jobs in the queue | +| `condor_watch_q` | Live-updating job status | +| `condor_q -hold` | See why a job is held | +| `condor_release <cluster_id>` | Release a held job | +| `condor_rm <cluster_id>` | Remove a job from the queue | +| `condor_history <cluster_id>` | Check details of a completed job | + +::::::::::::::::::::::::::::::::::::: keypoints + +- **Test locally first**: Always run your training script on the submit node with a small test before submitting to HTCondor. +- **Submit files are declarative**: An HTCondor `.sub` file specifies the executable, container image, input files, resource requests, and log file locations — everything HTCondor needs to run your job. +- **Docker containers provide reproducibility**: Use `universe = docker` and `docker_image` to run jobs in a consistent software environment across different execute nodes. +- **File transfer is automatic**: HTCondor transfers input files to the execute node before the job starts and transfers output files back when it finishes. +- **Monitor and debug with HTCondor tools**: Use `condor_q`, `condor_watch_q`, and `condor_q -hold` to track job status, and inspect `.out`, `.err`, and `.log` files to diagnose problems. +- **Right-size your resource requests**: Check actual resource usage in the `.log` file and adjust `request_cpus`, `request_memory`, and `request_disk` accordingly.
+ +:::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/05-Training-models-in-VertexAI-GPUs.md b/episodes/05-Training-models-in-VertexAI-GPUs.md deleted file mode 100644 index 64538400..00000000 --- a/episodes/05-Training-models-in-VertexAI-GPUs.md +++ /dev/null @@ -1,531 +0,0 @@ ---- -title: "Training Models in Vertex AI: PyTorch Example" -teaching: 20 -exercises: 10 ---- - -:::::::::::::::::::::::::::::::::::::: questions - -- When should you consider a GPU (or TPU) instance for PyTorch training in Vertex AI, and what are the trade‑offs for small vs. large workloads? -- How do you launch a script‑based training job and write **all** artifacts (model, metrics, logs) next to each other in GCS without deploying a managed model? - -:::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: objectives - -- Prepare the Titanic dataset and save train/val arrays to compressed `.npz` files in GCS. -- Submit a *CustomTrainingJob* that runs a PyTorch script and explicitly writes outputs to a chosen `gs://…/artifacts/.../` folder. -- Co‑locate artifacts: `model.pt` (or `.joblib`), `metrics.json`, `eval_history.csv`, and `training.log` for reproducibility. -- Choose CPU vs. GPU instances sensibly; understand when distributed training is (not) worth it. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -## Initial setup - -#### 1. Open pre-filled notebook -Navigate to `/Intro_GCP_for_ML/notebooks/05-Training-models-in-VertexAI-GPUs.ipynb` to begin this notebook. **Select the *PyTorch* environment (kernel).** Local PyTorch is only needed for local tests. Your *Vertex AI job* uses the container specified by `container_uri` (e.g., `pytorch-xla.2-4.py310` for CPU or `pytorch-gpu.2-4.py310` for GPU), so it brings its own framework at run time. - -#### 2. CD to instance home directory -To ensure we're all in the same starting spot, change directory to your Jupyter home directory. - -```python -%cd /home/jupyter/ -``` - -#### 3. 
Set environment variables -This code initializes the Vertex AI environment by importing the Python SDK, setting the project, region, and defining a GCS bucket for input/output data. - -```python -from google.cloud import aiplatform, storage -client = storage.Client() -PROJECT_ID = client.project -REGION = "us-central1" -BUCKET_NAME = "doe-titanic" # ADJUST to your bucket's name -LAST_NAME = 'DOE' # ADJUST to your last name. Since we're in a shared account environment, this will help us track down jobs in the Console - -print(f"project = {PROJECT_ID}\nregion = {REGION}\nbucket = {BUCKET_NAME}") - -# initializes the Vertex AI environment with the correct project and location. Staging bucket is used for storing the compressed software that's packaged for training/tuning jobs. -aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=f"gs://{BUCKET_NAME}/.vertex_staging") # store tar balls in staging folder -``` - -## Prepare data as `.npz` - -Unlike the XGBoost script from Episode 4 (which handles preprocessing internally from raw CSV), our PyTorch script expects pre-processed NumPy arrays. We'll prepare those here and save them as `.npz` files. - -Why `.npz`? NumPy's `.npz` files are compressed binary containers that can store multiple arrays (e.g., features and labels) together in a single file: - -- **Compact & fast:** smaller than CSV, and one file can hold multiple arrays (`X_train`, `y_train`). -- **Cloud-friendly:** each `.npz` is a single GCS object — one network call to read instead of streaming many small files, reducing latency and egress costs. -- **Vertex AI integration:** when you launch a training job, GCS objects are automatically staged to the job VM's local scratch disk, so `np.load(...)` reads from local storage at runtime. -- **Reproducible:** unlike CSV, `.npz` preserves exact dtypes and shapes across environments. 
- - -```python -import pandas as pd -import io -import numpy as np -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler, LabelEncoder - -# Load Titanic CSV (from local or GCS you've already downloaded to the notebook) -bucket = client.bucket(BUCKET_NAME) -blob = bucket.blob("titanic_train.csv") -df = pd.read_csv(io.BytesIO(blob.download_as_bytes())) - -# Minimal preprocessing to numeric arrays -sex_enc = LabelEncoder().fit(df["Sex"]) -df["Sex"] = sex_enc.transform(df["Sex"]) -df["Embarked"] = df["Embarked"].fillna("S") -emb_enc = LabelEncoder().fit(df["Embarked"]) -df["Embarked"] = emb_enc.transform(df["Embarked"]) -df["Age"] = df["Age"].fillna(df["Age"].median()) -df["Fare"] = df["Fare"].fillna(df["Fare"].median()) - -X = df[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]].values -y = df["Survived"].values - -scaler = StandardScaler() -X = scaler.fit_transform(X) - -X_train, X_val, y_train, y_val = train_test_split( - X, y, test_size=0.2, random_state=42) - -np.savez("/home/jupyter/train_data.npz", X_train=X_train, y_train=y_train) -np.savez("/home/jupyter/val_data.npz", X_val=X_val, y_val=y_val) - -``` - -We can then upload the files to our GCS bucket. - -```python -# Upload to GCS -bucket.blob("data/train_data.npz").upload_from_filename("/home/jupyter/train_data.npz") -bucket.blob("data/val_data.npz").upload_from_filename("/home/jupyter/val_data.npz") -print("Uploaded: gs://%s/data/train_data.npz and val_data.npz" % BUCKET_NAME) -``` - -Verify the upload by listing your bucket contents (same pattern as Episode 3): - -```python -for blob in client.list_blobs(BUCKET_NAME): - print(blob.name) -``` - -## Minimal PyTorch training script (`train_nn.py`) - local test - -Running a quick test on the Workbench notebook VM is cheap — it's a lightweight machine that costs only ~$0.19/hr. The real cost comes later when you launch managed training jobs with larger machines or GPUs. 
Think of your notebook as a low-cost controller: use it to catch bugs and verify logic before spending on cloud compute. - -As you gain confidence, you can skip the notebook VM entirely and run these tests on your own laptop or lab machine — then submit jobs to Vertex AI via the `gcloud` CLI or Python SDK from anywhere (see [Episode 8](08-CLI-workflows.md)). That eliminates the VM cost altogether. - -- For large datasets, use a small representative sample of the total dataset when testing locally (i.e., just to verify that code is working and model overfits nearly perfectly after training enough epochs) -- For larger models, use smaller model equivalents (e.g., 100M vs 7B params) when testing locally - -Find this file in our repo: `Intro_GCP_for_ML/scripts/train_nn.py`. It does three things: -1) loads `.npz` from local or GCS paths (transparently handles both) -2) trains a small neural network (a 3-layer MLP) with early stopping -3) writes all outputs side‑by‑side (model + metrics + eval history + training.log) to the folder specified by the `AIP_MODEL_DIR` environment variable (set automatically by Vertex AI via `base_output_dir`), falling back to the current directory for local runs. - -::::::::::::::::::::::::::::::::::::: callout -### What's inside `train_nn.py`? (Quick reference) -You don't need to understand every line of the PyTorch code for this workshop — the focus is on how to package and run *any* training script on Vertex AI. That said, here's a quick orientation: - -- **GCS helpers** (top of file): `read_npz_any()` and `save_*_any()` functions detect `gs://` paths and use the GCS Python client automatically. This is the key pattern that makes the same script work locally and in the cloud. -- **`AIP_MODEL_DIR`**: Vertex AI sets this environment variable to tell your script where to write artifacts. The script reads it at the top of `main()`. 
-- **Model**: A small feedforward network (`TitanicNet`) — the architecture details aren't important for this lesson. -- **Early stopping**: Training halts when validation loss stops improving (controlled by `--patience`). This saves compute time and cost on cloud jobs. -::::::::::::::::::::::::::::::::::::::::::::::::: - -To test this code, we can run the following: - -```python -# configure training hyperparameters to use in all model training runs downstream -MAX_EPOCHS = 500 -LR = 0.001 -PATIENCE = 50 - -# local training run -import time as t - -start = t.time() - -# Example: run your custom training script with args -!python /home/jupyter/Intro_GCP_for_ML/scripts/train_nn.py \ - --train /home/jupyter/train_data.npz \ - --val /home/jupyter/val_data.npz \ - --epochs $MAX_EPOCHS \ - --learning_rate $LR \ - --patience $PATIENCE - -print(f"Total local runtime: {t.time() - start:.2f} seconds") -``` - -::::::::::::::::::::::::::::::::::::::: callout -### NumPy version mismatch? -If the cell above fails with a NumPy error (e.g., `module 'numpy' has no attribute ...`), run this fix and then re-run the training cell: - -```python -!pip install --upgrade --force-reinstall "numpy<2" -``` -The PyTorch kernel occasionally ships with NumPy 2.x, which has breaking API changes. -:::::::::::::::::::::::::::::::::::::: - -### Reproducibility test -Without reproducibility, it's impossible to gain reliable insights into the efficacy of our methods. An essential component of applied ML/AI is ensuring our experiments are reproducible. Let's first rerun the same code we did above to verify we get the same result. - -* Take a look near the top of `Intro_GCP_for_ML/scripts/train_nn.py` where we are setting multiple numpy and torch seeds to ensure reproducibility. 
- -```python -import time as t - -start = t.time() - -# Example: run your custom training script with args -!python /home/jupyter/Intro_GCP_for_ML/scripts/train_nn.py \ - --train /home/jupyter/train_data.npz \ - --val /home/jupyter/val_data.npz \ - --epochs $MAX_EPOCHS \ - --learning_rate $LR \ - --patience $PATIENCE - -print(f"Total local runtime: {t.time() - start:.2f} seconds") -``` - -**Please don't use cloud resources for code that is not reproducible!** - -### Evaluate the locally trained model on the validation data - -Let's load the model we just trained and run it against the validation set. This confirms the saved weights produce the expected accuracy before we move to cloud training. - -```python -import sys, torch, numpy as np -sys.path.append("/home/jupyter/Intro_GCP_for_ML/scripts") -from train_nn import TitanicNet - -# load validation data -d = np.load("/home/jupyter/val_data.npz") -X_val, y_val = d["X_val"], d["y_val"] - -# Convert to PyTorch tensors with the dtypes the model expects: -# - Features → float32: neural-network layers (Linear, BatchNorm) operate on floats. -# - Labels → long (int64): nn.BCEWithLogitsLoss (and most classification losses) -# expect integer class labels, not floats. -X_val_t = torch.tensor(X_val, dtype=torch.float32) -y_val_t = torch.tensor(y_val, dtype=torch.long) - -# rebuild model and load weights -m = TitanicNet() -state = torch.load("/home/jupyter/model.pt", map_location="cpu", weights_only=True) -m.load_state_dict(state) -m.eval() - -with torch.no_grad(): - probs = m(X_val_t).squeeze(1) # [N], sigmoid outputs in (0,1) - preds_t = (probs >= 0.5).long() # [N] int64 - correct = (preds_t == y_val_t).sum().item() - acc = correct / y_val_t.shape[0] - -print(f"Local model val accuracy: {acc:.4f}") - -``` - -We should see an accuracy that matches our best epoch in the local training run. Note that in our setup, early stopping is based on validation loss; not accuracy. 
- -## Launch the training job - -In the previous episode, we trained an XGBoost model using Vertex AI's CustomTrainingJob interface. Here, we'll do the same for a PyTorch neural network. The structure is nearly identical — we define a training script, select a prebuilt container (CPU or GPU), and specify where to write all outputs in Google Cloud Storage (GCS). The main difference is that PyTorch requires us to save our own model weights and metrics inside the script rather than relying on Vertex to package a model automatically. - -### Set training job configuration vars - -::::::::::::::::::::::::::::::::::::: callout -### Check supported container versions -**Container URI format matters.** The container must be registered for *python package training* (used by `CustomTrainingJob`). Use the `pytorch-xla` variant with a Python-version suffix — e.g., `pytorch-xla.2-4.py310:latest`. The `pytorch-cpu` and `pytorch-gpu` variants may not be registered for python package training. - -Google periodically retires older versions. If you see an `INVALID_ARGUMENT` error about an unsupported image, check the current list at [Prebuilt containers for training](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers#pytorch) and update the version number. -::::::::::::::::::::::::::::::::::::: - -```python -import datetime as dt -RUN_ID = dt.datetime.now().strftime("%Y%m%d-%H%M%S") -ARTIFACT_DIR = f"gs://{BUCKET_NAME}/artifacts/pytorch/{RUN_ID}" -IMAGE = 'us-docker.pkg.dev/vertex-ai/training/pytorch-xla.2-4.py310:latest' -MACHINE = "n1-standard-4" # CPU fine for small datasets - -print(f"RUN_ID = {RUN_ID}\nARTIFACT_DIR = {ARTIFACT_DIR}\nMACHINE = {MACHINE}") -``` - -### Init the training job with configurations - -```python -# init job (this does not consume any resources) -DISPLAY_NAME = f"{LAST_NAME}_pytorch_nn_{RUN_ID}" -print(DISPLAY_NAME) - -# init the job. 
This does not consume resources until we run job.run() -job = aiplatform.CustomTrainingJob( - display_name=DISPLAY_NAME, - script_path="Intro_GCP_for_ML/scripts/train_nn.py", - container_uri=IMAGE) - -``` - -### Run the job, paying for our `MACHINE` on-demand. - -```python -job.run( - args=[ - f"--train=gs://{BUCKET_NAME}/data/train_data.npz", - f"--val=gs://{BUCKET_NAME}/data/val_data.npz", - f"--epochs={MAX_EPOCHS}", - f"--learning_rate={LR}", - f"--patience={PATIENCE}", - ], - replica_count=1, - machine_type=MACHINE, - base_output_dir=ARTIFACT_DIR, # sets AIP_MODEL_DIR used by your script - sync=True, -) -print("Artifacts folder:", ARTIFACT_DIR) -``` -#### Monitoring training jobs in the Console - -> **Why do I see both a Training Pipeline and a Custom Job?** -> Under the hood, `CustomTrainingJob.run()` creates a **TrainingPipeline** resource, which in turn launches a **CustomJob** to do the actual compute work. This is normal — the pipeline is a thin wrapper that manages job lifecycle. You can monitor progress from either view, but **Custom Jobs** shows the most useful details (logs, machine type, status). - -1. Go to the [Google Cloud Console](https://console.cloud.google.com/vertex-ai/training/custom-jobs). -2. Navigate to **Vertex AI > Training > Custom Jobs**. -3. Click on your job name to see status, logs, and output model artifacts. -4. Cancel jobs from the console if needed (be careful not to stop jobs you don't own in shared projects). - -After the job completes, your training script writes several output files to the GCS artifact directory. Here's what you'll find in `gs://…/artifacts/pytorch//`: - -- `model.pt` — PyTorch weights (`state_dict`). -- `metrics.json` — final val loss, hyperparameters, dataset sizes, device, model URI. -- `eval_history.csv` — per‑epoch validation loss (for plots/regression checks). -- `training.log` — complete stdout/stderr for reproducibility and debugging. 
- -### Evaluate the Vertex-trained model on the validation data - -We can check our work to see if this model gives the same result as our "locally" trained model above. - -To follow best practices, we will simply load this model into memory from GCS. - -```python -import sys, torch, numpy as np -sys.path.append("/home/jupyter/Intro_GCP_for_ML/scripts") -from train_nn import TitanicNet - -# ----------------- -# download model.pt straight into memory and load weights -# ----------------- - -ARTIFACT_PREFIX = f"artifacts/pytorch/{RUN_ID}/model" - -MODEL_PATH = f"{ARTIFACT_PREFIX}/model.pt" -model_blob = bucket.blob(MODEL_PATH) -model_bytes = model_blob.download_as_bytes() - -# load from bytes -model_pt = io.BytesIO(model_bytes) - -# rebuild model and load weights -state = torch.load(model_pt, map_location="cpu", weights_only=True) -m = TitanicNet() -m.load_state_dict(state) -m.eval(); -``` - -Evaluate using the same pattern from the CPU evaluation section above — load validation data from GCS, run predictions, and check accuracy. The results should match the CPU job since we set random seeds. - -```python -# Read validation data from GCS (reuses val data from local eval above) -VAL_PATH = "data/val_data.npz" -val_blob = bucket.blob(VAL_PATH) -val_bytes = val_blob.download_as_bytes() -d = np.load(io.BytesIO(val_bytes)) -X_val, y_val = d["X_val"], d["y_val"] -X_val_t = torch.tensor(X_val, dtype=torch.float32) # features → float for network layers -y_val_t = torch.tensor(y_val, dtype=torch.long) # labels → int64 for loss function - -with torch.no_grad(): - probs = m(X_val_t).squeeze(1) - preds_t = (probs >= 0.5).long() - correct = (preds_t == y_val_t).sum().item() - acc = correct / y_val_t.shape[0] - -print(f"Vertex model val accuracy: {acc:.4f}") -``` - -## GPU-Accelerated Training on Vertex AI - -Our CPU job above worked fine for this small dataset. 
In practice, you'd switch to a GPU when training takes too long on CPU — typically with larger models (millions of parameters) or larger datasets (hundreds of thousands of rows). For the Titanic dataset, the GPU will likely be *slower* end-to-end due to provisioning overhead, but we'll run it here to learn the workflow. - -The changes from CPU to GPU are minimal — this is one of the advantages of Vertex AI's container-based approach: - -- The container image switches to the GPU-enabled version (`pytorch-gpu.2-4.py310:latest`), which includes CUDA and cuDNN. -- The machine type (`n1-standard-8`) defines CPU and memory resources, while we add a GPU accelerator (`NVIDIA_TESLA_T4`, `NVIDIA_L4`, etc.). **For guidance on selecting a machine type and accelerator, visit the [Compute for ML](https://qualiamachine.github.io/Intro_GCP_for_ML/compute-for-ML.html) resource.** -- The training script, arguments, and artifact handling all stay the same. - -::::::::::::::::::::::::::::::::::::: callout -### GPU quota unavailable? -If your job fails with a quota error, don't worry — re-run using the CPU configuration from the previous section. The results will be identical, just slower. GPU quota requests can take 1–3 business days to process. -::::::::::::::::::::::::::::::::::::: - -```python -from google.cloud import aiplatform - -RUN_ID = dt.datetime.now().strftime("%Y%m%d-%H%M%S") - -# GCS folder where ALL artifacts (model.pt, metrics.json, eval_history.csv, training.log) will be saved. -# Your train_nn.py writes to AIP_MODEL_DIR, and base_output_dir (below) sets that variable for the job. -ARTIFACT_DIR = f"gs://{BUCKET_NAME}/artifacts/pytorch/{RUN_ID}" - -# ---- Container image ---- -# Use a prebuilt TRAINING image that has PyTorch + CUDA. This enables GPU at runtime. -IMAGE = "us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.2-4.py310:latest" - -# ---- Machine vs Accelerator (important!) ---- -# machine_type = the VM's CPU/RAM shape. It is NOT a GPU by itself. 
-# We often pick n1-standard-8 as a balanced baseline for single-GPU jobs. -MACHINE = "n1-standard-8" - -# To actually get a GPU, you *attach* one via accelerator_type + accelerator_count. -# Common choices: -# "NVIDIA_TESLA_T4" (cost-effective, widely available) -# "NVIDIA_L4" (newer, CUDA 12.x, good perf/$) -# "NVIDIA_TESLA_V100" / "NVIDIA_A100_40GB" (high-end, pricey) -ACCELERATOR_TYPE = "NVIDIA_TESLA_T4" -ACCELERATOR_COUNT = 1 # Increase (2,4) only if your code supports multi-GPU (e.g., DDP) - -# Alternative (GPU-bundled) machines: -# If you pick an A2 type like "a2-highgpu-1g", it already includes 1 A100 GPU. -# In that case, you can omit accelerator_type/accelerator_count entirely. -# Example: -# MACHINE = "a2-highgpu-1g" -# (and then remove the accelerator_* kwargs in job.run) - -print( - "RUN_ID =", RUN_ID, - "\nARTIFACT_DIR =", ARTIFACT_DIR, - "\nIMAGE =", IMAGE, - "\nMACHINE =", MACHINE, - "\nACCELERATOR_TYPE =", ACCELERATOR_TYPE, - "\nACCELERATOR_COUNT =", ACCELERATOR_COUNT, -) - -DISPLAY_NAME = f"{LAST_NAME}_pytorch_nn_{RUN_ID}" - -job = aiplatform.CustomTrainingJob( - display_name=DISPLAY_NAME, - script_path="Intro_GCP_for_ML/scripts/train_nn.py", # Your PyTorch trainer - container_uri=IMAGE, # Must be a *training* image (not prediction) -) - -job.run( - args=[ - f"--train=gs://{BUCKET_NAME}/data/train_data.npz", - f"--val=gs://{BUCKET_NAME}/data/val_data.npz", - f"--epochs={MAX_EPOCHS}", - f"--learning_rate={LR}", - f"--patience={PATIENCE}", - ], - replica_count=1, # One worker (simple, cheaper) - machine_type=MACHINE, # CPU/RAM shape of the VM (no GPU implied) - accelerator_type=ACCELERATOR_TYPE, # Attaches the selected GPU model - accelerator_count=ACCELERATOR_COUNT, # Number of GPUs to attach - base_output_dir=ARTIFACT_DIR, # Sets AIP_MODEL_DIR used by your script for all artifacts - sync=True, # Waits for job to finish so you can inspect outputs immediately -) - -print("Artifacts folder:", ARTIFACT_DIR) -``` - -Just as we did for the CPU job, 
let's evaluate the GPU-trained model to confirm it produces the same accuracy. We load the model weights directly from GCS into memory. - -```python -import sys, torch, numpy as np -sys.path.append("/home/jupyter/Intro_GCP_for_ML/scripts") -from train_nn import TitanicNet - -# ----------------- -# download model.pt straight into memory and load weights -# ----------------- - -ARTIFACT_PREFIX = f"artifacts/pytorch/{RUN_ID}/model" - -MODEL_PATH = f"{ARTIFACT_PREFIX}/model.pt" -model_blob = bucket.blob(MODEL_PATH) -model_bytes = model_blob.download_as_bytes() - -# load from bytes -model_pt = io.BytesIO(model_bytes) - -# rebuild model and load weights -state = torch.load(model_pt, map_location="cpu", weights_only=True) -m = TitanicNet() -m.load_state_dict(state) -m.eval(); -``` - -Evaluate the GPU model using the same pattern — results should match because we set random seeds in `train_nn.py`. - -```python -with torch.no_grad(): - probs = m(X_val_t).squeeze(1) - preds_t = (probs >= 0.5).long() - correct = (preds_t == y_val_t).sum().item() - acc = correct / y_val_t.shape[0] - -print(f"GPU model val accuracy: {acc:.4f}") -``` - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Cloud workflow review - -Now that you've run both a CPU and GPU training job, answer the following: - -1. **Artifact location**: Where did Vertex AI write your model artifacts? How does `base_output_dir` in `job.run()` relate to the `AIP_MODEL_DIR` environment variable inside the container? -2. **CPU vs. GPU job time**: Compare the wall-clock times of your CPU and GPU jobs (visible in the Console under **Vertex AI > Training > Custom Jobs**). Which was faster? Why might the GPU job be *slower* for this dataset? -3. **Container choice**: We used `pytorch-xla.2-4.py310` for the CPU job and `pytorch-gpu.2-4.py310` for the GPU job. What would happen if you used the CPU container but still passed `accelerator_type` and `accelerator_count`? -4. 
**Cost awareness**: You used `n1-standard-4` for CPU and `n1-standard-8` + T4 for GPU. Using the [Compute for ML](https://qualiamachine.github.io/Intro_GCP_for_ML/compute-for-ML.html) resource, estimate the relative hourly cost difference between these configurations. - -:::::::::::::::::::::::::::::::::::::::: solution - -### Solution - -1. `base_output_dir` tells the Vertex AI SDK to set the `AIP_MODEL_DIR` environment variable inside the training container. Your script reads `os.environ.get("AIP_MODEL_DIR", ".")` and writes all artifacts there. The result is everything lands under `gs:///artifacts/pytorch//model/`. -2. For the small Titanic dataset (~700 training rows), the CPU job is typically faster end-to-end. GPU jobs incur extra overhead: provisioning the accelerator, loading CUDA libraries, and transferring data to the GPU. GPU acceleration pays off when training itself is the bottleneck (larger models, larger batches). -3. The job would either fail or ignore the GPU. The CPU container doesn't include CUDA/cuDNN, so even if a GPU is attached to the VM, PyTorch can't use it. Always match your container image to your hardware configuration. -4. Approximate on-demand rates (us-central1): `n1-standard-4` is ~ `$0.19`/hr; `n1-standard-8` + 1x T4 is ~ `$0.54`/hr (VM) + ~ `$0.35`/hr (T4) = ~ `$0.89`/hr total. The GPU configuration is roughly 4–5x more expensive per hour — worth it only when training speedup exceeds that cost ratio. - -::::::::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::::::: - -### GPU and scaling considerations - -- On small problems, GPU startup/transfer overhead can erase speedups — benchmark before you scale. -- Stick to a single GPU unless your workload genuinely saturates it. Multi-GPU (data parallelism / DDP) and model parallelism exist for large-scale training but add significant complexity and cost — well beyond this workshop's scope. 
- -## Clean up staging files - -As in Episode 4, each `job.run()` call leaves a tarball under `.vertex_staging/`. Delete them to keep your bucket tidy: - -```python -!gsutil -m rm -r gs://{BUCKET_NAME}/.vertex_staging/ -``` - -## Additional resources -To learn more about PyTorch and Vertex AI integrations, visit the docs: [docs.cloud.google.com/vertex-ai/docs/start/pytorch](https://docs.cloud.google.com/vertex-ai/docs/start/pytorch) - -::::::::::::::::::::::::::::::::::::: keypoints - -- Use **CustomTrainingJob** with a prebuilt PyTorch container; your script reads `AIP_MODEL_DIR` (set automatically by `base_output_dir`) to know where to write artifacts. -- Keep artifacts **together** (model, metrics, history, log) in one GCS folder for reproducibility. -- `.npz` is a compact, cloud-friendly format — one GCS read per file, preserves exact dtypes. -- Start on CPU for small datasets; add a GPU only when training time justifies the extra provisioning overhead and cost. -- `staging_bucket` is just for the SDK's packaging tarball — `base_output_dir` is where your script's actual artifacts go. - -:::::::::::::::::::::::::::::::::::::::::::::::: - diff --git a/episodes/05-Training-models-on-CHTC-GPUs.md b/episodes/05-Training-models-on-CHTC-GPUs.md new file mode 100644 index 00000000..e1559eeb --- /dev/null +++ b/episodes/05-Training-models-on-CHTC-GPUs.md @@ -0,0 +1,380 @@ +--- +title: "Training Models on CHTC GPUs" +teaching: 20 +exercises: 10 +--- + +:::::::::::::::::::::::::::::::::::::: questions + +- How do you request and use GPUs for PyTorch training on CHTC? +- What GPU hardware is available in the CHTC GPU Lab, and how do you select the right one? +- When is GPU training worth the overhead compared to CPU-only training? + +:::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: objectives + +- Prepare the Titanic dataset and save train/val arrays to compressed `.npz` files on the submit node. 
+- Write an HTCondor submit file that requests GPU resources and uses a Docker container with PyTorch and CUDA. +- Submit a GPU training job to the CHTC GPU Lab and monitor its progress. +- Compare CPU vs. GPU training times and understand when GPU acceleration is beneficial. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +## Overview + +In the previous episode, we trained an XGBoost model using HTCondor on CHTC. Here, we'll do the same for a PyTorch neural network -- but this time we'll use GPUs. The CHTC GPU Lab provides shared access to high-end NVIDIA GPUs for researchers at UW-Madison. We'll learn how to request GPU resources in an HTCondor submit file, use a GPU-enabled Docker container, and understand when GPUs actually help. + +## The CHTC GPU Lab + +CHTC maintains a dedicated GPU Lab with several types of NVIDIA GPUs available to researchers. The current inventory includes: + +| GPU Model | VRAM | Typical Use Cases | +|-----------|------|-------------------| +| NVIDIA T4 | 16 GB | Inference, small-to-medium training | +| NVIDIA L40 | 48 GB | Medium training, graphics workloads | +| NVIDIA A100 (40 GB) | 40 GB | Large-scale training, mixed precision | +| NVIDIA A100 (80 GB) | 80 GB | Large models, large batch sizes | +| NVIDIA H100 | 80 GB | Cutting-edge training, transformer models | +| NVIDIA H200 | 141 GB | Very large models, high-memory workloads | + +::::::::::::::::::::::::::::::::::::: callout + +### GPU Lab access + +To use the GPU Lab, your HTCondor submit file must include `+WantGPULab = true`. Without this flag, your job will not be matched to GPU Lab hardware. All CHTC users with an active account can access the GPU Lab -- no special quota request is needed (unlike some cloud providers). + +::::::::::::::::::::::::::::::::::::: + +## Prepare data as `.npz` + +Unlike the XGBoost script from Episode 4 (which handles preprocessing internally from raw CSV), our PyTorch script expects pre-processed NumPy arrays. 
We prepare those on the submit node and save them as `.npz` files. + +Why `.npz`? NumPy's `.npz` files are compressed binary containers that can store multiple arrays (e.g., features and labels) together in a single file: + +- **Compact and fast:** smaller than CSV, and one file can hold multiple arrays (`X_train`, `y_train`). +- **Transfer-friendly:** each `.npz` is a single file -- one transfer operation instead of streaming many small files. +- **Reproducible:** unlike CSV, `.npz` preserves exact dtypes and shapes across environments. + +Run the following Python script on the submit node to create the data files. Save this as `prepare_data.py`: + +```python +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler, LabelEncoder + +# Load Titanic CSV +df = pd.read_csv("titanic_train.csv") + +# Minimal preprocessing to numeric arrays +sex_enc = LabelEncoder().fit(df["Sex"]) +df["Sex"] = sex_enc.transform(df["Sex"]) +df["Embarked"] = df["Embarked"].fillna("S") +emb_enc = LabelEncoder().fit(df["Embarked"]) +df["Embarked"] = emb_enc.transform(df["Embarked"]) +df["Age"] = df["Age"].fillna(df["Age"].median()) +df["Fare"] = df["Fare"].fillna(df["Fare"].median()) + +X = df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]].values +y = df["Survived"].values + +scaler = StandardScaler() +X = scaler.fit_transform(X) + +X_train, X_val, y_train, y_val = train_test_split( + X, y, test_size=0.2, random_state=42) + +np.savez("train_data.npz", X_train=X_train, y_train=y_train) +np.savez("val_data.npz", X_val=X_val, y_val=y_val) + +print(f"Created train_data.npz ({X_train.shape[0]} samples) " + f"and val_data.npz ({X_val.shape[0]} samples)") +``` + +Run this on the submit node: + +```bash +python3 prepare_data.py +``` + +You should now have `train_data.npz` and `val_data.npz` in your working directory. 
+ +## The training script: `train_nn.py` + +Find this file in our repo: `Intro_GCP_for_ML/scripts/train_nn.py`. It does three things: + +1. Loads `.npz` files from local paths. +2. Trains a small neural network (a 3-layer MLP) with early stopping. +3. Writes all outputs side-by-side: `model.pt`, `metrics.json`, `eval_history.csv`, and `training.log`. + +::::::::::::::::::::::::::::::::::::: callout + +### What's inside `train_nn.py`? (Quick reference) + +You don't need to understand every line of the PyTorch code for this workshop -- the focus is on how to package and run *any* training script on CHTC GPUs. Here is a quick orientation: + +- **Model**: A small feedforward network (`TitanicNet`) -- the architecture details are not important for this lesson. +- **Early stopping**: Training halts when validation loss stops improving (controlled by `--patience`). This saves compute time. +- **Device detection**: The script automatically detects whether a GPU is available (`torch.cuda.is_available()`) and moves the model and data accordingly. The same script works on both CPU and GPU without modification. + +::::::::::::::::::::::::::::::::::::: + +## The wrapper script: `run_training.sh` + +When using HTCondor's Docker universe, the executable must be a shell script that runs inside the container. Create a file called `run_training.sh`: + +```bash +#!/bin/bash + +python3 train_nn.py \ + --train train_data.npz \ + --val val_data.npz \ + --epochs 500 \ + --learning_rate 0.001 \ + --patience 50 +``` + +Make it executable: + +```bash +chmod +x run_training.sh +``` + +::::::::::::::::::::::::::::::::::::: callout + +### Why a wrapper script? + +In HTCondor's Docker universe, the `executable` field specifies a script that runs inside the container. You cannot directly set `executable = python3` with arguments in the same way you might on the command line. The wrapper script provides a clean way to call Python with all the necessary arguments. 
This pattern is standard practice for CHTC Docker jobs. + +::::::::::::::::::::::::::::::::::::: + +## The GPU submit file + +Here is the HTCondor submit file for running our PyTorch training on a GPU. Save this as `gpu_train.sub`: + +``` +universe = docker +docker_image = pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime + +executable = run_training.sh + +transfer_input_files = train_nn.py, train_data.npz, val_data.npz + +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +log = gpu_job_$(Cluster).log +output = gpu_job_$(Cluster).out +error = gpu_job_$(Cluster).err + +request_cpus = 1 +request_gpus = 1 +require_gpus = (GlobalMemoryMb >= 10000) +request_memory = 8GB +request_disk = 4GB + ++WantGPULab = true ++GPUJobLength = "short" + +queue 1 +``` + +Let's break down the key GPU-specific lines: + +### `universe = docker` and `docker_image` + +We use a prebuilt PyTorch Docker image that includes CUDA and cuDNN. This means we don't need to install any GPU drivers or libraries ourselves -- the container has everything PyTorch needs to use the GPU. + +### `request_gpus = 1` + +This tells HTCondor to match your job to a machine that has at least one available GPU and to allocate that GPU to your job. + +### `require_gpus = (GlobalMemoryMb >= 10000)` + +This is a constraint expression that filters GPU hardware. Here we require at least 10 GB of GPU memory, which excludes older/smaller GPUs. You can also constrain on specific GPU properties: + +- `(GlobalMemoryMb >= 40000)` -- require 40+ GB (matches A100 40GB and above) +- `(GlobalMemoryMb >= 80000)` -- require 80+ GB (matches A100 80GB, H100, H200) + +### `+WantGPULab = true` + +This flag tells HTCondor to route your job to the CHTC GPU Lab pool. Without it, your job will not be matched to GPU Lab machines. + +### `+GPUJobLength` + +CHTC GPU jobs have runtime limits to ensure fair sharing. 
You must declare your expected job length: + +| Value | Maximum Runtime | +|-------|----------------| +| `"short"` | 12 hours | +| `"medium"` | 24 hours | +| `"long"` | 7 days | + +Choose the shortest category that fits your job. Shorter jobs are scheduled faster because they can fill smaller gaps in the schedule. If your job exceeds the declared time limit, it will be held or evicted. + +## Submit and monitor the GPU job + +Submit the job: + +```bash +condor_submit gpu_train.sub +``` + +Monitor with standard HTCondor commands: + +```bash +# Check job status +condor_q + +# Show one line per job instead of the batched summary (use condor_watch_q for a live view) +condor_q -nobatch + +# View detailed job information +condor_q -l + +# Check which GPU was assigned (after job starts running) +condor_q -af GPUs_DeviceName GPUs_GlobalMemoryMb +``` + +::::::::::::::::::::::::::::::::::::: callout + +### GPU job queue times + +GPU jobs may wait longer in the queue than CPU-only jobs because GPU hardware is a shared, limited resource. Jobs requesting `"short"` runtimes generally start sooner because they can backfill into smaller scheduling gaps. If your job is idle for a long time, check that your `require_gpus` constraint is not too restrictive. + +::::::::::::::::::::::::::::::::::::: + +Once the job completes, check the output: + +```bash +# View training output +cat gpu_job_*.out + +# Check for errors +cat gpu_job_*.err + +# List output files +ls -la model.pt metrics.json eval_history.csv training.log +``` + +## Comparing CPU vs. GPU training + +To compare, you can also submit a CPU-only version of the job.
Create `cpu_train.sub`: + +``` +universe = docker +docker_image = pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime + +executable = run_training.sh + +transfer_input_files = train_nn.py, train_data.npz, val_data.npz + +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +log = cpu_job_$(Cluster).log +output = cpu_job_$(Cluster).out +error = cpu_job_$(Cluster).err + +request_cpus = 1 +request_memory = 8GB +request_disk = 4GB + +queue 1 +``` + +Notice that the CPU version simply omits `request_gpus`, `require_gpus`, `+WantGPULab`, and `+GPUJobLength`. The same Docker image works for both -- PyTorch will automatically fall back to CPU when no GPU is available. + +Submit and compare: + +```bash +condor_submit cpu_train.sub +``` + +For the Titanic dataset (roughly 700 training samples, 7 features, 3-layer MLP), you will likely find that: + +- **CPU training time**: a few seconds +- **GPU training time**: similar or slightly longer + +The GPU overhead (CUDA initialization, data transfer to GPU memory) can actually make small jobs *slower* on a GPU. This is expected and normal. + +:::::::::::::::::::::::::::::::::::::::: challenge + +### When is a GPU worth it? + +Consider the following scenarios. For each one, decide whether you would request a GPU or stick with CPU-only training on CHTC: + +1. Training a 3-layer MLP on the Titanic dataset (891 rows, 7 features). +2. Fine-tuning a ResNet-50 model on 50,000 images (224x224 pixels). +3. Training a transformer language model with 125 million parameters on 10 GB of text. +4. Running 200 independent hyperparameter trials of a small random forest on tabular data. + +:::::::::::::::::::::::::::::::::::::::: solution + +### Solution + +1. **CPU.** The dataset and model are tiny. GPU overhead would likely make it slower, and you would wait longer in the queue for a GPU slot. +2. **GPU.** Image models like ResNet involve large matrix operations (convolutions) on high-dimensional data. 
A GPU will be significantly faster -- potentially 10-50x. +3. **GPU (possibly multiple).** Transformer training is extremely compute-intensive. Even a single A100 might take days; without a GPU this would be impractical. +4. **CPU.** Random forests are not GPU-accelerated in standard scikit-learn. The 200 trials are independent, so submit them as 200 separate CPU jobs and let HTCondor parallelize across the cluster. + +::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::::: + +## GPU training workflow summary + +Here is the complete workflow for GPU training on CHTC: + +1. **Prepare data** on the submit node (create `.npz` files). +2. **Write your training script** (`train_nn.py`) to detect GPU automatically with `torch.cuda.is_available()`. +3. **Write a wrapper script** (`run_training.sh`) that calls Python inside the container. +4. **Write a submit file** (`gpu_train.sub`) with `request_gpus`, `require_gpus`, `+WantGPULab = true`, and `+GPUJobLength`. +5. **Submit and monitor** with `condor_submit` and `condor_q`. +6. **Collect results** from the transferred output files. + +:::::::::::::::::::::::::::::::::::::::: challenge + +### Modify the submit file + +Starting from the GPU submit file above, make the following changes: + +1. Request a GPU with at least 40 GB of memory. +2. Set the job length to "medium" (24-hour limit). +3. Request 16 GB of system memory instead of 8 GB. + +:::::::::::::::::::::::::::::::::::::::: solution + +### Solution + +The three lines that change: + +``` +require_gpus = (GlobalMemoryMb >= 40000) +request_memory = 16GB ++GPUJobLength = "medium" +``` + +This would match A100 (40 GB or 80 GB), H100, or H200 GPUs, allow up to 24 hours of runtime, and provide more system RAM for data loading and preprocessing. 
+ +::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::::: + +## Additional resources + +- [CHTC GPU Lab guide](https://chtc.cs.wisc.edu/uw-research-computing/gpu-lab) +- [HTCondor documentation on GPUs](https://htcondor.readthedocs.io/en/latest/) +- [PyTorch CUDA documentation](https://pytorch.org/docs/stable/cuda.html) +- [Docker Hub: PyTorch images](https://hub.docker.com/r/pytorch/pytorch/tags) + +::::::::::::::::::::::::::::::::::::: keypoints + +- Use `request_gpus`, `require_gpus`, `+WantGPULab = true`, and `+GPUJobLength` in your HTCondor submit file to request GPU resources from the CHTC GPU Lab. +- GPU-enabled Docker containers (e.g., `pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime`) provide CUDA and cuDNN so your code can use the GPU without manual driver installation. +- Prepare data as `.npz` files on the submit node and transfer them with the job -- this is compact, fast, and reproducible. +- GPU acceleration pays off for large models and large datasets; for small problems like Titanic, CPU is often faster due to GPU initialization overhead. +- Declare the shortest `+GPUJobLength` that fits your job (`"short"` = 12 hr, `"medium"` = 24 hr, `"long"` = 7 days) -- shorter jobs are scheduled sooner. + +:::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/06-Hyperparameter-tuning.md b/episodes/06-Hyperparameter-tuning.md index 20eed062..a15ed85f 100644 --- a/episodes/06-Hyperparameter-tuning.md +++ b/episodes/06-Hyperparameter-tuning.md @@ -1,422 +1,498 @@ --- -title: "Hyperparameter Tuning in Vertex AI: Neural Network Example" +title: "Hyperparameter Tuning on CHTC" teaching: 40 exercises: 10 --- -:::::::::::::::::::::::::::::::::::::: questions +:::::::::::::::::::::::::::::::::::::: questions -- How can we efficiently manage hyperparameter tuning in Vertex AI? -- How can we parallelize tuning jobs to optimize time without increasing costs? 
+- How can I use HTCondor to run many training jobs with different hyperparameters in parallel? +- What are the different ways to parameterize and submit multiple jobs in a single submit file? +- How do I collect and compare results from a hyperparameter sweep? :::::::::::::::::::::::::::::::::::::::::::::::: ::::::::::::::::::::::::::::::::::::: objectives -- Set up and run a hyperparameter tuning job in Vertex AI. -- Define search spaces using `DoubleParameterSpec` and `IntegerParameterSpec`. -- Log and capture objective metrics for evaluating tuning success. -- Optimize tuning setup to balance cost and efficiency, including parallelization. +- Write an HTCondor submit file that parameterizes hyperparameters using `$(variable)` syntax. +- Use HTCondor's `queue` command with inline lists, external files, and variable substitution to launch parallel sweeps. +- Collect `metrics.json` files from multiple jobs and identify the best trial. +- Understand the trade-offs between grid/random search on CHTC and managed Bayesian optimization services. :::::::::::::::::::::::::::::::::::::::::::::::: -In the previous episode (Episode 5) you submitted a single PyTorch training job to Vertex AI and inspected its artifacts. That gave you one model trained with one set of hyperparameters. In practice, choices like learning rate, early-stopping patience, and regularization thresholds can dramatically affect model quality — and the best combination is rarely obvious up front. +In the previous episode (Episode 5) you submitted a single PyTorch training job to CHTC and inspected its artifacts. That gave you one model trained with one set of hyperparameters. In practice, choices like learning rate, early-stopping patience, and regularization thresholds can dramatically affect model quality — and the best combination is rarely obvious up front. -In this episode we'll use Vertex AI's **Hyperparameter Tuning Jobs** to systematically search for better settings. 
The key is defining a clear search space, ensuring metrics are properly logged, and keeping costs manageable by controlling the number of trials and level of parallelization. +In this episode we will use HTCondor's **`queue` command** to systematically search for better settings by launching many training jobs in parallel, each with a different combination of hyperparameters. The `train_nn.py` script from Episode 5 already saves a `metrics.json` file with final validation accuracy and loss — we just need to run it many times and compare the results. -### Key steps for hyperparameter tuning +### Why CHTC is great for hyperparameter tuning -The overall process involves these steps: +Hyperparameter tuning is an *embarrassingly parallel* problem: each trial is completely independent, so you can run them all at the same time. This is exactly the kind of workload CHTC is built for. Key advantages: -1. Prepare the training script and ensure metrics are logged. -2. Define the hyperparameter search space. -3. Configure a hyperparameter tuning job in Vertex AI. -4. Set data paths and launch the tuning job. -5. Monitor progress in the Vertex AI Console. -6. Extract the best model and inspect recorded metrics. +- **Massive parallelism** — CHTC can run hundreds of independent jobs simultaneously across its shared pool. A sweep that would take hours sequentially can finish in the time of a single trial. +- **No cost** — all of these jobs are free for UW-Madison researchers. There are no credits to burn, no billing surprises, and no reason to limit your search space to save money. +- **Simple to set up** — HTCondor's `queue` syntax makes it straightforward to parameterize jobs without writing custom orchestration code. -## Initial setup +Unlike managed services that use Bayesian optimization to choose the next trial based on previous results, CHTC sweeps are essentially **grid search** or **random search** — every combination is decided up front and launched independently. 
This sounds less sophisticated, but CHTC's massive parallelism more than compensates: you can afford to explore a much larger space when each trial is free and runs in parallel. -#### 1. Open pre-filled notebook -Navigate to `/Intro_GCP_for_ML/notebooks/06-Hyperparameter-tuning.ipynb` to begin this notebook. **Select the *PyTorch* environment (kernel).** Local PyTorch is only needed for local tests — your *Vertex AI job* uses the container specified by `container_uri` (e.g., `pytorch-xla.2-4.py310`), so it brings its own framework at run time. +### Key steps for hyperparameter tuning on CHTC -#### 2. CD to instance home directory -Change to your Jupyter home folder to keep paths consistent. +1. Write a submit file that uses `$(variable)` placeholders for hyperparameters. +2. Define the combinations to try (inline, in a file, or programmatically). +3. Submit all trials with a single `condor_submit` command. +4. Collect `metrics.json` from each job's output and find the best trial. + +## Writing a parameterized submit file + +HTCondor submit files support **variable substitution** using the `$(variable)` syntax. When you combine this with the `queue ... from` syntax, HTCondor creates one job per line of input, substituting the variables into every field of the submit file. + +Here is a complete submit file for a hyperparameter sweep: -```python -%cd /home/jupyter/ ``` +# File: tune_nn.sub +universe = docker +docker_image = pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime -## Prepare and configure the tuning job +executable = run_training.sh +arguments = --train train_data.npz --val val_data.npz --learning_rate $(lr) --patience $(pat) --epochs 500 -#### 3. Understand how the training script reports metrics -Your training script (`train_nn.py`) **already includes** hyperparameter tuning metric reporting — you don't need to modify it. 
Here's how it works: +transfer_input_files = train_nn.py, run_training.sh, train_data.npz, val_data.npz +transfer_output_remaps = "model.pt = results/model_$(Cluster)_$(Process).pt; metrics.json = results/metrics_$(Cluster)_$(Process).json" -The script uses the `cloudml-hypertune` library (pre-installed on Vertex AI training workers) to report metrics so the tuner can compare trials. A `try/except` block lets the same script run locally without crashing: +should_transfer_files = YES +when_to_transfer_output = ON_EXIT -```python -# Already in train_nn.py — initialization near the top: -try: - from hypertune import HyperTune - _hpt = HyperTune() - _hpt_enabled = True -except Exception: - _hpt = None - _hpt_enabled = False +log = logs/tune_$(Cluster)_$(Process).log +output = logs/tune_$(Cluster)_$(Process).out +error = logs/tune_$(Cluster)_$(Process).err + +request_cpus = 1 +request_memory = 4GB +request_disk = 2GB + +queue lr,pat from params.txt ``` -Inside the training loop, after computing validation metrics each epoch: +Let's break down the key parts: -```python -# Already in train_nn.py — inside the epoch loop: -if _hpt_enabled: - _hpt.report_hyperparameter_tuning_metric( - hyperparameter_metric_tag="validation_accuracy", - metric_value=val_acc, - global_step=ep, - ) +- **`$(lr)` and `$(pat)`** — these are variable placeholders. HTCondor replaces them with values from `params.txt` for each job. +- **`$(Cluster)` and `$(Process)`** — built-in HTCondor variables. `$(Cluster)` is the job cluster ID (shared by all jobs from one submission), and `$(Process)` is the index within that cluster (0, 1, 2, ...). Together they create unique file names for each trial's outputs. +- **`transfer_output_remaps`** — this is crucial for collecting results. Without it, every job would write to `model.pt` and `metrics.json` in the same directory, overwriting each other. 
The remap renames each job's outputs to include the cluster and process IDs, placing them in a `results/` directory. +- **`queue lr,pat from params.txt`** — reads variable values from an external file (one combination per line). + +::::::::::::::::::::::::::::::::::::::: callout + +### Create output directories before submitting + +HTCondor will not create directories for you. Before running `condor_submit`, make sure the `results/` and `logs/` directories exist: + +```bash +mkdir -p results logs ``` -The critical detail: the `hyperparameter_metric_tag` string **must exactly match** the key you use in `metric_spec` when configuring the tuning job (e.g., `"validation_accuracy"`). If they don't match, trials will show as **INFEASIBLE**. +If these directories do not exist, your jobs will fail at output transfer time. -#### 4. Define hyperparameter search space -This step defines which parameters Vertex AI will vary across trials and their allowed ranges. The number of total settings tested is determined later using `max_trial_count`. +::::::::::::::::::::::::::::::::::::::::::::::: -Vertex AI uses **Bayesian optimization** by default (internally listed as `"ALGORITHM_UNSPECIFIED"` in the API). That means if you don’t explicitly specify a search algorithm, Vertex AI automatically applies an adaptive Bayesian strategy to balance exploration (trying new areas of the parameter space) and exploitation (focusing near the best results so far). Each completed trial helps the tuner model how your objective metric (for example, `validation_accuracy`) changes across parameter values. Subsequent trials then sample new parameter combinations that are statistically more likely to improve performance, which usually yields better results than random or grid search—especially when `max_trial_count` is limited. +## Three approaches to defining hyperparameter combinations -Vertex AI supports four parameter spec types. 
This episode uses the first two: +HTCondor's `queue` command is flexible. Here are three ways to specify which combinations to try, from simplest to most powerful. -| Spec type | Use case | Example | -|---|---|---| -| `DoubleParameterSpec` | Continuous floats | Learning rate 1e-4 to 1e-2 | -| `IntegerParameterSpec` | Whole numbers | Patience 5 to 20 | -| `DiscreteParameterSpec` | Specific numeric values | Batch size [32, 64, 128] | -| `CategoricalParameterSpec` | Named options (strings) | Optimizer ["adam", "sgd"] | +### Approach 1: Queue with inline variable lists -Include early-stopping parameters so the tuner can learn good stopping behavior for your dataset: +For a small number of combinations, you can list them directly in the submit file: -```python -from google.cloud import aiplatform -from google.cloud.aiplatform import hyperparameter_tuning as hpt - -parameter_spec = { - "learning_rate": hpt.DoubleParameterSpec(min=1e-4, max=1e-2, scale="log"), - "patience": hpt.IntegerParameterSpec(min=5, max=20, scale="linear"), - "min_delta": hpt.DoubleParameterSpec(min=1e-6, max=1e-3, scale="log"), -} +``` +queue lr,pat from ( + 0.001, 10 + 0.01, 5 + 0.0001, 20 + 0.005, 15 + 0.001, 5 + 0.0005, 10 +) ``` -#### 5. Initialize Vertex AI, project, and bucket -Initialize the Vertex AI SDK and set your staging and artifact locations in GCS. +This submits 6 jobs, one for each line inside the parentheses. Each line provides a value for `lr` and `pat`, separated by a comma. -```python -from google.cloud import aiplatform, storage -import datetime as dt - -client = storage.Client() -PROJECT_ID = client.project -REGION = "us-central1" -LAST_NAME = "DOE" # change to your name or unique ID -BUCKET_NAME = "doe-titanic" # replace with your bucket name - -aiplatform.init( - project=PROJECT_ID, - location=REGION, - staging_bucket=f"gs://{BUCKET_NAME}/.vertex_staging", -) +**When to use this:** quick experiments with a handful of combinations where you want everything in one file. 
+ +### Approach 2: Queue from a file + +For larger sweeps, store the parameter combinations in a separate file: + +``` +# File: params.txt +0.001, 10 +0.01, 5 +0.0001, 20 +0.005, 15 +0.001, 5 +0.0005, 10 +0.01, 10 +0.0001, 15 +0.005, 20 +``` + +Then reference it in the submit file: + +``` +queue lr,pat from params.txt ``` -#### 6. Define runtime configuration -Create a unique run ID and set the container, machine type, and base output directory for artifacts. Each variable controls a different aspect of the training environment: +This is cleaner for larger sweeps and lets you generate `params.txt` programmatically (e.g., with a Python script that creates a grid or random sample). -- **`RUN_ID`** — a timestamp that uniquely identifies this tuning session, used to organize artifacts in GCS. -- **`ARTIFACT_DIR`** — the GCS folder where all trial outputs (models, metrics, logs) will be written. -- **`IMAGE`** — the prebuilt Docker container that includes PyTorch and its dependencies. -- **`MACHINE`** — the VM shape (CPU/RAM) for each trial. Start small for testing. -- **`ACCELERATOR_TYPE` / `ACCELERATOR_COUNT`** — set to unspecified/0 for CPU-only runs. As we saw in Episode 5, GPU overhead isn't worth it for a dataset this small, and HP tuning launches *multiple* trials, so unnecessary GPUs multiply cost quickly. Change these to attach a GPU when your model or data genuinely benefits from one. +**When to use this:** any sweep with more than a few combinations, or when you want to generate combinations with a script. 
+ +### Generating params.txt programmatically + +You can use a simple Python script to generate a grid of hyperparameter combinations: ```python -RUN_ID = dt.datetime.now().strftime("%Y%m%d-%H%M%S") -ARTIFACT_DIR = f"gs://{BUCKET_NAME}/artifacts/pytorch_hpt/{RUN_ID}" +# File: make_params.py +import itertools + +learning_rates = [0.0001, 0.0005, 0.001, 0.005, 0.01] +patience_values = [5, 10, 15, 20] -IMAGE = "us-docker.pkg.dev/vertex-ai/training/pytorch-xla.2-4.py310:latest" # XLA container includes cloudml-hypertune -MACHINE = "n1-standard-4" -ACCELERATOR_TYPE = "ACCELERATOR_TYPE_UNSPECIFIED" -ACCELERATOR_COUNT = 0 +with open("params.txt", "w") as f: + for lr, pat in itertools.product(learning_rates, patience_values): + f.write(f"{lr}, {pat}\n") + +print(f"Wrote {len(learning_rates) * len(patience_values)} combinations to params.txt") ``` -#### 7. Configure hyperparameter tuning job -When you use Vertex AI Hyperparameter Tuning Jobs, each trial needs a complete, runnable training configuration: the script, its arguments, the container image, and the compute environment. -Rather than defining these pieces inline each time, we create a **CustomJob** to hold that configuration. +Running `python make_params.py` produces a `params.txt` with 20 combinations (5 learning rates x 4 patience values). You could just as easily sample randomly: -The CustomJob acts as the blueprint for running a single training task — specifying exactly what to run and on what resources. The tuner then reuses that job definition across all trials, automatically substituting in new hyperparameter values for each run. 
+```python +# Random search variant +import random + +with open("params.txt", "w") as f: + for _ in range(20): + lr = 10 ** random.uniform(-4, -1) # log-uniform between 0.0001 and 0.1 + pat = random.randint(5, 20) + f.write(f"{lr:.6f}, {pat}\n") +``` -This approach has a few practical advantages: +### Approach 3: DAGMan for complex workflows -- You only define the environment once — machine type, accelerators, and output directories are all reused across trials. -- The tuner can safely inject trial-specific parameters (those declared in `parameter_spec`) while leaving other arguments unchanged. -- It provides a clean separation between *what a single job does* (`CustomJob`) and *how many times to repeat it with new settings* (`HyperparameterTuningJob`). -- It avoids the extra abstraction layers of higher-level wrappers like `CustomTrainingJob`, which automatically package code and environments. Using `CustomJob.from_local_script` keeps the workflow predictable and explicit. +When your hyperparameter sweep is part of a larger pipeline — for example, you want to preprocess data, run the sweep, and then aggregate results automatically — HTCondor's **DAGMan** (Directed Acyclic Graph Manager) can manage the workflow. DAGMan lets you define dependencies between jobs: job B only starts after job A finishes. -In short: -`CustomJob` defines how to run one training run. -`HyperparameterTuningJob` defines how to repeat it with different parameter sets and track results. +A DAG file for a tune-then-aggregate workflow might look like: -The number of total runs is set by `max_trial_count`, and the number of simultaneous runs is controlled by `parallel_trial_count`. Each trial's output and metrics are logged under the GCS `base_output_dir`. +``` +# File: tune_pipeline.dag +JOB SWEEP tune_nn.sub +JOB AGGREGATE aggregate_results.sub -For a first pass, we'll run **3 trials fully in parallel**. 
With only 3 trials the adaptive optimizer has almost nothing to learn from, so running them simultaneously costs no search quality. This still validates that the full pipeline works end-to-end (metrics are reported, artifacts land in GCS, the tuner picks a best trial) while giving you a quick look at how results vary across different parameter combinations. +``` +This ensures all sweep trials complete before the aggregation job runs. We will cover DAGMan in more detail in [Episode 8](08-Advanced-HTCondor-workflows.md). For now, the key insight is that DAGMan gives you the ability to chain hyperparameter sweeps with post-processing steps automatically. -```python -# metric_spec = {"validation_loss": "minimize"} - also stored by train_nn.py -metric_spec = {"validation_accuracy": "maximize"} - -custom_job = aiplatform.CustomJob.from_local_script( - display_name=f"{LAST_NAME}_pytorch_hpt-trial_{RUN_ID}", - script_path="Intro_GCP_for_ML/scripts/train_nn.py", - container_uri=IMAGE, - requirements=["python-json-logger>=2.0.7"], # resolves a dependency conflict in the prebuilt container - args=[ - f"--train=gs://{BUCKET_NAME}/data/train_data.npz", - f"--val=gs://{BUCKET_NAME}/data/val_data.npz", - "--learning_rate=0.001", # HPT will override when sampling - "--patience=10", # HPT will override when sampling - "--min_delta=0.001", # HPT will override when sampling - ], - base_output_dir=ARTIFACT_DIR, - machine_type=MACHINE, - accelerator_type=ACCELERATOR_TYPE, - accelerator_count=ACCELERATOR_COUNT, -) +## Submitting the sweep -DISPLAY_NAME = f"{LAST_NAME}_pytorch_hpt_{RUN_ID}" - -# Start with a small batch of 3 trials, all in parallel. -# With so few trials the adaptive optimizer has nothing to learn from, -# so full parallelism costs no search quality — and finishes faster. 
-tuning_job = aiplatform.HyperparameterTuningJob( - display_name=DISPLAY_NAME, - custom_job=custom_job, # must be a CustomJob (not CustomTrainingJob) - metric_spec=metric_spec, - parameter_spec=parameter_spec, - max_trial_count=3, # small initial sweep - parallel_trial_count=3, # all at once — adaptive search needs more data to help - # search_algorithm="ALGORITHM_UNSPECIFIED", # default = adaptive search (Bayesian) - # search_algorithm="RANDOM_SEARCH", # optional override - # search_algorithm="GRID_SEARCH", # optional override -) +Once your submit file and parameter file are ready, submitting is a single command: -tuning_job.run(sync=True) -print("HPT artifacts base:", ARTIFACT_DIR) +```bash +$ condor_submit tune_nn.sub +Submitting job(s)...... +6 job(s) submitted to cluster 12345. ``` -## Run and analyze results +HTCondor queues all jobs at once. Depending on pool availability, some or all may start running immediately. You can monitor progress with: -#### 8. Monitor tuning job -Open **Vertex AI → Training → Hyperparameter tuning jobs** in the [Cloud Console](https://console.cloud.google.com/vertex-ai/training/hyperparameter-tuning-jobs) to track trials, parameters, and metrics. You can also stop jobs from the console if needed. +```bash +# Check status of all your jobs +condor_q -> **Note:** Replace the project ID in the URL below with your own if you are not using the shared workshop project. +# Watch a specific cluster +condor_watch_q 12345 -For the MLM25 workshop: [Hyperparameter tuning jobs](https://console.cloud.google.com/vertex-ai/training/hyperparameter-tuning-jobs?hl=en&project=doit-rci-mlm25-4626). +# Check why jobs are idle (waiting for resources) +condor_q -better-analyze 12345 +``` ::::::::::::::::::::::::::::::::::::::: callout -### Troubleshooting common HPT issues +### How many jobs should I submit? -- **All trials show INFEASIBLE:** The `hyperparameter_metric_tag` in your training script doesn't match the key in `metric_spec`. 
Double-check spelling and case — `"validation_accuracy"` is not `"val_accuracy"`. -- **Quota errors on launch:** Your project may not have enough VM or GPU quota in the selected region. Check **IAM & Admin → Quotas** and request an increase or switch to a smaller `MACHINE` type. -- **Trial succeeds but metrics are empty:** Make sure `cloudml-hypertune` is importable inside the container. The prebuilt PyTorch containers include it. If using a custom container, add `cloudml-hypertune` to your `requirements`. -- **Job stuck in PENDING:** Another tuning or training job may be consuming your quota. Check **Vertex AI → Training** for running jobs. +CHTC can handle hundreds of simultaneous jobs, but be a good citizen of the shared pool: + +- **Start small** — submit 5-10 trials first to verify the pipeline works end-to-end (correct outputs, no file transfer errors). +- **Then scale up** — once everything works, submit the full sweep. HTCondor's fair-share scheduling ensures your jobs don't starve other users. +- **Check resource requests** — over-requesting memory or disk means your jobs wait longer to match with available machines. Use `condor_q -better-analyze` to diagnose idle jobs. ::::::::::::::::::::::::::::::::::::::::::::::: -#### 9. Inspect best trial results -After completion, look up the best configuration and objective value from the SDK: +## Collecting and comparing results -```python -best_trial = tuning_job.trials[0] # best-first -print("Best hyperparameters:", best_trial.parameters) -print("Best validation_accuracy:", best_trial.final_measurement.metrics) +After all jobs complete, the `results/` directory will contain pairs of files for each trial: + +```bash +$ ls results/ +metrics_12345_0.json model_12345_0.pt +metrics_12345_1.json model_12345_1.pt +metrics_12345_2.json model_12345_2.pt +metrics_12345_3.json model_12345_3.pt +metrics_12345_4.json model_12345_4.pt +metrics_12345_5.json model_12345_5.pt ``` -#### 10. 
Review recorded metrics in GCS -Your script writes a `metrics.json` (with keys such as `final_val_accuracy`, `final_val_loss`) to each trial's output directory (under `ARTIFACT_DIR`). The snippet below aggregates those into a dataframe for side-by-side comparison. +Each `metrics_<Cluster>_<Process>.json` file contains the metrics saved by `train_nn.py`, including `final_val_accuracy`, `final_val_loss`, `learning_rate`, `patience`, and other training details. + +### Aggregation script + +Here is a simple Python script that reads all metrics files, compares them, and reports the best trial: ```python -from google.cloud import storage -import json, pandas as pd - -def list_metrics_from_gcs(ARTIFACT_DIR: str): - client = storage.Client() - bucket_name = ARTIFACT_DIR.replace("gs://", "").split("/")[0] - prefix = "/".join(ARTIFACT_DIR.replace("gs://", "").split("/")[1:]) - blobs = client.list_blobs(bucket_name, prefix=prefix) - - records = [] - for blob in blobs: - if blob.name.endswith("metrics.json"): - # Path: …/{RUN_ID}/{trial_number}/model/metrics.json → [-3] = trial number - trial_id = blob.name.split("/")[-3] - data = json.loads(blob.download_as_text()) - data["trial_id"] = trial_id - records.append(data) - return pd.DataFrame(records) - -df = list_metrics_from_gcs(ARTIFACT_DIR) -cols = ["trial_id","final_val_accuracy","final_val_loss","best_val_loss", - "best_epoch","patience","min_delta","learning_rate"] -df_sorted = df[cols].sort_values("final_val_accuracy", ascending=False) -print(df_sorted) +# File: find_best_trial.py +import json +import glob +import sys + +def find_best_trial(results_dir="results"): + metrics_files = sorted(glob.glob(f"{results_dir}/metrics_*.json")) + + if not metrics_files: + print(f"No metrics files found in {results_dir}/") + sys.exit(1) + + trials = [] + for path in metrics_files: + with open(path) as f: + data = json.load(f) + data["_file"] = path + trials.append(data) + + # Sort by validation accuracy (descending) + trials.sort(key=lambda t: 
t.get("final_val_accuracy", 0), reverse=True) + + print(f"{'File':<40} {'Val Acc':>8} {'Val Loss':>9} {'LR':>10} {'Patience':>9}") + print("-" * 80) + for t in trials: + print(f"{t['_file']:<40} {t.get('final_val_accuracy', 'N/A'):>8.4f} " + f"{t.get('final_val_loss', 'N/A'):>9.4f} " + f"{t.get('learning_rate', 'N/A'):>10.6f} " + f"{t.get('patience', 'N/A'):>9}") + + best = trials[0] + print(f"\nBest trial: {best['_file']}") + print(f" Validation accuracy: {best.get('final_val_accuracy', 'N/A'):.4f}") + print(f" Validation loss: {best.get('final_val_loss', 'N/A'):.4f}") + print(f" Learning rate: {best.get('learning_rate', 'N/A')}") + print(f" Patience: {best.get('patience', 'N/A')}") + + # Identify the corresponding model file + model_file = best["_file"].replace("metrics_", "model_").replace(".json", ".pt") + print(f" Model file: {model_file}") + +if __name__ == "__main__": + results_dir = sys.argv[1] if len(sys.argv) > 1 else "results" + find_best_trial(results_dir) ``` -#### 11. Visualize trial comparison -A quick chart makes it easier to see which trials performed best and how learning rate relates to accuracy: +Run it after all jobs complete: + +```bash +$ python find_best_trial.py results/ + +File Val Acc Val Loss LR Patience +-------------------------------------------------------------------------------- +results/metrics_12345_2.json 0.8212 0.4015 0.000100 20 +results/metrics_12345_0.json 0.8101 0.4198 0.001000 10 +results/metrics_12345_4.json 0.8045 0.4301 0.001000 5 +results/metrics_12345_3.json 0.7989 0.4456 0.005000 15 +results/metrics_12345_1.json 0.7877 0.4612 0.010000 5 +results/metrics_12345_5.json 0.7821 0.4823 0.000500 10 + +Best trial: results/metrics_12345_2.json + Validation accuracy: 0.8212 + Validation loss: 0.4015 + Learning rate: 0.0001 + Patience: 20 + Model file: results/model_12345_2.pt +``` -```python -import matplotlib.pyplot as plt +You can then use the best model file directly for inference or further fine-tuning. 
-fig, axes = plt.subplots(1, 2, figsize=(12, 4)) +::::::::::::::::::::::::::::::::::::::: callout -# Bar chart: accuracy per trial -axes[0].barh(df_sorted["trial_id"].astype(str), df_sorted["final_val_accuracy"]) -axes[0].set_xlabel("Validation Accuracy") -axes[0].set_ylabel("Trial") -axes[0].set_title("Accuracy by Trial") +### Grid search vs. Bayesian optimization -# Scatter: learning rate vs accuracy (color = patience) -sc = axes[1].scatter( - df_sorted["learning_rate"], df_sorted["final_val_accuracy"], - c=df_sorted["patience"], cmap="viridis", edgecolors="k", s=80, -) -axes[1].set_xscale("log") -axes[1].set_xlabel("Learning Rate (log scale)") -axes[1].set_ylabel("Validation Accuracy") -axes[1].set_title("LR vs. Accuracy (color = patience)") -plt.colorbar(sc, ax=axes[1], label="patience") - -plt.tight_layout() -plt.show() -``` +Managed cloud services like Vertex AI offer **Bayesian optimization**, where the system learns from completed trials to choose more promising hyperparameter combinations for future trials. This is sample-efficient — it finds good results with fewer trials. -::::::::::::::::::::::::::::::::::::: challenge +On CHTC, we use **grid search** (try every combination in a predefined grid) or **random search** (sample combinations randomly from defined ranges). These methods don't learn from previous results, but they have an important advantage: **every trial is independent**, so they all run in parallel with zero coordination overhead. -### Exercise 1: Widen the learning-rate search space +When compute is free and plentiful — as it is on CHTC — the practical difference shrinks considerably. You can afford to run 50 or 100 trials instead of 12, covering the search space thoroughly through brute force rather than statistical cleverness. -The current search space uses `min=1e-4, max=1e-2` for learning rate. Suppose you suspect that slightly larger learning rates (up to `0.1`) might converge faster with early stopping enabled. 
+**Rule of thumb:** Bayesian optimization shines when each trial is expensive (cloud GPU billing by the minute). Grid/random search shines when you have abundant free compute and want simplicity. -1. Update `parameter_spec` to widen the `learning_rate` range to `max=0.1`. -2. Thinking question: Why does `scale="log"` make sense for learning rate but `scale="linear"` makes sense for patience? -3. **Do not run the job yet** — just update the configuration. +::::::::::::::::::::::::::::::::::::::::::::::: -::::::::::::::::::::::: solution +## Putting it all together -```python -parameter_spec = { - "learning_rate": hpt.DoubleParameterSpec(min=1e-4, max=1e-1, scale="log"), - "patience": hpt.IntegerParameterSpec(min=5, max=20, scale="linear"), - "min_delta": hpt.DoubleParameterSpec(min=1e-6, max=1e-3, scale="log"), -} -``` +Here is the complete workflow, from setup to results: -**Why log vs. linear?** Learning rate values span several orders of magnitude (0.0001 to 0.1), so `scale="log"` ensures the tuner samples evenly across those orders rather than clustering near the high end. Patience is an integer (5–20) where each step is equally meaningful, so `scale="linear"` is appropriate. +```bash +# 1. Create directories +mkdir -p results logs -::::::::::::::::::::::::::::::: +# 2. Generate parameter combinations +python make_params.py -:::::::::::::::::::::::::::::::::::::::::::::::::: +# 3. Verify the params file +cat params.txt + +# 4. Submit the sweep +condor_submit tune_nn.sub + +# 5. Monitor progress +condor_q + +# 6. After all jobs complete, find the best trial +python find_best_trial.py results/ +``` ::::::::::::::::::::::::::::::::::::: challenge -### Exercise 2: Scale up trials with adaptive search +### Exercise 1: Write a parameter file for a 3-variable sweep -Your initial 3-trial run validated the pipeline. Now scale up to a proper search where the adaptive optimizer can actually help — but keep parallelism **low** so the tuner learns between batches. 
+Extend the hyperparameter sweep to include a third variable: `min_delta` (the minimum improvement threshold for early stopping). Write a `params.txt` that includes combinations of: -1. Set `max_trial_count=12` and `parallel_trial_count=3`. -2. Before running, estimate the approximate cost: if each trial takes ~5 minutes on an `n1-standard-4` (~ `$0.19`/hr), how much would 12 trials cost? -3. Why does it make sense to keep `parallel_trial_count` at 3 instead of 12 now that we have more trials? -4. Run the updated job and monitor it in the Vertex AI Console. +- `learning_rate`: 0.001, 0.0005, 0.0001 +- `patience`: 5, 10, 20 +- `min_delta`: 0.001, 0.0001 + +You will also need to update the submit file to accept the third variable. + +1. How many total combinations are there? +2. Write the `params.txt` file (or a script to generate it). +3. Update the `queue` line and `arguments` line of `tune_nn.sub`. ::::::::::::::::::::::: solution +There are 3 x 3 x 2 = **18 combinations**. + +A script to generate the file: + ```python -tuning_job = aiplatform.HyperparameterTuningJob( - display_name=DISPLAY_NAME, - custom_job=custom_job, - metric_spec=metric_spec, - parameter_spec=parameter_spec, - max_trial_count=12, - parallel_trial_count=3, -) +import itertools + +lrs = [0.001, 0.0005, 0.0001] +pats = [5, 10, 20] +deltas = [0.001, 0.0001] + +with open("params.txt", "w") as f: + for lr, pat, md in itertools.product(lrs, pats, deltas): + f.write(f"{lr}, {pat}, {md}\n") +``` + +Updated submit file lines: + ``` +arguments = --train train_data.npz --val val_data.npz --learning_rate $(lr) --patience $(pat) --min_delta $(md) --epochs 500 -**Cost estimate:** 12 trials x 5 min each = 60 minutes of compute. At ~ `$0.19`/hr for `n1-standard-4`, that's roughly `$0.19` total. With `parallel_trial_count=3`, wall-clock time would be approximately 20 minutes (4 batches of 3 trials). 
+queue lr,pat,md from params.txt +``` -**Why not run all 12 in parallel?** With 12 trials we have enough data for the adaptive optimizer to learn: after each batch of 3 completes, the tuner updates its model of which regions of the search space are promising and steers the next batch toward them. Running all 12 at once would turn the search into an expensive random sweep — every trial would be launched "blind" before any results come back. +The `transfer_output_remaps`, log/output/error lines, and everything else stays the same — only `arguments` and `queue` need to change. ::::::::::::::::::::::::::::::: :::::::::::::::::::::::::::::::::::::::::::::::::: -::::::::::::::::::::::::::::::::::::: discussion +::::::::::::::::::::::::::::::::::::: challenge -### What is the effect of parallelism in tuning? +### Exercise 2: Diagnose a failed sweep -- How might running 10 trials in parallel differ from running 2 at a time in terms of cost, time, and result quality? -- When would you want to prioritize speed over adaptive search benefits? +You submitted a sweep of 20 jobs, but when you look at the results directory you only see 15 metrics files. What steps would you take to figure out what happened to the other 5 jobs? -| Factor | High parallelism (e.g., 10) | Low parallelism (e.g., 2) | -|---|---|---| -| **Wall-clock time** | Shorter | Longer | -| **Total cost** | ~Same (slightly more overhead) | ~Same | -| **Adaptive search quality** | Worse (tuner explores “blind”) | Better (tuner learns between batches) | -| **Best for** | Cheap/short trials, deadlines | Expensive trials, small budgets | +::::::::::::::::::::::: solution -**Why does parallelism hurt result quality?** Vertex AI's adaptive search learns from completed trials to choose better parameter combinations. With many trials in flight simultaneously, the tuner can't incorporate results quickly — it explores “blind” for longer, often yielding slightly worse results for a fixed `max_trial_count`. 
With modest parallelism (2–4), the tuner can update beliefs and exploit promising regions between batches. +1. **Check job status** — run `condor_q` to see if any jobs are still running, idle, or held: -**Guidelines:** -- Keep `parallel_trial_count` to **≤ 25–33%** of `max_trial_count` when you care about adaptive quality. -- Increase parallelism when trials are long and the search space is well-bounded. + ```bash + condor_q + ``` -::::::::::::::::::::::::::::::::::::::: callout +2. **Check for held jobs** — held jobs encountered an error. See why: -### When to prioritize speed vs. adaptive quality + ```bash + condor_q -held + ``` -**Favor higher parallelism** when you have strict deadlines, very cheap/short trials where startup time dominates, a non-adaptive search, or unused quota/credits. + Common reasons include: Docker image pull failures, file transfer errors (missing input files), or exceeding requested memory/disk. -**Favor lower parallelism** when trials are expensive or noisy, `max_trial_count` is small (≤ 10–20), early stopping is enabled, or you're exploring many dimensions at once. +3. **Check log files** — each job writes to `logs/tune_<Cluster>_<Process>.log`. The log file records start time, completion, and any abnormal termination. The `.err` file contains stderr output from the job itself: -::::::::::::::::::::::::::::::::::::::::::::::: + ```bash + # Find which process IDs are missing + ls results/metrics_*.json | sort -**Practical recipe:** -- First run: `max_trial_count=3`, `parallel_trial_count=3` (pipeline sanity check — too few trials for adaptive search to help, so run them all at once). -- Main run: `max_trial_count=10–20`, `parallel_trial_count=2–4` (enough trials for the optimizer to learn between batches). -- Scale up parallelism only after the above completes cleanly and you confirm adaptive performance is acceptable. 
+ # Then check the corresponding logs + cat logs/tune_12345_7.err + cat logs/tune_12345_7.log + ``` -:::::::::::::::::::::::::::::::::::::::::::::::: +4. **Resubmit failed jobs** — once you fix the issue, you can resubmit just the failed combinations by creating a new params file with only those lines, or by using `condor_submit` with a specific process range. + +::::::::::::::::::::::::::::::: +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: challenge -## Clean up staging files +### Exercise 3: Random search vs. grid search -HP tuning launches multiple trials, so staging tarballs accumulate even faster. Delete them when you're done: +Instead of a full grid, write a Python script that generates 15 random hyperparameter combinations with: + +- `learning_rate`: log-uniform between 0.0001 and 0.01 +- `patience`: uniform integer between 5 and 25 + +Why might random search find a better result than a grid of the same size? + +::::::::::::::::::::::: solution ```python -!gsutil -m rm -r gs://{BUCKET_NAME}/.vertex_staging/ +import random + +random.seed(42) # for reproducibility + +with open("params_random.txt", "w") as f: + for _ in range(15): + lr = 10 ** random.uniform(-4, -2) # log-uniform: 0.0001 to 0.01 + pat = random.randint(5, 25) + f.write(f"{lr:.6f}, {pat}\n") + +print("Wrote 15 random combinations to params_random.txt") ``` -## What's next: using your tuned model +**Why random search can outperform grid search:** Grid search distributes trials evenly across every dimension, which means many trials differ in only one parameter at a time. If one parameter matters much more than the other (e.g., learning rate has a large effect but patience has a small effect), grid search wastes many trials exploring patience values that don't matter, while only testing a few learning rate values. + +Random search places trials throughout the full space, so it effectively tests more unique values of each individual parameter. 
Research by Bergstra and Bengio (2012) showed that random search is more efficient than grid search for the same number of trials when some hyperparameters matter more than others — which is almost always the case in practice. -After tuning, your best model's weights sit in GCS under the best trial's artifact directory. The most common next steps are: +::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## What's next: using your tuned model -- **Batch prediction (most common):** Load the best model from GCS and run inference on a dataset — this is what we did in the evaluation sections of Episodes 4–5 when we loaded models from GCS into memory. For larger-scale batch prediction, Vertex AI offers [Batch Prediction Jobs](https://cloud.google.com/vertex-ai/docs/predictions/get-batch-predictions) that handle provisioning and scaling automatically. -- **Experiment tracking:** Vertex AI [Experiments](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments) can log metrics, parameters, and artifacts across runs for systematic comparison. Consider integrating this into your workflow as your projects grow. -- **Online deployment:** If you need real-time predictions via an API, Vertex AI [Endpoints](https://cloud.google.com/vertex-ai/docs/predictions/get-online-predictions) let you deploy your model — but endpoints bill continuously (~ `$4.50`/day for an `n1-standard-4`), so only deploy when you genuinely need a live API. +After tuning, the best model's weights sit in the `results/` directory. The most common next steps are: +- **Load and evaluate** — load the best `model_*.pt` file in Python and run inference on a test set, just as you did in Episode 5. +- **Move to production** — copy the best model to a shared location or deploy it as part of a larger application. +- **Automate with DAGMan** — set up a DAG that runs the sweep and automatically aggregates results (see [Episode 8](08-Advanced-HTCondor-workflows.md)). 
+- **Iterate** — use the results to narrow your search space and run a more focused sweep around the most promising region. ::::::::::::::::::::::::::::::::::::: keypoints -- Vertex AI Hyperparameter Tuning Jobs efficiently explore parameter spaces using adaptive strategies. -- Define parameter ranges in `parameter_spec`; the number of settings tried is controlled later by `max_trial_count`. -- The `hyperparameter_metric_tag` reported by `cloudml-hypertune` must exactly match the key in `metric_spec`. -- Limit `parallel_trial_count` (2–4) to help adaptive search. -- Use GCS for input/output and aggregate `metrics.json` across trials for detailed analysis. +- HTCondor's `queue ... from` syntax lets you launch many jobs from a single submit file, each with different hyperparameter values substituted via `$(variable)` placeholders. +- Three approaches to defining parameter combinations: inline lists in the submit file, an external parameter file, or DAGMan for multi-step pipelines. +- Use `transfer_output_remaps` with `$(Cluster)` and `$(Process)` to give each trial's output files unique names and avoid overwriting. +- After the sweep completes, a simple Python script can aggregate the per-trial `metrics_*.json` files and identify the best trial. +- CHTC's grid/random search trades statistical sophistication for massive free parallelism — run more trials instead of smarter trials. +- All hyperparameter tuning jobs on CHTC are free, removing cost as a constraint on how thoroughly you explore the parameter space. 
:::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/07-Retrieval-augmented-generation.md b/episodes/07-Retrieval-augmented-generation.md index 29d8ee02..758ce64b 100644 --- a/episodes/07-Retrieval-augmented-generation.md +++ b/episodes/07-Retrieval-augmented-generation.md @@ -1,33 +1,32 @@ --- -title: "Retrieval-Augmented Generation (RAG) with Vertex AI" +title: "Retrieval-Augmented Generation (RAG) on CHTC" teaching: 20 exercises: 10 --- :::::::::::::::::::::::::::::::::::::: questions -- How do we go from "a pile of PDFs" to "ask a question and get a cited answer" using Google Cloud tools? -- What are the key parts of a RAG system (chunking, embedding, retrieval, generation), and how do they map onto Vertex AI services? -- How much does each part of this pipeline cost (VM time, embeddings, LLM calls), and where can we keep it cheap? +- How do we go from "a pile of PDFs" to "ask a question and get a cited answer"? +- What are the key parts of a RAG system (chunking, embedding, retrieval, generation)? +- How can we run a RAG pipeline on CHTC or from a submit node? :::::::::::::::::::::::::::::::::::::::::::::::: ::::::::::::::::::::::::::::::::::::: objectives -- Unpack the core RAG pipeline: ingest → chunk → embed → retrieve → answer. -- Run a minimal, fully programmatic RAG loop on a Vertex AI Workbench VM using Google's foundation models for embeddings and generation. -- Answer questions using content from provided papers and return grounded answers backed by source text, not unverifiable claims. +- Unpack the core RAG pipeline: ingest, chunk, embed, retrieve, answer. +- Run a minimal RAG loop using foundation model APIs for embeddings and generation. +- Answer questions using content from provided papers and return grounded answers backed by source text. :::::::::::::::::::::::::::::::::::::::::::::::: ## Background concepts -This episode shifts from classical ML training (Episodes 4–6) to working with large language models (LLMs). 
If any of the following terms are new to you, here's a quick primer: +This episode shifts from classical ML training (Episodes 4-6) to working with large language models (LLMs). If any of the following terms are new to you, here's a quick primer: - **Embeddings:** A numerical vector (list of numbers) that represents the *meaning* of a piece of text. Texts with similar meanings have similar vectors. This lets us search "by meaning" rather than by keyword matching. - **Cosine similarity:** A measure of how similar two vectors are (1.0 = identical direction, 0.0 = unrelated). Used to find which stored text chunks are most relevant to a question. - **Large Language Model (LLM):** A model (like Gemini, GPT, or LLaMA) trained on massive text corpora that can generate coherent text given a prompt. In this episode, we use an LLM to *answer questions* based on retrieved text, not to train one from scratch. -- **Foundation model APIs:** In this episode, we use the `google-genai` client library to access Google's managed embedding and generation models. This is separate from the `google-cloud-aiplatform` SDK used for training jobs in earlier episodes. ## Overview: What we're building @@ -37,60 +36,79 @@ This episode shifts from classical ML training (Episodes 4–6) to working with 2. The system **retrieves** relevant passages from your PDFs or data. 3. An LLM **answers** using those passages only, with citations. -This approach is useful any time you need to ground an LLM's answers in a specific corpus — research papers, policy documents, lab notebooks, etc. For example, a sustainability research team could use this pipeline to extract AI water and energy metrics from published papers, getting cited answers instead of generic LLM summaries. +This approach is useful any time you need to ground an LLM's answers in a specific corpus — research papers, policy documents, lab notebooks, etc. 
-![RAG pipeline with Gemini API](https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/images/diagram2_rag_gemini.svg){alt="Architecture diagram showing the RAG pipeline: a Workbench notebook orchestrates document chunking, embedding via the Gemini API, and retrieval-augmented generation, with documents and embeddings stored in a GCS bucket."} +### Running RAG on CHTC + +There are two approaches for running RAG pipelines with CHTC: + +1. **API-based (recommended for this workshop):** Use API-based models (Google Gemini, OpenAI) for embeddings and generation. The pipeline is lightweight enough to run interactively on the submit node for small corpora, or as an HTCondor job for larger ones. + +2. **Open-source models on GPU nodes:** Run sentence-transformers for embeddings and open-source LLMs (Gemma, Mistral, LLaMA) for generation as HTCondor GPU jobs. This avoids API costs but requires more setup and larger compute resources. + +We'll focus on the API approach since it's simpler and lets us focus on the RAG concepts. ### About the corpus -Our corpus is a curated bundle of **32 research papers** on the environmental and economic costs of AI — topics like training energy, inference power consumption, water footprint, and carbon emissions. The papers span 2019–2025 and include titles such as *"Green AI"*, *"Making AI Less Thirsty"*, and *"The ML.ENERGY Benchmark"*. They're shipped as `data/pdfs_bundle.zip` in the lesson repository so that everyone works with the same documents. You could swap in your own PDFs — the pipeline is corpus-agnostic. +Our corpus is a curated bundle of **32 research papers** on the environmental and economic costs of AI — topics like training energy, inference power consumption, water footprint, and carbon emissions. They're shipped as `data/pdfs_bundle.zip` in the lesson repository. You could swap in your own PDFs — the pipeline is corpus-agnostic. 
## Step 1: Set up the environment -Navigate to `/Intro_GCP_for_ML/notebooks/07-Retrieval-augmented-generation.ipynb` to begin this notebook. **Select the *Python 3 (ipykernel)* kernel** — this episode uses only the `google-genai` client library and scikit-learn, so no PyTorch or TensorFlow kernel is needed. +::::::::::::::::::::::::::::::::::::: callout -#### CD to instance home directory -To ensure we're all in the same starting spot, change directory to your Jupyter home directory. +### Running on the submit node vs. as a job -```python -%cd /home/jupyter/ -``` +For a small corpus (a few dozen PDFs), the embedding and retrieval steps are lightweight enough to run directly on the submit node. For larger corpora (thousands of documents), you'd submit the embedding step as an HTCondor job with more memory and compute. -We need the `pypdf` library to extract text from PDF files. +For this workshop, we'll run everything interactively on the submit node since our corpus is small. -```python -!pip install --quiet --upgrade pypdf +:::::::::::::::::::::::::::::::::::::::::::::::: + +Install the required packages: + +```bash +pip install --user pypdf scikit-learn numpy google-genai ``` -**Cost note:** Installing packages is free; you're only billed for VM runtime. +::::::::::::::::::::::::::::::::::::: callout + +### API key setup -### Initialize project +Unlike the GCP version of this workshop where Vertex AI provides automatic authentication, running on CHTC requires you to set up API credentials manually. -We initialize the `vertexai` SDK to give our notebook access to Google's foundation models (embeddings and Gemini). Both the project ID and region are needed so API calls are billed to your project. +**For Google Gemini API:** +1. Get an API key from [Google AI Studio](https://aistudio.google.com/apikey). +2. 
Set it as an environment variable: `export GOOGLE_API_KEY="your-key-here"` -```python -from vertexai import init as vertexai_init -import os +**For OpenAI API:** +1. Get an API key from [platform.openai.com](https://platform.openai.com/api-keys). +2. Set it as an environment variable: `export OPENAI_API_KEY="your-key-here"` -PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "") -REGION = "us-central1" +**Security:** Never hardcode API keys in scripts or submit files. Use environment variables or a `.env` file that is not committed to version control. -vertexai_init(project=PROJECT_ID, location=REGION) -print("Initialized:", PROJECT_ID, REGION) -``` +:::::::::::::::::::::::::::::::::::::::::::::::: +### Initialize project + +```python +import os +# For Google Gemini API +API_KEY = os.environ.get("GOOGLE_API_KEY") +if not API_KEY: + print("Warning: GOOGLE_API_KEY not set. Set it with: export GOOGLE_API_KEY='your-key'") +``` ## Step 2: Extract and chunk PDFs -Before we can search our documents, we need to break them into smaller pieces ("chunks"). Embedding models produce better vectors from focused passages than from entire papers, and LLMs have limited context windows. The code below extracts text from each PDF and splits it into overlapping chunks of roughly 1,200 characters. +Before we can search our documents, we need to break them into smaller pieces ("chunks"). Embedding models produce better vectors from focused passages than from entire papers, and LLMs have limited context windows. ```python import zipfile, pathlib, re, pandas as pd from pypdf import PdfReader ZIP_PATH = pathlib.Path("Intro_GCP_for_ML/data/pdfs_bundle.zip") -DOC_DIR = pathlib.Path("/home/jupyter/docs") +DOC_DIR = pathlib.Path("docs") DOC_DIR.mkdir(exist_ok=True) # unzip @@ -113,60 +131,31 @@ corpus_df = pd.DataFrame(rows) print(len(corpus_df), "chunks created") ``` -**Cost note:** Only VM runtime applies. 
Chunk size affects future embedding cost — fewer, larger chunks mean fewer API calls but potentially noisier embeddings. - ::::::::::::::::::::::::::::::::::::: callout ### Why these chunking parameters? -The `max_chars=1200` / `overlap=150` values are practical defaults, not magic numbers: - -- **1,200 characters** (~200–300 tokens) keeps each chunk within a single focused idea while staying well under the embedding model's 8,000-token limit. -- **150-character overlap** ensures that sentences split across chunk boundaries are still captured in at least one chunk. -- **Character-based splitting** is simple and predictable. Sentence-level or paragraph-level chunking can produce better results but requires an NLP tokenizer and more code. - -Chunk size is a key tuning knob: smaller chunks give more precise retrieval but lose surrounding context; larger chunks preserve context but may dilute the embedding with irrelevant text. There's no single best answer — experiment with your own corpus. +- **1,200 characters** (~200-300 tokens) keeps each chunk within a single focused idea. +- **150-character overlap** ensures that sentences split across chunk boundaries are still captured. +- Chunk size is a key tuning knob: smaller chunks give more precise retrieval but lose context; larger chunks preserve context but may dilute the embedding. :::::::::::::::::::::::::::::::::::::::::::::::: +## Step 3: Embed the corpus - -## Step 3: Embed the corpus with Vertex AI - -Now we convert each text chunk into a numerical vector (an "embedding") so we can search by meaning rather than keywords. We use Google's **`gemini-embedding-001`** model — currently the top-ranked Google embedding model on the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). 
It accepts up to **2,048 input tokens** per text (~1,500 words), supports **100+ languages**, and uses [Matryoshka Representation Learning](https://huggingface.co/blog/matryoshka) so you can choose your output dimensions (768, 1,536, or 3,072) without retraining — smaller dimensions save memory and speed up search, while larger ones preserve more semantic detail. See the [Choosing an embedding model](#choosing-an-embedding-model) callout later in this episode for alternatives. - -### Initialize the Gen AI client +Now we convert each text chunk into a numerical vector so we can search by meaning rather than keywords. ```python from google import genai -from google.genai.types import HttpOptions, EmbedContentConfig, GenerateContentConfig +from google.genai.types import EmbedContentConfig import numpy as np -client = genai.Client( - http_options=HttpOptions(api_version="v1"), - vertexai=True, # route calls through your GCP project for billing - project=PROJECT_ID, - location=REGION, -) +client = genai.Client(api_key=API_KEY) -# Embedding model and dimensions EMBED_MODEL_ID = "gemini-embedding-001" -EMBED_DIM = 1536 # valid choices: 768, 1536, 3072 -``` +EMBED_DIM = 1536 -### Build the embedding helper - -The helper below converts text strings into embedding vectors in batches. Notice the `task_type` parameter: the Gemini embedding model optimizes its vectors differently depending on whether the input is a **document** being indexed or a **query** being searched. Using `RETRIEVAL_DOCUMENT` for corpus chunks and `RETRIEVAL_QUERY` for user questions produces better retrieval accuracy than using a single task type for both. - -```python def embed_texts(text_list, batch_size=32, dims=EMBED_DIM, task_type="RETRIEVAL_DOCUMENT"): - """ - Embed a list of strings using gemini-embedding-001. - Returns a NumPy array of shape (len(text_list), dims). - - task_type should be "RETRIEVAL_DOCUMENT" for corpus chunks - and "RETRIEVAL_QUERY" for user questions. 
- """ vectors = [] for start in range(0, len(text_list), batch_size): batch = text_list[start : start + batch_size] @@ -183,66 +172,40 @@ def embed_texts(text_list, batch_size=32, dims=EMBED_DIM, task_type="RETRIEVAL_D return np.array(vectors, dtype="float32") ``` -### Embed all chunks and build the retrieval index - -We embed the full corpus, then build a **nearest-neighbors index** so that future queries are fast. Think of this as two separate stages: - -1. **Embed & index (now)** — We convert every chunk into a vector and hand the matrix to scikit-learn's `NearestNeighbors`. Calling `.fit()` here doesn't train a model — it organizes the vectors into a data structure optimized for similarity search (like building a phone book before anyone looks up a number). -2. **Query (later, in Step 4)** — When a user question arrives, we embed *that* question and call `.kneighbors()` to find the corpus vectors closest to it by cosine similarity. - -We set `metric="cosine"` so the index knows *how* to measure closeness when queries arrive. The `n_neighbors=5` default means each query returns the 5 most relevant chunks — enough to give the LLM good context without overwhelming it with noise. You can tune this: fewer neighbors (3) gives more focused answers; more (10) gives broader coverage at the cost of including less-relevant text. 
+### Build the retrieval index ```python from sklearn.neighbors import NearestNeighbors -# Embed every chunk in the corpus emb_matrix = embed_texts(corpus_df["text"].tolist(), dims=EMBED_DIM) -print("emb_matrix shape:", emb_matrix.shape) # (num_chunks, EMBED_DIM) +print("emb_matrix shape:", emb_matrix.shape) -# Build nearest-neighbors index nn = NearestNeighbors(metric="cosine", n_neighbors=5) nn.fit(emb_matrix) ``` - - -## Step 4: Retrieve and generate answers with Gemini - -With embeddings indexed, we can now build the two remaining pieces of the RAG pipeline: a **retrieve** function that finds relevant chunks for a question, and an **ask** function that sends those chunks to Gemini for a grounded answer. +## Step 4: Retrieve and generate answers ### Retrieve relevant chunks ```python def retrieve(query, k=5): - """ - Embed the user query and find the top-k most similar corpus chunks. - Returns a DataFrame with a 'similarity' column. - """ - query_vec = embed_texts( - [query], dims=EMBED_DIM, task_type="RETRIEVAL_QUERY" - )[0] - + query_vec = embed_texts([query], dims=EMBED_DIM, task_type="RETRIEVAL_QUERY")[0] distances, indices = nn.kneighbors([query_vec], n_neighbors=k, return_distance=True) - result_df = corpus_df.iloc[indices[0]].copy() - result_df["similarity"] = 1 - distances[0] # cosine distance → similarity + result_df["similarity"] = 1 - distances[0] return result_df.sort_values("similarity", ascending=False) ``` ### Generate a grounded answer -The `ask()` function ties the full pipeline together: retrieve → build prompt → call Gemini. The `temperature=0.2` setting keeps answers factual and deterministic. The prompt instructs Gemini to answer *only* from the provided context and cite the source chunks. 
- ```python -GENERATION_MODEL_ID = "gemini-2.5-pro" # or "gemini-2.5-flash" for cheaper/faster +from google.genai.types import GenerateContentConfig + +GENERATION_MODEL_ID = "gemini-2.5-flash" def ask(query, top_k=5, temperature=0.2): - """ - Full RAG pipeline: retrieve context, build prompt, generate answer. - """ hits = retrieve(query, k=top_k) - - # Build context block with source tags for citation context_lines = [ f"[{row.doc}#chunk-{row.chunk_id}] {row.text}" for _, row in hits.iterrows() @@ -266,7 +229,7 @@ def ask(query, top_k=5, temperature=0.2): return response.text ``` -### Test the pipeline end-to-end +### Test the pipeline ```python print(ask("How much energy does it cost to train a large language model?")) @@ -284,7 +247,7 @@ Change the `max_chars` parameter in `chunk_text()` to **500** and then to **2500 :::::::::::::::::::::::: solution -Smaller chunks (500 chars) produce more precise retrieval hits but each chunk has less context, so Gemini may struggle to synthesize a complete answer. Larger chunks (2,500 chars) preserve more context but may dilute the embedding with unrelated text, leading to less accurate retrieval. For most research-paper corpora, 800–1,500 characters is a practical sweet spot. +Smaller chunks (500 chars) produce more precise retrieval hits but each chunk has less context, so the LLM may struggle to synthesize a complete answer. Larger chunks (2,500 chars) preserve more context but may dilute the embedding with unrelated text. For most research-paper corpora, 800-1,500 characters is a practical sweet spot. ::::::::::::::::::::::::::::::::: @@ -294,19 +257,19 @@ Smaller chunks (500 chars) produce more precise retrieval hits but each chunk ha ### Challenge 2: Test hallucination behavior -Ask a question that has **no answer** in the corpus — for example: +Ask a question that has **no answer** in the corpus: ```python print(ask("What was the GDP of France in 2019?")) ``` -- Does Gemini refuse to answer, or does it hallucinate? 
+- Does the LLM refuse to answer, or does it hallucinate? - Try modifying the system prompt in `ask()` to add: *"If the context does not contain enough information to answer, say 'I don't have enough information to answer this.'"* - Does the modified prompt change the behavior? :::::::::::::::::::::::: solution -Without the guardrail prompt, Gemini may produce a plausible-sounding answer from its training data, ignoring the "use only the following context" instruction. Adding an explicit refusal instruction significantly reduces hallucination. This is a key lesson: **prompt engineering is part of RAG system design**, not just model selection. +Without the guardrail prompt, the LLM may produce a plausible-sounding answer from its training data. Adding an explicit refusal instruction significantly reduces hallucination. **Prompt engineering is part of RAG system design**, not just model selection. ::::::::::::::::::::::::::::::::: @@ -314,50 +277,17 @@ Without the guardrail prompt, Gemini may produce a plausible-sounding answer fro ::::::::::::::::::::::::::::::::::::: challenge -### Challenge 3: Compare `gemini-2.5-pro` vs `gemini-2.5-flash` - -Change `GENERATION_MODEL_ID` to `"gemini-2.5-flash"` and ask the same question. - -```python -# Change the generation model and re-run a query -GENERATION_MODEL_ID = "gemini-2.5-flash" -print(ask("How much energy does it cost to train a large language model?")) -``` - -- Is the answer quality noticeably different? -- How does response time compare? -- Check the [Vertex AI pricing page](https://cloud.google.com/vertex-ai/generative-ai/pricing) — what's the cost difference per million tokens? - -:::::::::::::::::::::::: solution - -For well-grounded RAG queries (where the answer is clearly in the context), Flash often produces comparable answers at significantly lower cost and latency. Pro shines when the question requires more nuanced reasoning across multiple chunks. 
For workshop-scale workloads, Flash is usually sufficient and much cheaper. - -::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 4: Tune retrieval depth with `top_k` +### Challenge 3: Tune retrieval depth with `top_k` Call `ask()` with `top_k=2` and then with `top_k=10`. Compare the answers. -```python -# Try different retrieval depths -print("--- top_k=2 ---") -print(ask("How much energy does it cost to train a large language model?", top_k=2)) - -print("\n--- top_k=10 ---") -print(ask("How much energy does it cost to train a large language model?", top_k=10)) -``` - -- With `top_k=2`, does Gemini miss relevant information? +- With `top_k=2`, does the LLM miss relevant information? - With `top_k=10`, does the extra context help or introduce noise? - What value of `top_k` seems to work best for your question? :::::::::::::::::::::::: solution -Lower `top_k` gives Gemini a tighter, more focused context — good when the answer is localized in one or two chunks. Higher `top_k` provides broader coverage but risks including irrelevant passages that can confuse the model or dilute the answer. A good default is 3–5 for most research-paper RAG tasks. For questions that span multiple sections of a paper, higher values help. +Lower `top_k` gives a tighter, more focused context — good when the answer is localized in one or two chunks. Higher `top_k` provides broader coverage but risks including irrelevant passages. A good default is 3-5 for most research-paper RAG tasks. ::::::::::::::::::::::::::::::::: @@ -365,127 +295,86 @@ Lower `top_k` gives Gemini a tighter, more focused context — good when the ans ::::::::::::::::::::::::::::::::::::: challenge -### Challenge 5: Try different questions - -The quality of a RAG system depends heavily on the questions you ask. 
Try these queries — each tests a different aspect of retrieval and generation: +### Challenge 4: Try different questions ```python -# Off-topic question — not covered by the corpus at all -print(ask("How much does an elephant weight?")) - -print("\n" + "="*60 + "\n") +# Off-topic question +print(ask("How much does an elephant weigh?")) # Comparative question — requires synthesizing across sources print(ask("Is cloud computing more energy efficient than university HPC clusters?")) -print("\n" + "="*60 + "\n") - -# Opinion/marketing question — may tempt the model to go beyond the corpus -print(ask("Is Google Cloud the best cloud provider option?")) +# Opinion question — may tempt the model to go beyond the corpus +print(ask("What is the most energy-efficient way to train a neural network?")) ``` For each question, consider: - Does the answer cite specific numbers or papers from the corpus? -- Does Gemini stay grounded in the retrieved context, or does it add outside knowledge? +- Does the LLM stay grounded in the retrieved context? - Which question produces the most useful, well-supported answer? :::::::::::::::::::::::: solution -The elephant-weight question is deliberately off-topic — the corpus is about environmental costs of AI, not zoology, so a well-behaved RAG system should indicate that the context doesn't contain relevant information rather than answering from general knowledge. The cloud-vs-HPC question requires the model to compare across sources — look for whether it hedges appropriately when papers disagree. The "best cloud provider" question is deliberately tricky: the corpus is about environmental costs of AI, not cloud provider rankings, so a well-behaved RAG system should indicate that the context doesn't support a definitive answer rather than generating marketing-style claims. +The elephant-weight question is deliberately off-topic — a well-behaved RAG system should indicate the context doesn't contain relevant information. 
The comparative and opinion questions require synthesis across sources — look for whether the model hedges appropriately when papers disagree. ::::::::::::::::::::::::::::::::: :::::::::::::::::::::::::::::::::::::::::::::::: +## Running RAG as an HTCondor job -## Step 5: Cost summary +For larger corpora, you can submit the embedding step as an HTCondor job. Create a script that reads PDFs, computes embeddings, and saves them to a file: -Understanding the cost of each pipeline component helps you decide where to optimize. For a small workshop with a handful of PDFs, total costs are typically well under `$1`. +```bash +# rag_embed.sub +universe = docker +docker_image = python:3.10-slim -| Step | Resource | Cost Driver | Typical Range | -|------|-----------|-------------|---------------| -| VM runtime | Vertex AI Workbench (`n1-standard-4`) | Uptime (hourly) | ~ `$0.20`/hr | -| Embeddings | `gemini-embedding-001` | Tokens embedded (one-time) | ~ `$0.10` / 1M tokens | -| Retrieval | Local `NearestNeighbors` | CPU only | Free | -| Generation | `gemini-2.5-pro` | Input + output tokens per query | ~ `$1.25`–`$10` / 1M tokens | -| Generation (alt) | `gemini-2.5-flash` | Input + output tokens per query | ~ `$0.15`–`$0.60` / 1M tokens | +executable = run_rag_embed.sh +transfer_input_files = rag_embed.py, pdfs_bundle.zip, requirements_rag.txt -**Tip:** Embeddings are the best investment — compute them once, reuse them for every query. Generation is the ongoing cost; choosing Flash over Pro and keeping prompts concise are the two biggest levers. +should_transfer_files = YES +when_to_transfer_output = ON_EXIT -::::::::::::::::::::::::::::::::::::: callout - -### Common issues and troubleshooting - -- **Rate limiting on the Gemini API:** If you see `429 Resource Exhausted` errors, wait 30–60 seconds and retry. For large corpora, add a short `time.sleep(1)` between embedding batches. 
-- **PDFs with no extractable text:** Scanned documents or image-heavy PDFs will return empty strings from `PdfReader`. Check for empty chunks with `corpus_df[corpus_df["text"].str.strip() == ""]` and drop them before embedding. -- **Embeddings fail mid-batch:** If an embedding call fails partway through, you'll have partial results. Consider saving `emb_matrix` to disk after each batch so you can resume rather than re-embedding everything. -- **"Project not found" or permission errors:** Make sure your `PROJECT_ID` matches the project where Vertex AI APIs are enabled. Run `gcloud config get-value project` in a terminal cell to verify. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: callout - -### Choosing an embedding model - -We use `gemini-embedding-001` in this episode, but Vertex AI offers several alternatives in the [Model Garden](https://console.cloud.google.com/vertex-ai/model-garden): +log = logs/rag_embed_$(Cluster).log +output = logs/rag_embed_$(Cluster).out +error = logs/rag_embed_$(Cluster).err -- **`text-embedding-005`** — older model, 768-dimensional output, still widely used. -- **`multimodal-embedding-001`** — supports image + text embeddings for richer use cases. -- **Third-party models** (via Model Garden) — e.g., `bge-large-en`, `cohere-embed-v3`, `all-MiniLM`. +request_cpus = 2 +request_memory = 8GB +request_disk = 4GB -When choosing, consider: output dimensions (higher = more expressive but more memory), token limits, multilingual support, and pricing. +# Pass the API key from your submit-time shell environment into the job +environment = "GOOGLE_API_KEY=$ENV(GOOGLE_API_KEY)" -:::::::::::::::::::::::::::::::::::::::::::::::: - -### Cleanup note - -The embeddings and nearest-neighbors index in this episode are held **in memory** — they disappear when your notebook kernel restarts or your VM stops.
No persistent cloud resources (endpoints, buckets, or managed indexes) were created, so there's nothing extra to clean up beyond the VM itself. If you're done for the day, stop your Workbench Instance to avoid ongoing charges (see [Episode 9](09-Resource-management-cleanup.md)). - -## Key takeaways - -- **Chunk → embed → retrieve → generate** is the core RAG loop. Each step has its own tuning knobs. -- Use **Vertex AI managed embeddings** and **Gemini** for a low-ops, cost-controlled pipeline. -- **Cache embeddings** — computing them once and reusing them saves the most cost. -- **Prompt engineering matters** — how you instruct the LLM to use (or refuse to use) the context directly affects answer quality and hallucination risk. -- This workflow generalizes to any retrieval task — research papers, policy documents, lab notebooks, etc. - -::::::::::::::::::::::::::::::::::::: callout - -### Scaling beyond in-memory search - -This episode stores embeddings **in memory** with scikit-learn's `NearestNeighbors` — fine for prototyping with up to a few thousand chunks. For larger or production corpora, swap in a managed vector store such as [Vertex AI Vector Search](https://cloud.google.com/vertex-ai/docs/vector-search/overview). The core pipeline (chunk → embed → retrieve → generate) stays the same; only the index backend changes. +queue 1 +``` -:::::::::::::::::::::::::::::::::::::::::::::::: +Then do retrieval and generation interactively with the pre-computed embeddings. 
::::::::::::::::::::::::::::::::::::: callout -### Hugging Face / open-model alternatives +### Open-source alternatives -You can replace the Google-managed APIs used in this episode with open-source models: +You can replace the API-based models with open-source alternatives that run entirely on CHTC GPU nodes: - **Embeddings:** `sentence-transformers/all-MiniLM-L6-v2`, `BAAI/bge-large-en-v1.5` -- **Generators:** `google/gemma-2b-it`, `mistralai/Mistral-7B-Instruct`, or `tiiuae/falcon-7b-instruct` +- **Generators:** `google/gemma-2b-it`, `mistralai/Mistral-7B-Instruct` -This requires a GPU VM (e.g., `n1-standard-8` + `T4`) and manual model management. Rather than running a large GPU in Workbench, you can launch Vertex AI custom jobs that perform the embedding and generation steps — start with a PyTorch container image and add the HuggingFace libraries as requirements. +This requires GPU jobs with appropriate Docker images (e.g., a container with PyTorch and transformers installed). The advantage is no API costs and no external dependencies; the tradeoff is more setup and longer queue times for GPU resources. :::::::::::::::::::::::::::::::::::::::::::::::: -## What's next? - -This episode built a minimal RAG pipeline from scratch. Here's where to go from here depending on your goals: +### Cleanup note -- **[Vertex AI Vector Search](https://cloud.google.com/vertex-ai/docs/vector-search/overview)** — Replace the in-memory `NearestNeighbors` index with a managed, scalable vector database for production workloads with millions of documents. -- **[Vertex AI Agent Builder](https://cloud.google.com/products/agent-builder)** — Build managed RAG applications with built-in grounding, chunking, and retrieval — less code, more guardrails. -- **Evaluation and iteration** — Measure retrieval quality (precision\@k, recall\@k) and generation quality (faithfulness, relevance) to systematically improve your pipeline. 
-- **Advanced chunking** — Explore sentence-level splitting (with `spaCy` or `nltk`), recursive chunking, or document-structure-aware chunking for better retrieval on complex papers. -- **[Deploying RAG in Bedrock vs. Local: WattBot 2025 Case Study](https://uw-madison-datascience.github.io/ML-X-Nexus/Applications/Videos/Forums/mlx_2026-02-17.html)** — See how the same sustainability-paper corpus powers a production RAG system deployed on AWS Bedrock and local hardware, with comparisons of cost, latency, and model choice. +The embeddings and nearest-neighbors index in this episode are held **in memory** — they disappear when your Python session ends. No persistent CHTC resources were created beyond the files in your home directory. Clean up any large temporary files (extracted PDFs, cached embeddings) when you're done. ::::::::::::::::::::::::::::::::::::: keypoints - RAG grounds LLM answers in your own data — retrieve first, then generate. -- Vertex AI provides managed embedding and generation APIs that require minimal infrastructure. +- The pipeline (chunk, embed, retrieve, generate) works the same regardless of where you run it. - Chunk size, retrieval depth (`top_k`), and prompt design are the primary tuning levers. +- API-based models are simplest for small corpora; open-source models on CHTC GPUs avoid API costs for larger workloads. - Always cite retrieved chunks for reproducibility and transparency. -- Embeddings are computed once and reused; generation cost scales with query volume. :::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/08-Advanced-HTCondor-workflows.md b/episodes/08-Advanced-HTCondor-workflows.md new file mode 100644 index 00000000..fdc8051f --- /dev/null +++ b/episodes/08-Advanced-HTCondor-workflows.md @@ -0,0 +1,504 @@ +--- +title: "Bonus: Advanced HTCondor Workflows" +teaching: 15 +exercises: 10 +--- + +:::::::::::::::::::::::::::::::::::::: questions + +- How do I chain multiple HTCondor jobs into a multi-step workflow? 
+- How can I debug a running job or match jobs to specific hardware? +- What happens when a job fails, and how do I handle retries automatically? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: objectives + +- Write a DAGMan workflow that chains preprocessing, training, and evaluation jobs. +- Submit and monitor DAG workflows with `condor_submit_dag` and `condor_q -dag`. +- Use wrapper scripts, `condor_ssh_to_job`, and ClassAd requirements to control job execution. +- Configure automatic retries and failure handling in DAGMan. +- Access OSPool resources with `+WantFlocking` and `+WantGlidein`. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: callout + +### Bonus episode + +This episode is not part of the standard workshop flow. It covers advanced HTCondor features for building multi-step ML workflows on CHTC. Contributions and feedback are welcome — open an issue or pull request on the [lesson repository](https://github.com/qualiaMachine/Intro_GCP_for_ML). + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Why multi-step workflows? + +In earlier episodes we submitted individual HTCondor jobs — one submit file, one execution. Real ML projects almost always involve a pipeline of steps: + +1. **Preprocess** raw data into training-ready format. +2. **Train** a model on the preprocessed data. +3. **Evaluate** the trained model on a held-out test set. + +You could submit these one at a time and wait, but that is manual, error-prone, and does not scale. HTCondor's **DAGMan** (Directed Acyclic Graph Manager) lets you define the entire pipeline in a single file and submit it as one unit. DAGMan handles the ordering, monitors each step, and can retry failed jobs automatically. + + +## DAGMan: Directed Acyclic Graph Manager + +A DAG file describes your workflow as a graph of jobs and dependencies. 
Each node is an HTCondor job (defined by a `.sub` file), and edges define which jobs must finish before others can start. + +### DAG file syntax + +A DAG file uses a small set of keywords: + +- `JOB JobName SubmitFile` — defines a node in the graph. +- `PARENT ParentJobName CHILD ChildJobName` — defines dependency edges. +- `RETRY JobName NumberOfRetries` — retries a node up to `NumberOfRetries` times if it fails. + +Here is a complete DAG file for a train-then-evaluate pipeline: + +``` +# workflow.dag +JOB preprocess preprocess.sub +JOB train train.sub +JOB evaluate evaluate.sub + +PARENT preprocess CHILD train +PARENT train CHILD evaluate + +RETRY train 2 +``` + +This tells DAGMan: + +1. Run the `preprocess` job first. +2. When `preprocess` succeeds, run `train`. +3. When `train` succeeds, run `evaluate`. +4. If `train` fails, retry it up to 2 additional times before giving up. + +### Submitting a DAG + +Submit the entire workflow with a single command: + +```bash +condor_submit_dag workflow.dag +``` + +DAGMan itself runs as a lightweight job on the submit server. It watches the child jobs and advances through the graph as nodes complete. + +### Monitoring a DAG + +Use `condor_q` with the `-dag` flag to see the DAG structure: + +```bash +condor_q -dag +``` + +This shows each node's status (idle, running, completed, failed) and the overall DAG progress. You can also check the log file that DAGMan creates automatically: + +```bash +cat workflow.dag.dagman.out +``` + +This log records every state transition — when each node was submitted, started, succeeded, or failed. + + +## Wrapper scripts + +Each node in your DAG points to a submit file, and each submit file specifies an `executable`.
For ML workflows it is common to use a **wrapper script** — a short shell script that sets up the environment before running your Python code: + +```bash +#!/bin/bash +# run_train.sh — wrapper script for the training step + +# Unpack the Python environment (transferred as a tarball) +tar -xzf python_env.tar.gz +export PATH=$PWD/python_env/bin:$PATH + +# Run the training script +python3 train_nn.py --train data/train.npz --val data/val.npz --epochs 500 + +# Save exit code so HTCondor sees the real status +exit $? +``` + +The corresponding submit file references this wrapper: + +``` +# train.sub +executable = run_train.sh +arguments = + +transfer_input_files = python_env.tar.gz, train_nn.py, data +transfer_output_files = model/, metrics.json + +log = train_$(Cluster).log +output = train_$(Cluster).out +error = train_$(Cluster).err + +request_cpus = 1 +request_memory = 4GB +request_disk = 2GB + +queue +``` + +::::::::::::::::::::::::::::::::::::: callout + +### The `run_*.sh` naming convention + +Using a consistent naming pattern like `run_preprocess.sh`, `run_train.sh`, and `run_evaluate.sh` makes it easy to see which wrapper belongs to which pipeline step. Each wrapper handles environment setup so your Python scripts stay portable. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +## Debugging running jobs with `condor_ssh_to_job` + +Sometimes a job is running but producing unexpected output. Instead of waiting for it to fail and reading log files, you can SSH directly into the running job's execution environment: + +```bash +condor_ssh_to_job ClusterId.ProcId +``` + +This opens a shell session on the execute node, inside the job's working directory. You can inspect files, check environment variables, and even run quick diagnostic commands. When you exit the session, the job continues normally.
+ +```bash +# Example: check what files the job has produced so far +condor_ssh_to_job 12345.0 +ls -la +cat metrics.json +exit +``` + +::::::::::::::::::::::::::::::::::::: callout + +### When `condor_ssh_to_job` is not available + +Not all pools enable SSH access to running jobs. On CHTC, this feature is generally available for jobs running on CHTC-owned hardware. Jobs running on OSPool resources via flocking may not support it. Check with your system administrators if the command fails. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +## Job requirements and ClassAds + +HTCondor uses a system called **ClassAds** (Classified Advertisements) to match jobs to machines. Every machine advertises its properties (CPUs, memory, GPU type, operating system), and every job advertises its requirements. The HTCondor matchmaker pairs them up. + +### Requesting specific hardware + +You can add a `requirements` line to your submit file to target specific hardware: + +``` +# Request a machine with at least 8 CPUs and a GPU +request_cpus = 8 +request_memory = 16GB +request_gpus = 1 + +# Only run on machines with NVIDIA A100 GPUs +requirements = (CUDADeviceName == "NVIDIA A100-SXM4-80GB") +``` + +### Viewing available ClassAds + +To see what machines are available and what they advertise: + +```bash +# List all GPUs available in the pool +condor_status -compact -constraint 'TotalGpus > 0' + +# See detailed ClassAds for a specific machine +condor_status -long MachineName +``` + + +## Job priorities and scheduling + +When the pool is busy, HTCondor decides which jobs run first based on **priority**. You can influence this with the `priority` keyword in your submit file: + +``` +# Higher numbers = higher priority (runs sooner) +priority = 10 +``` + +Within your own jobs, higher-priority jobs will be scheduled before lower-priority ones. Note that this only affects your relative ordering — it does not let you jump ahead of other users.
HTCondor uses a **fair-share** scheduling policy across users. + +You can check your current priority standing with: + +```bash +condor_userprio +``` + + +## Handling job failures + +Jobs fail for many reasons: out-of-memory errors, network timeouts, transient hardware issues. HTCondor and DAGMan provide several mechanisms for handling failures gracefully. + +### Retries in DAGMan + +The simplest approach is the `RETRY` keyword in your DAG file: + +``` +RETRY train 2 +``` + +If the `train` node exits with a non-zero exit code, DAGMan will resubmit it up to 2 more times. This is useful for transient failures (e.g., a preempted job or a temporary network error). + +### Holding and releasing jobs + +In your submit file, you can use **policy expressions** to hold a job that exits abnormally and release it after a delay: + +``` +# Hold the job if it exits with a non-zero code +on_exit_hold = (ExitCode != 0) +on_exit_hold_reason = "Job exited with non-zero code; holding for inspection." + +# Automatically release held jobs after 5 minutes (300 seconds), +# but only if fewer than 3 release attempts have been made +periodic_release = (HoldReasonCode == 3) && (NumJobStarts < 3) && \ + ((time() - EnteredCurrentStatus) > 300) +``` + +This pattern is helpful when failures are intermittent — the job is held so you can inspect it, but it also gets a second chance automatically. + +### Checking why a job was held + +```bash +condor_q -hold +``` + +This shows the hold reason, which tells you whether the job ran out of memory, hit a time limit, or failed for another reason. + + +## Accessing OSPool resources with flocking + +CHTC is part of the **OSPool** (Open Science Pool), a nationwide network of computing resources. 
By adding two lines to your submit file, your jobs can "flock" to machines outside CHTC — both elsewhere on campus and at other institutions — when CHTC is busy: + +``` ++WantFlocking = true ++WantGlidein = true +``` + +- `+WantFlocking` allows your jobs to run on other HTCondor pools on the UW-Madison campus (the UW Grid). +- `+WantGlidein` allows your jobs to run on OSPool resources contributed by other institutions, which are provisioned dynamically as glideins. + +::::::::::::::::::::::::::::::::::::: callout + +### Flocking considerations + +When your jobs flock to remote sites, there are a few things to keep in mind: + +- **Transfer everything** — remote machines do not have access to your home directory. Make sure all input files are listed in `transfer_input_files`. +- **Software portability** — pack your Python environment into a tarball or use a container. Do not rely on software installed on CHTC submit servers. +- **Longer queue times** — flocked jobs may wait longer to start since they compete with other OSPool users. +- **No `condor_ssh_to_job`** — you typically cannot SSH into jobs running on remote sites. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +## Putting it all together: a complete DAG workflow + +Here is a full example of a three-step ML pipeline managed by DAGMan.
+ +**Step 1: Write the submit files** + +``` +# preprocess.sub +executable = run_preprocess.sh +transfer_input_files = python_env.tar.gz, preprocess.py, raw_data/ +transfer_output_files = processed_data/ + +log = preprocess_$(Cluster).log +output = preprocess_$(Cluster).out +error = preprocess_$(Cluster).err + +request_cpus = 1 +request_memory = 4GB +request_disk = 2GB + +queue +``` + +``` +# train.sub +executable = run_train.sh +transfer_input_files = python_env.tar.gz, train_nn.py, processed_data/ +transfer_output_files = model/, metrics.json + +log = train_$(Cluster).log +output = train_$(Cluster).out +error = train_$(Cluster).err + +request_cpus = 4 +request_memory = 8GB +request_disk = 4GB +request_gpus = 1 + ++WantFlocking = true ++WantGlidein = true + +queue +``` + +``` +# evaluate.sub +executable = run_evaluate.sh +transfer_input_files = python_env.tar.gz, evaluate.py, model/, processed_data/ +transfer_output_files = results/ + +log = evaluate_$(Cluster).log +output = evaluate_$(Cluster).out +error = evaluate_$(Cluster).err + +request_cpus = 1 +request_memory = 4GB +request_disk = 2GB + +queue +``` + +**Step 2: Write the DAG file** + +``` +# ml_pipeline.dag +JOB preprocess preprocess.sub +JOB train train.sub +JOB evaluate evaluate.sub + +PARENT preprocess CHILD train +PARENT train CHILD evaluate + +RETRY train 2 +RETRY evaluate 1 +``` + +**Step 3: Submit and monitor** + +```bash +# Submit the full pipeline +condor_submit_dag ml_pipeline.dag + +# Watch the DAG progress +condor_q -dag + +# Check the DAGMan log for detailed status +tail -f ml_pipeline.dag.dagman.out +``` + +When the DAG completes successfully, all three steps have run in sequence, and your results are in the `results/` directory. + + +::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 1 — Write a DAG file + +Given two submit files `clean_data.sub` and `train_model.sub`, write a DAG file that: + +1. Runs `clean_data` first. +2. Runs `train_model` after `clean_data` succeeds. +3. 
Retries `train_model` up to 3 times on failure. + +:::::::::::::::::::::::::::::::::::: solution + +``` +# my_pipeline.dag +JOB clean_data clean_data.sub +JOB train_model train_model.sub + +PARENT clean_data CHILD train_model + +RETRY train_model 3 +``` + +Submit with: + +```bash +condor_submit_dag my_pipeline.dag +``` + +::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 2 — Add GPU requirements + +You have a training job that needs an NVIDIA GPU with at least 40 GB of memory. Modify the submit file snippet below to add the appropriate requirements: + +``` +executable = run_train.sh +request_cpus = 4 +request_memory = 16GB +request_gpus = 1 + +# Add your requirements line here + +queue +``` + +:::::::::::::::::::::::::::::::::::: solution + +``` +executable = run_train.sh +request_cpus = 4 +request_memory = 16GB +request_gpus = 1 + +requirements = (CUDAGlobalMemoryMb >= 40000) + ++WantFlocking = true ++WantGlidein = true + +queue +``` + +The `CUDAGlobalMemoryMb` ClassAd attribute reports GPU memory in megabytes. Adding flocking increases the chance of matching a machine with a large GPU. You can discover available GPU ClassAd attributes by running `condor_status -compact -constraint 'TotalGpus > 0'`. + +::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 3 — Diagnose a held job + +You check on your jobs and see one is held: + +``` +$ condor_q + +-- Schedd: submit1.chtc.wisc.edu + ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD + 98765.0 jdoe 3/25 10:30 0+00:00:00 H 0 4.0 run_train.sh +``` + +What command would you run to find out why it is held? What are two common reasons a job might be placed on hold? + +:::::::::::::::::::::::::::::::::::: solution + +Run: + +```bash +condor_q -hold 98765.0 +``` + +Two common reasons: + +1. 
**Exceeded memory request** — the job used more memory than `request_memory` and was killed by the system. +2. **`on_exit_hold` policy** — the job exited with a non-zero exit code and the submit file included `on_exit_hold = (ExitCode != 0)`. + +Other possibilities include exceeding disk quota, missing input files, or a Docker image that could not be pulled. The hold reason message from `condor_q -hold` will tell you exactly which one occurred. + +::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +::::::::::::::::::::::::::::::::::::: keypoints + +- DAGMan lets you define multi-step workflows in a single `.dag` file using `JOB` and `PARENT/CHILD` syntax. +- Submit an entire pipeline with `condor_submit_dag` and monitor it with `condor_q -dag`. +- Wrapper scripts (`run_*.sh`) set up the execution environment before calling your Python code. +- `condor_ssh_to_job` lets you debug a running job by opening a shell on the execute node. +- ClassAd `requirements` expressions let you match jobs to specific hardware (e.g., GPU type or memory). +- DAGMan `RETRY`, `on_exit_hold`, and `periodic_release` provide automatic failure handling. +- `+WantFlocking` and `+WantGlidein` give your jobs access to the nationwide OSPool when CHTC is busy. + +:::::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/08-CLI-workflows.md b/episodes/08-CLI-workflows.md deleted file mode 100644 index 52a4cc5f..00000000 --- a/episodes/08-CLI-workflows.md +++ /dev/null @@ -1,434 +0,0 @@ ---- -title: "Bonus: CLI Workflows Without Notebooks" -teaching: 15 -exercises: 10 ---- - -:::::::::::::::::::::::::::::::::::::: questions - -- How do I submit Vertex AI training jobs from the command line instead of a Jupyter notebook? -- What does authentication look like when working outside of a Workbench VM? -- Can I manage GCS buckets, training jobs, and endpoints entirely from a terminal? 
- -:::::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: objectives - -- Authenticate with GCP and set a default project using the `gcloud` CLI. -- Upload data to GCS and submit a Vertex AI custom training job from the terminal. -- Monitor, cancel, and clean up jobs using `gcloud ai` commands. -- Understand when CLI workflows are more practical than notebooks. - -:::::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: callout - -### Bonus episode - -This episode is not part of the standard workshop flow. It covers CLI alternatives to the notebook-based workflows from earlier episodes. Contributions and feedback are welcome — open an issue or pull request on the [lesson repository](https://github.com/qualiaMachine/Intro_GCP_for_ML). - -:::::::::::::::::::::::::::::::::::::::::::::::::: - -## Why use the CLI? - -Throughout this workshop we used Jupyter notebooks on a Vertex AI Workbench VM as our control center. That setup is great for teaching, but it is not the only way — and sometimes it is not the best way. Common situations where a terminal-based workflow makes more sense: - -- **Automation and CI/CD** — You want a GitHub Actions workflow or a cron job to kick off training runs. Notebooks require manual interaction; shell scripts do not. -- **SSH into an HPC cluster or remote server** — You already have a terminal session and do not want to spin up a Workbench VM just to submit a job. -- **Reproducibility** — A shell script checked into version control is easier to review and reproduce than a notebook with hidden state. -- **Cost** — If all you need is to submit a job, paying for a Workbench VM while you wait is unnecessary. You can submit from Cloud Shell (free) or your laptop. - -Everything we did with the Python SDK in Episodes 4–6 has an equivalent `gcloud` command. This episode walks through the key ones. 
- - -## Step 1: Install and authenticate - -If you are on a Workbench VM, the `gcloud` CLI is already installed and authenticated via the VM's service account. On your laptop or another machine you need to install and log in. - -### Install the gcloud CLI - -Follow the instructions for your platform at [cloud.google.com/sdk/docs/install](https://cloud.google.com/sdk/docs/install). On most systems this is a single installer or package manager command: - -```bash -# macOS (Homebrew) -brew install --cask google-cloud-sdk - -# Ubuntu / Debian -sudo apt-get install google-cloud-cli - -# Windows — download the installer from the link above -``` - -### Authenticate - -```bash -# Interactive browser-based login (laptop / desktop) -gcloud auth login - -# Set your default project so you don't need --project on every command -gcloud config set project YOUR_PROJECT_ID - -# Set a default region (optional but saves typing) -gcloud config set compute/region us-central1 -``` - -On a Workbench VM these steps are already done for you — the VM's attached service account provides credentials automatically. This is the authentication convenience mentioned in [Episode 2](02-Notebooks-as-controllers.md). - -### Application Default Credentials - -If you also want to use the Python SDK (e.g., `aiplatform.init()`) outside of a Workbench VM, you need Application Default Credentials (ADC): - -```bash -gcloud auth application-default login -``` - -This stores a credential file locally that Google client libraries pick up automatically. Without it, Python SDK calls from your laptop will fail with an authentication error. - - -## Step 2: Upload data to GCS - -In Episode 3 we uploaded data through the [Cloud Console](https://console.cloud.google.com/storage/browser). 
From the CLI the equivalent is: - -```bash -# Create a bucket (if it doesn't already exist) -gcloud storage buckets create gs://doe-titanic \ - --location=us-central1 - -# Upload the Titanic CSV files -gcloud storage cp ~/Downloads/data/titanic_train.csv gs://doe-titanic/ -gcloud storage cp ~/Downloads/data/titanic_test.csv gs://doe-titanic/ - -# Verify -gcloud storage ls gs://doe-titanic/ -``` - -::::::::::::::::::::::::::::::::::::: callout - -### gsutil vs gcloud storage - -Older tutorials may reference `gsutil`. Google now recommends `gcloud storage` as the primary CLI for Cloud Storage. The commands are very similar (`gsutil cp` → `gcloud storage cp`), but `gcloud storage` is faster for large transfers and receives more active development. - -:::::::::::::::::::::::::::::::::::::::::::::::::: - - -## Step 3: Submit a training job - -In Episode 4 we used the Python SDK to create and run a `CustomTrainingJob`. The `gcloud` equivalent is `gcloud ai custom-jobs create`. You provide a JSON or YAML config file that describes the job. - -### Write a job config file - -Create a file called `xgb_job.yaml`: - -```yaml -# xgb_job.yaml — Vertex AI custom training job config -# Note: display_name goes on the command line (--display-name), not in this file. -# The --config file describes the job *spec* only, using snake_case field names. -worker_pool_specs: - - machine_spec: - machine_type: n1-standard-4 - replica_count: 1 - container_spec: - image_uri: us-docker.pkg.dev/vertex-ai/training/xgboost-cpu.2-1:latest - args: - - "--train=gs://doe-titanic/titanic_train.csv" - - "--max_depth=6" - - "--eta=0.3" - - "--subsample=0.8" - - "--colsample_bytree=0.8" - - "--num_round=100" -base_output_directory: - output_uri_prefix: gs://doe-titanic/artifacts/xgb/cli-run/ -``` - -Replace the bucket name and hyperparameters to match your setup. 
- -### Submit the job - -```bash -gcloud ai custom-jobs create \ - --region=us-central1 \ - --display-name=cli-xgb-titanic \ - --config=xgb_job.yaml -``` - -::::::::::::::::::::::::::::::::::::: callout - -### Windows users — line continuation syntax - -The `\` at the end of each line is a **Linux / macOS** line continuation character. It does **not** work in the Windows Command Prompt. You have three options: - -1. **Put the command on one line** (easiest): - - ``` - gcloud ai custom-jobs create --region=us-central1 --display-name=cli-xgb-titanic --config=xgb_job.yaml - ``` - -2. **Use the `^` continuation character** (Windows CMD): - - ``` - gcloud ai custom-jobs create ^ - --region=us-central1 ^ - --display-name=cli-xgb-titanic ^ - --config=xgb_job.yaml - ``` - -3. **Use the backtick continuation character** (PowerShell): - - ``` - gcloud ai custom-jobs create ` - --region=us-central1 ` - --display-name=cli-xgb-titanic ` - --config=xgb_job.yaml - ``` - -This applies to **all** multi-line commands in this episode, not just this one. - -:::::::::::::::::::::::::::::::::::::::::::::::::: - -Vertex AI provisions a VM, runs your training container, and writes outputs to the `base_output_directory`. The job runs on GCP's infrastructure, not on your machine — you can close your terminal and it keeps going. - -### GPU example (PyTorch) - -For the PyTorch GPU job from Episode 5, the config includes an `acceleratorType` and `acceleratorCount`. 
Note that the argument names must match exactly what `train_nn.py` expects (`--train`, `--val`, `--learning_rate`, etc.): - -```yaml -# pytorch_gpu_job.yaml -worker_pool_specs: - - machine_spec: - machine_type: n1-standard-8 - accelerator_type: NVIDIA_TESLA_T4 - accelerator_count: 1 - replica_count: 1 - container_spec: - image_uri: us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.2-4.py310:latest - args: - - "--train=gs://doe-titanic/data/train_data.npz" - - "--val=gs://doe-titanic/data/val_data.npz" - - "--epochs=500" - - "--learning_rate=0.001" - - "--patience=50" -base_output_directory: - output_uri_prefix: gs://doe-titanic/artifacts/pytorch/cli-gpu-run/ -``` - -Submit the same way: - -```bash -gcloud ai custom-jobs create \ - --region=us-central1 \ - --display-name=cli-pytorch-titanic-gpu \ - --config=pytorch_gpu_job.yaml -``` - - -## Step 4: Monitor jobs - -### List jobs - -```bash -gcloud ai custom-jobs list --region=us-central1 -``` - -This prints a table with job ID, display name, state (`JOB_STATE_RUNNING`, `JOB_STATE_SUCCEEDED`, etc.), and creation time. - -### Stream logs - -```bash -gcloud ai custom-jobs stream-logs JOB_ID --region=us-central1 -``` - -This is the CLI equivalent of watching the log panel in a notebook — output streams to your terminal in real time. - -### Hyperparameter tuning jobs - -The `gcloud ai hp-tuning-jobs` family works the same way: - -```bash -gcloud ai hp-tuning-jobs list --region=us-central1 -gcloud ai hp-tuning-jobs stream-logs JOB_ID --region=us-central1 -``` - -Creating HP tuning jobs via YAML is more verbose — for complex tuning configs, the Python SDK ([Episode 6](06-Hyperparameter-tuning.md)) is often more readable. - - -## Step 5: Check for running resources (don't skip this) - -The biggest risk with CLI workflows is submitting a job — or leaving a notebook VM running — and forgetting about it. 
Unlike a Workbench notebook where you can see tabs and running kernels, the CLI gives you no visual reminder that something is still billing you. Jobs and VMs keep running whether or not your terminal is open. - -**Get in the habit of checking before you walk away:** - -```bash -# Training jobs still running -gcloud ai custom-jobs list --region=us-central1 --filter="state=JOB_STATE_RUNNING" - -# HP tuning jobs still running -gcloud ai hp-tuning-jobs list --region=us-central1 --filter="state=JOB_STATE_RUNNING" - -# Endpoints still deployed (these bill 24/7, even when idle) -gcloud ai endpoints list --region=us-central1 - -# Workbench notebook VMs still running -gcloud workbench instances list --location=us-central1-a -``` - -If anything shows up that you don't need, shut it down: - -```bash -# Cancel a running training job -gcloud ai custom-jobs cancel JOB_ID --region=us-central1 - -# Undeploy a model from an endpoint (stops the per-hour charge) -gcloud ai endpoints undeploy-model ENDPOINT_ID \ - --region=us-central1 \ - --deployed-model-id=DEPLOYED_MODEL_ID - -# Stop a Workbench notebook VM -gcloud workbench instances stop INSTANCE_NAME --location=us-central1-a -``` - -::::::::::::::::::::::::::::::::::::: callout - -### Cost leaks are silent - -A forgotten endpoint bills ~ `$1.50`–`$3`/hour depending on machine type — that's **`$36`–`$72`/day** doing nothing. A GPU training job you accidentally submitted twice burns money until you cancel it. There's no pop-up warning; you'll only find out on your billing dashboard or when you hit a quota. - -Build the habit: **every time you finish a CLI session, run the check commands above.** For a more thorough cleanup checklist, see [Episode 9](09-Resource-management-cleanup.md). 
- -:::::::::::::::::::::::::::::::::::::::::::::::: - - -## Step 6: Download results - -After a job succeeds, download artifacts from GCS: - -```bash -# List what the job wrote -gcloud storage ls gs://doe-titanic/artifacts/xgb/cli-run/ - -# Download everything locally -gcloud storage cp -r gs://doe-titanic/artifacts/xgb/cli-run/ ./local_results/ -``` - -You can then load the model and metrics in a local Python session for evaluation — no Workbench VM required. - - -## Putting it all together: a shell script - -Here is a minimal end-to-end script that submits a training job and waits for it to finish. You could check this into your repository or trigger it from CI. - -```bash -#!/usr/bin/env bash -set -euo pipefail - -PROJECT_ID="your-project-id" -REGION="us-central1" -BUCKET="doe-titanic" -RUN_ID=$(date +%Y%m%d-%H%M%S) - -# Upload latest training data -gcloud storage cp ./data/titanic_train.csv gs://${BUCKET}/ - -# Submit the job -gcloud ai custom-jobs create \ - --region=${REGION} \ - --display-name="xgb-${RUN_ID}" \ - --worker-pool-spec=machine-type=n1-standard-4,replica-count=1,container-image-uri=us-docker.pkg.dev/vertex-ai/training/xgboost-cpu.2-1:latest \ - --args="--train=gs://${BUCKET}/titanic_train.csv,--max_depth=6,--eta=0.3,--num_round=100" \ - --base-output-dir=gs://${BUCKET}/artifacts/xgb/${RUN_ID}/ - -echo "Job submitted. Check status with:" -echo " gcloud ai custom-jobs list --region=${REGION}" -``` - -::::::::::::::::::::::::::::::::::::: callout - -### Cloud Shell — free CLI access - -If you do not want to install the `gcloud` CLI locally, you can use **Cloud Shell** directly in the [Google Cloud Console](https://console.cloud.google.com/). It gives you a free, temporary Linux VM with `gcloud` pre-installed and authenticated. Click the terminal icon (">_") in the top-right corner of the Cloud Console to open it. - -Cloud Shell is a good option for one-off job submissions or quick resource checks without spinning up a Workbench instance. 
- -:::::::::::::::::::::::::::::::::::::::::::::::::: - - -::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 1 — Submit a job from the CLI - -Using the XGBoost YAML config shown above (adjusted for your bucket name), submit a training job from Cloud Shell or your local terminal. Verify it appears in the Vertex AI Console under **Training > Custom Jobs**. - -:::::::::::::::::::::::::::::::::::: solution - -```bash -# Edit xgb_job.yaml with your bucket name, then: -gcloud ai custom-jobs create --region=us-central1 --display-name=cli-xgb-titanic --config=xgb_job.yaml - -# Confirm it's running: -gcloud ai custom-jobs list --region=us-central1 -``` - -::::::::::::::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 2 — Stream logs in real time - -Find the job ID from the previous challenge and stream its logs to your terminal. Compare this experience to watching logs in the notebook. - -:::::::::::::::::::::::::::::::::::: solution - -```bash -# Get the job ID from the list output -gcloud ai custom-jobs list --region=us-central1 - -# Stream logs (replace JOB_ID with the actual ID) -gcloud ai custom-jobs stream-logs JOB_ID --region=us-central1 -``` - -::::::::::::::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 3 — Download and inspect artifacts - -After your job completes, download the model and metrics files to your local machine. Load `metrics.json` in Python and verify the accuracy value. - -:::::::::::::::::::::::::::::::::::: solution - -```bash -gcloud storage cp -r gs://YOUR_BUCKET/artifacts/xgb/cli-run/ ./results/ -python3 -c "import json; print(json.load(open('./results/model/metrics.json')))" -``` - -::::::::::::::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::::: - - -## When to use notebooks vs. 
CLI - -| | Notebooks | CLI / scripts | -|---|---|---| -| **Best for** | Exploration, teaching, visualization | Automation, CI/CD, reproducibility | -| **Auth setup** | Automatic on Workbench VMs | Requires `gcloud auth login` or service account keys | -| **Cost** | Pay for VM uptime while notebook is open | Free from Cloud Shell; zero cost from laptop | -| **State management** | Hidden state can cause issues | Stateless scripts are easier to debug | -| **Interactivity** | Rich (plots, widgets, markdown) | Terminal only (or pipe to other tools) | - -Most real-world ML/AI projects use both: notebooks for early experimentation and CLI/scripts for production runs. - -::::::::::::::::::::::::::::::::::::: keypoints - -- Every Vertex AI operation available in the Python SDK has an equivalent `gcloud` CLI command. -- `gcloud ai custom-jobs create` submits training jobs from any terminal — no notebook required. -- Use `gcloud auth login` and `gcloud auth application-default login` to authenticate outside of Workbench VMs. -- Cloud Shell provides free, pre-authenticated CLI access directly in the browser. -- Shell scripts checked into version control are more reproducible than notebooks with hidden state. -- CLI workflows give no visual reminder of running resources — always check for active jobs, endpoints, and VMs before walking away. -- Notebooks and CLI workflows are complementary — use each where it fits best. - -:::::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/09-Resource-management-best-practices.md b/episodes/09-Resource-management-best-practices.md new file mode 100644 index 00000000..2e5b1360 --- /dev/null +++ b/episodes/09-Resource-management-best-practices.md @@ -0,0 +1,481 @@ +--- +title: "Resource Management Best Practices on CHTC" +teaching: 25 +exercises: 10 +--- + +:::::::::::::::::::::::::::::::::::::: questions + +- How do I check my disk usage and job history on CHTC? 
+- What tools does HTCondor provide for monitoring and managing running jobs? +- How do I right-size my resource requests so jobs start faster? +- What are CHTC's runtime limits and how do I handle long-running jobs? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: objectives + +- Check disk usage and quota on the submit node using `du` and quota commands. +- Monitor, inspect, and remove HTCondor jobs with `condor_q`, `condor_watch_q`, and `condor_rm`. +- Use job ClassAds and log files to right-size CPU, memory, and GPU requests. +- Identify CHTC's runtime categories and know when checkpointing is needed. +- Apply an end-of-session cleanup checklist to be a good citizen on shared resources. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +CHTC is free for UW-Madison researchers — there are no billing accounts and no surprise invoices. But "free" does not mean "unlimited." CHTC is a **shared resource**: every CPU, GPU, and gigabyte of storage you hold is unavailable to another researcher. Being a good citizen means requesting only what you need, monitoring your jobs, and cleaning up when you are done. + +This episode covers the practical tools and habits that keep your work running smoothly and the cluster healthy for everyone. + + +## Checking your disk usage + +Your `/home` directory has a quota (typically ~20 GB). If you exceed it, jobs may fail to write output and new jobs will not submit. Check your usage regularly: + +```bash +# How much space am I using in my home directory? 
+du -sh /home/$USER + +# Break it down by subdirectory (top-level only) +du -sh /home/$USER/*/ +``` + +To check your quota and how close you are to the limit: + +```bash +quota -vs +``` + +:::::::::::::::::::::::::::::::::::::: callout + +#### Storage tiers as a reminder + +| Location | Purpose | Typical quota | +|----------|---------|---------------| +| `/home/$USER` | Code, submit files, small data | ~20 GB | +| `/staging/$USER` | Large input/output files for jobs | ~500 GB (by request) | +| SQUID (`/squid/$USER`) | Publicly readable large files | ~100 GB (by request) | + +Large datasets and model checkpoints should go in `/staging`, not `/home`. See the [CHTC file system guide](https://chtc.cs.wisc.edu/uw-research-computing/file-avail-largedata) for details. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +## Checking your job history + +After jobs complete (or fail), use `condor_history` to review what happened: + +```bash +# Your recent jobs (most recent first) +condor_history $USER + +# Show specific columns: job ID, status, runtime, memory used +condor_history $USER -af ClusterId JobStatus RemoteWallClockTime MemoryUsage +``` + +The `JobStatus` field uses numeric codes: **1** = Idle, **2** = Running, **3** = Removed, **4** = Completed, **5** = Held. + +This is especially useful for debugging: if a job completed but produced bad output, you can check how long it ran and how much memory it actually used. + + +## Monitoring running jobs + +### condor_q — your primary dashboard + +```bash +# All your jobs in the queue +condor_q + +# Just your jobs (explicit) +condor_q $USER + +# Show only running jobs +condor_q -running + +# Show only held jobs (these need attention!) +condor_q -held +``` + +### condor_watch_q — live updating view + +`condor_watch_q` refreshes automatically, like `top` for your jobs: + +```bash +condor_watch_q +``` + +Press `Ctrl+C` to exit. This is handy when you are waiting for jobs to start or watching a batch complete. 
+ +### Checking for held jobs + +Held jobs are stuck and will not run until you fix the problem. Common causes include exceeded disk or memory requests, missing input files, and Docker image pull failures. + +```bash +# See why jobs are held +condor_q -held + +# More detail on a specific held job +condor_q JOB_ID -af HoldReason +``` + +:::::::::::::::::::::::::::::::::::::: callout + +#### Do not ignore held jobs + +Held jobs sit in the queue consuming your job slot allowance but doing no useful work. Check for them regularly and either fix the issue and release them (`condor_release JOB_ID`) or remove them (`condor_rm JOB_ID`). + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +## Removing jobs + +```bash +# Remove a single job +condor_rm JOB_ID + +# Remove all your jobs (careful!) +condor_rm $USER + +# Remove a specific cluster of jobs +condor_rm CLUSTER_ID +``` + +Use `condor_rm` when a job is stuck, when you realize you submitted with wrong parameters, or when you need to free up your queue slots. + + +## Understanding job ClassAds and resource usage + +Every HTCondor job carries a set of **ClassAds** — key-value attributes that describe the job's requests and actual usage. These are your best tool for right-sizing future jobs. + +```bash +# Show ALL ClassAds for a job (verbose) +condor_q JOB_ID -l + +# Show specific attributes (auto-format) +condor_q JOB_ID -af RequestCpus RequestMemory RequestDisk + +# Check actual usage of a running job +condor_q JOB_ID -af MemoryUsage DiskUsage RemoteWallClockTime +``` + +After a job completes, use `condor_history` with the same `-af` flags: + +```bash +condor_history JOB_ID -af RemoteWallClockTime MemoryUsage DiskUsage_RAW RequestMemory RequestCpus +``` + +Comparing **requested** resources to **actual** usage tells you whether to adjust your submit file for the next run. + + +## Right-sizing resource requests + +This is one of the most impactful things you can do as a CHTC user. 
Over-requesting resources does not make your job run faster — it makes your job **wait longer** in the queue, because the scheduler has to find a machine with all the resources you asked for. + +### The right-sizing workflow + +1. **Start with a reasonable estimate** for your first run. +2. **Check actual usage** after the job completes (see ClassAds above, or check the job log file). +3. **Adjust your submit file** for the next run — request ~20% more than the actual usage as a safety margin. + +### What to look for in job logs + +Every HTCondor job writes a log file (specified by the `log` line in your submit file). At the end of a completed job, HTCondor appends a summary like: + +``` +Partitionable Resources : Usage Request Allocated + Cpus : 4 4 + Disk (KB) : 35000 1048576 4184124 + Memory (MB) : 870 8192 8192 +``` + +In this example, the job requested 8192 MB of memory but only used 870 MB. You could safely request `1024` or `1500` MB next time, which would let the job match to more machines and start sooner. 
+ +### Common over-requesting mistakes + +| Resource | Mistake | Better approach | +|----------|---------|-----------------| +| **CPUs** | Requesting 8 CPUs for a single-threaded Python script | Request 1 CPU unless your code uses multiprocessing/threading | +| **Memory** | Requesting 32 GB "just in case" | Check actual usage, request actual + 20% buffer | +| **GPUs** | Requesting 2 GPUs for code that uses only 1 | Only request multiple GPUs if your code explicitly supports multi-GPU | +| **Disk** | Requesting 100 GB when output is 2 GB | Check output sizes from a test run, add buffer | + +::::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 1: Right-size a request + +A colleague shows you their submit file: + +``` +request_cpus = 8 +request_memory = 16GB +request_disk = 50GB +request_gpus = 1 +``` + +Their job log shows this usage summary: + +``` +Partitionable Resources : Usage Request Allocated + Cpus : 8 8 + Disk (KB) : 524288 52428800 52428800 + Gpus : 1 1 + Memory (MB) : 2048 16384 16384 +``` + +Their Python training script uses PyTorch with a single `model.to("cuda")` call and no `DataParallel` or multiprocessing. What changes would you suggest to their submit file? + +::::::::::::::::: solution + +The job used only 2048 MB (~2 GB) of memory out of 16 GB requested, and the disk usage was ~512 MB out of 50 GB. Since the script is single-threaded Python (no multiprocessing) and uses a single GPU, the CPUs are also over-requested. Suggested changes: + +``` +request_cpus = 1 +request_memory = 3GB +request_disk = 1GB +request_gpus = 1 +``` + +This requests ~50% more memory than the job actually used (3 GB vs. ~2 GB) and roughly double the measured disk usage (1 GB vs. ~512 MB) as a safety margin, drops CPUs to 1 since the script is single-threaded, and keeps the single GPU. The job will match to far more machines and likely start much sooner.
+ +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +## Storage cleanup + +After jobs complete, clean up files you no longer need — especially in `/staging`: + +```bash +# Check staging usage +du -sh /staging/$USER + +# List what's in staging +ls -lh /staging/$USER/ + +# Remove old output directories +rm -rf /staging/$USER/old_experiment_output/ + +# List large archives (>100 MB) in home that may be worth deleting — this only lists them, it does not remove anything +find /home/$USER -name "*.tar.gz" -size +100M -ls +``` + +:::::::::::::::::::::::::::::::::::::: callout + +#### Make cleanup part of your workflow + +A good habit is to add cleanup steps to your workflow: after you have copied important results to your local machine or a permanent archive, delete the copies on CHTC. The `/staging` filesystem is meant for active jobs, not long-term storage. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +## CHTC's job runtime limits + +CHTC GPU jobs are categorized by maximum runtime. You declare the category in your submit file so HTCondor can schedule your job onto an appropriate machine (CPU-only jobs are not categorized this way; they have a separate default runtime limit — 72 hours at the time of writing): + +| Category | Max runtime | Submit file setting | +|----------|------------|---------------------| +| Short | 12 hours | `+GPUJobLength = "short"` (default) | +| Medium | 24 hours | `+GPUJobLength = "medium"` | +| Long | 7 days | `+GPUJobLength = "long"` (limited availability) | + +If your job exceeds its runtime limit, HTCondor will terminate it. To avoid losing work: + +- **Estimate your runtime** from test runs on smaller data. +- **Use checkpointing** for long-running jobs (see below). +- **Break work into smaller chunks** when possible (e.g., train for fewer epochs per job and resume). + +See the [CHTC job duration guide](https://chtc.cs.wisc.edu/uw-research-computing/job-duration) for current policies and how to request longer runtimes. 
+ + +## Checkpointing for long-running jobs + +If your training job might exceed the runtime limit, implement **checkpointing** — periodically saving your model state so you can resume from where you left off if the job is interrupted. + +For PyTorch, a minimal checkpoint pattern looks like: + +```python +# Save checkpoint every N epochs +if epoch % checkpoint_interval == 0: + torch.save({ + 'epoch': epoch, + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'loss': loss, + }, 'checkpoint.pt') +``` + +```python +# At start of training, resume if checkpoint exists +import os +start_epoch = 0 +if os.path.exists('checkpoint.pt'): + checkpoint = torch.load('checkpoint.pt') + model.load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_epoch = checkpoint['epoch'] + 1 +``` + +HTCondor can be configured to automatically transfer checkpoint files when a job is evicted. See the [CHTC checkpointing guide](https://chtc.cs.wisc.edu/uw-research-computing/checkpoint-overview) for details on self-checkpointing and exit-code-based retry workflows. + + +## Common pitfalls + +Here are the mistakes that CHTC facilitators see most often. Avoiding these will save you time and keep the cluster healthy for everyone. + +### 1. Running heavy computation on the submit node + +The submit node is shared by everyone who logs in. Running training, large data processing, or GPU code directly on the submit node slows it down for all users. **Always submit jobs through HTCondor** — even for quick tests, consider using an interactive job: + +```bash +condor_submit -i request_cpus=1 request_memory=4GB +``` + +### 2. Over-requesting resources + +Jobs that request more CPUs, memory, or GPUs than they need wait longer in the queue because fewer machines can satisfy the request. Right-size your requests using the workflow described above. + +### 3. 
Not cleaning up /staging after large jobs + +The `/staging` filesystem is shared and has finite capacity. If you leave hundreds of gigabytes of old results sitting there, other researchers may not have space for their active jobs. Clean up after each project or experiment. + +### 4. Forgetting to check for held jobs + +Held jobs are easy to miss — they sit quietly in the queue. If you submit a batch and walk away without checking, you might come back to find that none of them ran. Always check `condor_q -held` after submitting. + +::::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 2: Diagnose the problem + +You submitted 50 jobs an hour ago. You run `condor_q` and see: + +``` +OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD +you job.sub 3/25 14:00 0 0 0 50 +``` + +All 50 jobs are held. What is your next step, and what command do you use? + +::::::::::::::::: solution + +Run `condor_q -held` to see the hold reason for your jobs. For example: + +```bash +condor_q -held +``` + +This might show something like: + +``` +012345.000: Error from slot1@e1234.chtc.wisc.edu: Failed to pull Docker image ... +``` + +Common hold reasons include: +- Docker image not found or typo in the image name +- Requested more memory or disk than any machine has +- Input file specified in `transfer_input_files` does not exist +- Permissions errors on `/staging` files + +Once you fix the underlying issue, release the jobs with `condor_release $USER` or remove them with `condor_rm $USER` and resubmit. + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +## Getting help + +CHTC has a dedicated facilitation team that helps researchers optimize their workflows. Do not hesitate to reach out: + +- **Email:** [chtc@cs.wisc.edu](mailto:chtc@cs.wisc.edu) — the primary support channel. Include your username, job IDs, and error messages. 
+- **Office hours:** CHTC holds regular drop-in office hours (check [chtc.cs.wisc.edu](https://chtc.cs.wisc.edu/) for the current schedule). These are great for getting unstuck on tricky job configurations. +- **Documentation:** The [CHTC Guides](https://chtc.cs.wisc.edu/uw-research-computing/) cover everything from getting started to advanced GPU workflows. + +:::::::::::::::::::::::::::::::::::::: callout + +#### When to email vs. when to debug yourself + +Good reasons to email CHTC support: +- Jobs are held with errors you do not understand after checking the documentation +- You need a quota increase for `/staging` or `/home` +- You need access to specific GPU types or longer runtimes +- You are unsure how to structure your workflow for CHTC + +Things to try first: +- Check `condor_q -held` and read the hold reason +- Re-read your submit file for typos +- Test with a single short job before submitting a large batch +- Search the [CHTC guides](https://chtc.cs.wisc.edu/uw-research-computing/) for your error message + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +## End-of-session checklist + +Before you log off the submit node, run through this checklist: + +```bash +# 1. Check for any running jobs +condor_q + +# 2. Check for held jobs (fix or remove them) +condor_q -held + +# 3. Check home directory usage +du -sh /home/$USER + +# 4. Check staging usage +du -sh /staging/$USER + +# 5. Remove jobs you no longer need +condor_rm JOB_ID # or condor_rm $USER to remove all + +# 6. Clean up old output files +ls -lh /staging/$USER/ +# rm -rf /staging/$USER/old_experiment/ # if no longer needed +``` + +Make this a habit. Future-you (and your fellow researchers) will thank you. + +::::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 3: End-of-session practice + +Run through the end-of-session checklist on your submit node right now. Answer these questions: + +1. How many jobs (if any) do you have in the queue? +2. Are any of your jobs held? 
If so, what is the hold reason? +3. How much of your home directory quota are you using? +4. Is there anything in `/staging` that you no longer need? + +::::::::::::::::: solution + +Run the commands from the checklist above, plus `quota -vs` to compare your home directory usage against your quota (the `2>/dev/null` simply hides error messages if you do not have a `/staging` directory): + +```bash +condor_q +condor_q -held +du -sh /home/$USER +quota -vs +du -sh /staging/$USER 2>/dev/null +ls -lh /staging/$USER/ 2>/dev/null +``` + +Your answers will vary depending on your current session state. The key takeaway is making this check a routine part of your workflow. If you have held jobs, investigate and resolve them. If your storage is getting full, clean up old files before your next session. + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + +:::::::::::::::::::::::::::::::::::::: keypoints + +- CHTC is free but shared — right-size your resource requests and clean up after yourself. +- Use `condor_q`, `condor_watch_q`, and `condor_q -held` to monitor jobs; use `condor_rm` to remove jobs you no longer need. +- Check actual resource usage in job logs and ClassAds (`condor_q -af`, `condor_history -af`) to refine future requests. +- Over-requesting CPUs, memory, or GPUs makes your jobs wait longer, not run faster. +- CHTC enforces runtime limits (12 hr, 24 hr, 7 days) — use checkpointing for long-running training jobs. +- Clean up `/home` and `/staging` regularly; run the end-of-session checklist before logging off. +- Email [chtc@cs.wisc.edu](mailto:chtc@cs.wisc.edu) or attend office hours when you need help. 
+ +:::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/09-Resource-management-cleanup.md b/episodes/09-Resource-management-cleanup.md deleted file mode 100644 index c430673e..00000000 --- a/episodes/09-Resource-management-cleanup.md +++ /dev/null @@ -1,424 +0,0 @@ ---- -title: "Resource Management & Monitoring on Vertex AI (GCP)" -teaching: 30 -exercises: 10 ---- - -:::::::::::::::::::::::::::::::::::::: questions - -- How do I monitor and control Vertex AI, Workbench, and GCS costs day‑to‑day? -- What *specifically* should I stop, delete, or schedule to avoid surprise charges? -- How do I set budget alerts so cost leaks get caught quickly? - -:::::::::::::::::::::::::::::::::::::::::::::::: - -::::::::::::::::::::::::::::::::::::: objectives - -- Identify the major cost drivers across Vertex AI (training jobs, endpoints, Workbench notebooks) and GCS, with ballpark costs. -- Practice safe cleanup for Workbench Instances, training/tuning jobs, batch predictions, models, and endpoints. -- Set a budget alert and apply labels to keep costs visible and predictable. -- Use `gcloud` commands for auditing and rapid cleanup. - -:::::::::::::::::::::::::::::::::::::::::::::::: - -You've now run training jobs, tuning jobs, built a RAG pipeline, and possibly explored CLI workflows across the previous episodes. Before closing your laptop, let's make sure none of those resources are still billing you — and learn the habits that prevent surprise charges going forward. - -## Check your current spend first - -Before cleaning anything up, find out where you stand. Open the [**Cloud Console**](https://console.cloud.google.com/billing/reports) and navigate to: - -**Billing → Reports** - -- Set the time range to **This month** (or **Today** for workshop use). -- Group by **Service** to see which GCP services are costing the most. -- Look for **Compute Engine** (backs Workbench VMs and training jobs), **Vertex AI**, and **Cloud Storage**. 
- -This is the single most important dashboard to bookmark. If you only learn one thing from this episode, it's where to find this page. - -You can also check from the CLI: - -```bash -# Quick check: is my project accumulating Vertex AI resources right now? -gcloud ai endpoints list --region=us-central1 -gcloud workbench instances list --location=us-central1-a -gcloud ai custom-jobs list --region=us-central1 --filter="state=JOB_STATE_RUNNING" -``` - - -## What costs you money on GCP (and how much) - -Not all resources cost equally. Here are the main cost drivers you'll encounter in this workshop, ordered from most to least dangerous: - -| Resource | Billing model | Ballpark cost | Risk level | -|----------|--------------|---------------|------------| -| **Vertex AI endpoints** | Per node‑hour, **24/7 while deployed** | ~ `$4.50`/day for one `n1-standard-4` node | **High** — bills even with zero traffic | -| **Workbench Instances** (running) | Per VM‑hour + GPU | ~ `$0.19`/hr CPU‑only (`n1-standard-4`); add ~ `$0.35`/hr per T4 GPU | **High** — easy to forget overnight | -| **Training / HPT jobs** | Per VM/GPU‑hour while running | Same VM rates; auto‑stops when done | **Medium** — usually self‑limiting | -| **Workbench disks** (stopped VM) | Per GB‑month for persistent disk | ~ `$0.04`/GB/month (~ `$4`/month for 100 GB) | **Low** — small but adds up | -| **GCS storage** | Per GB‑month + operations + egress | ~ `$0.02`/GB/month (Standard) | **Low** — cheap until multi‑TB | -| **Network egress** | Per GB downloaded out of GCP | ~ `$0.12`/GB | **Low** — avoid large downloads to local | - -> **Rule of thumb:** Endpoints left deployed and notebooks left running are the most common surprise bills in education and research settings. - - -## Shutting down Workbench Instances - -In Episode 2 we created a **Workbench Instance** — the currently recommended notebook environment. 
Here's how to stop or delete it: - -### Stop via Console -Vertex AI → **Workbench** → **Instances** tab → select your instance → **Stop**. - -### Stop via CLI -```bash -# List all Workbench Instances in your zone -gcloud workbench instances list --location=us-central1-a - -# Stop an instance (stops VM billing; disk charges continue) -gcloud workbench instances stop INSTANCE_NAME --location=us-central1-a -``` - -### Delete when you're done for good -```bash -# Permanently delete the instance and its disk -gcloud workbench instances delete INSTANCE_NAME --location=us-central1-a --quiet -``` - -### Enable idle shutdown (recommended) -You can configure your instance to auto‑stop after a period of inactivity, so you never accidentally leave it running overnight: - -- **Console**: Select your instance → **Edit** → set **Idle shutdown** to 60–120 minutes. -- **At creation time**: Add `--idle-shutdown-timeout=60` to your `gcloud workbench instances create` command. - -> **Disks still cost money while the VM is stopped** (~ `$4`/month for 100 GB). If you're completely done with an instance, **delete** it rather than just stopping it. - - -## Cleaning up training, tuning, and batch jobs - -Training and HPT jobs automatically stop billing when they finish, but it's good practice to audit for jobs stuck in `RUNNING` and to delete old jobs you no longer need. - -### Audit with CLI -```bash -# Custom training jobs -gcloud ai custom-jobs list --region=us-central1 - -# Hyperparameter tuning jobs -gcloud ai hp-tuning-jobs list --region=us-central1 - -# Batch prediction jobs -gcloud ai batch-prediction-jobs list --region=us-central1 -``` - -Each command prints a table showing the job ID, display name, state (e.g., `JOB_STATE_SUCCEEDED`, `JOB_STATE_RUNNING`), and creation time. Look for any jobs stuck in `RUNNING` — those are still consuming resources. 
- -### Cancel or delete as needed -```bash -# Cancel a running job -gcloud ai custom-jobs cancel JOB_ID --region=us-central1 - -# Delete a completed job you no longer need -gcloud ai custom-jobs delete JOB_ID --region=us-central1 -``` - -> **Tip:** Keep one "golden" successful job per experiment for reference, then delete the rest to reduce console clutter. - - -## Undeploy models and delete endpoints (major cost pitfall) - -Deployed endpoints are billed per node‑hour **24/7**, even with zero prediction traffic. A single forgotten endpoint can cost ~ `$135`/month. Always undeploy models before deleting the endpoint. - -### Find endpoints and deployed models -```bash -gcloud ai endpoints list --region=us-central1 -gcloud ai endpoints describe ENDPOINT_ID --region=us-central1 -``` - -### Undeploy and delete -```bash -# Step 1: Undeploy the model (stops node-hour billing) -gcloud ai endpoints undeploy-model ENDPOINT_ID \ - --deployed-model-id=DEPLOYED_MODEL_ID \ - --region=us-central1 \ - --quiet - -# Step 2: Delete the endpoint itself -gcloud ai endpoints delete ENDPOINT_ID \ - --region=us-central1 \ - --quiet -``` - -> **Model Registry note:** Keeping a model *registered* (but not deployed to an endpoint) does not incur node‑hour charges. You only pay a small amount for the model artifact storage in GCS. - - -## GCS housekeeping - -### Check bucket size -```bash -# Human-readable bucket size -gcloud storage du gs://YOUR_BUCKET --summarize --readable-sizes - -# List top-level contents -gcloud storage ls gs://YOUR_BUCKET -``` - -> **Note:** `gsutil` commands (e.g., `gsutil du`, `gsutil ls`) still work but are being replaced by `gcloud storage`. We use the newer syntax here. - -### Lifecycle policies -A lifecycle policy tells GCS to automatically delete or transition objects based on rules you define. This is useful for cleaning up temporary training outputs. 
- -Save the following as `lifecycle.json`: -```json -{ - "lifecycle": { - "rule": [ - { - "action": {"type": "Delete"}, - "condition": {"age": 7, "matchesPrefix": ["tmp/"]} - }, - { - "action": {"type": "Delete"}, - "condition": {"numNewerVersions": 3} - } - ] - } -} -``` - -- **Rule 1**: Auto‑delete any object under `tmp/` that is older than 7 days. -- **Rule 2**: If versioning is enabled, keep only the 3 most recent versions. - -Apply it: -```bash -gcloud storage buckets update gs://YOUR_BUCKET --lifecycle-file=lifecycle.json - -# Verify -gcloud storage buckets describe gs://YOUR_BUCKET --format="yaml(lifecycle)" -``` - -### Egress reminder -Downloading data out of GCP to your laptop costs ~ `$0.12`/GB. Prefer **in‑cloud** training and evaluation, and share results via GCS links rather than local downloads. - - -## Labels and budgets - -### Standardize labels on all resources -Labels let you track costs per user, team, or experiment in billing reports. Apply them consistently: - -- Examples: `name=firstname-lastname`, `purpose=workshop`, `dataset=titanic` -- The Vertex AI Python SDK supports labels on job creation; `gcloud` commands accept `--labels=key=value,...` - -### Set budget alerts (do this now) -This is the single most protective action you can take: - -1. Go to **Billing → Budgets & alerts** in the [Cloud Console](https://console.cloud.google.com/billing/budgets). -2. Click **Create budget**. -3. Set a budget amount (e.g., `$10` or `$25` for a workshop). -4. Set alert thresholds at **50%**, **80%**, and **100%**. -5. Add **forecast‑based alerts** to catch trends before you hit the limit. -6. Make sure email notifications go to **all project maintainers**, not just you. - -> **For production use:** You can export detailed billing data to BigQuery for cost analysis by service, label, or SKU. See the [billing export documentation](https://cloud.google.com/billing/docs/how-to/export-data-bigquery) for setup instructions. 
- - -## Common pitfalls and quick fixes - -| Pitfall | Fix | -|---------|-----| -| Forgotten endpoint billing 24/7 | Undeploy models → delete endpoint | -| Notebook left running over weekend | Enable **idle shutdown** (60–120 min) | -| Duplicate datasets across buckets | Consolidate to one bucket; set lifecycle to purge `tmp/` | -| Too many parallel HPT trials | Cap `parallel_trial_count` to 2–4 | -| Don't know what's costing money | Check **Billing → Reports**; add labels to all resources | - -::::::::::::::::::::::::::::::::::::: callout - -### Going further: automating cleanup - -Once you move from workshop use to regular research, consider automating resource cleanup: - -- **Cloud Scheduler** can run a nightly job to stop idle Workbench Instances via the Vertex AI API. -- **Cloud Functions** or **Cloud Run** can periodically sweep for forgotten endpoints. -- **Budget alerts** can trigger Pub/Sub messages that automatically shut down resources when spend exceeds a threshold. - -These are beyond the scope of this workshop, but the [Cloud Scheduler documentation](https://cloud.google.com/scheduler/docs) is a good starting point. - -:::::::::::::::::::::::::::::::::::::::::::::::: - - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 1 — Check your spend and set a budget - -1. Navigate to **Billing → Reports** in the [Cloud Console](https://console.cloud.google.com/billing/reports). Find your project's current‑month spend grouped by service. -2. Navigate to **Billing → Budgets & alerts**. Create a **`$10` budget** with alert thresholds at 50% and 100%. - -:::::::::::::::: solution - -1. In the [Cloud Console](https://console.cloud.google.com/billing/reports), click the **Navigation menu (☰)** → **Billing** → **Reports**. Set time range to "This month" and group by "Service." You should see Compute Engine, Vertex AI, and Cloud Storage if you've been running workshop exercises. - -2. Go to **Billing** → **Budgets & alerts** → **Create budget**. 
Set: - - **Name**: `workshop-budget` - - **Amount**: `$10` - - **Thresholds**: 50% (`$5`) and 100% (`$10`) - - **Alerts to**: your email address - -Click **Finish** to activate the budget. - -:::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 2 — Find and stop idle notebooks - -List all running Workbench Instances in your zone and stop any you are not actively using. - -```bash -gcloud workbench instances list --location=us-central1-a -``` - -:::::::::::::::: solution - -```bash -# List instances — look for STATE=ACTIVE -gcloud workbench instances list --location=us-central1-a - -# Stop an instance you're not using -gcloud workbench instances stop INSTANCE_NAME --location=us-central1-a -``` - -If the instance shows `STATE=ACTIVE` and you're not currently working in it, stop it. You can restart it later with `gcloud workbench instances start`. - -:::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 3 — Endpoint sweep - -List all deployed endpoints in your region, undeploy any model you don't need, and delete the endpoint. 
- -:::::::::::::::: solution - -```bash -# List all endpoints -gcloud ai endpoints list --region=us-central1 - -# Pick an endpoint ID from the list, then inspect it -gcloud ai endpoints describe ENDPOINT_ID --region=us-central1 - -# Undeploy the model (find DEPLOYED_MODEL_ID in the describe output) -gcloud ai endpoints undeploy-model ENDPOINT_ID \ - --deployed-model-id=DEPLOYED_MODEL_ID \ - --region=us-central1 \ - --quiet - -# Delete the now-empty endpoint -gcloud ai endpoints delete ENDPOINT_ID \ - --region=us-central1 \ - --quiet -``` - -:::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 4 — Write and apply a lifecycle policy - -Create a GCS lifecycle rule that deletes objects under `tmp/` after 7 days and keeps only 3 versions of versioned objects. Apply it to your bucket. - -:::::::::::::::: solution - -Save the following as `lifecycle.json`: -```json -{ - "lifecycle": { - "rule": [ - { - "action": {"type": "Delete"}, - "condition": {"age": 7, "matchesPrefix": ["tmp/"]} - }, - { - "action": {"type": "Delete"}, - "condition": {"numNewerVersions": 3} - } - ] - } -} -``` - -Apply and verify: -```bash -gcloud storage buckets update gs://YOUR_BUCKET --lifecycle-file=lifecycle.json -gcloud storage buckets describe gs://YOUR_BUCKET --format="yaml(lifecycle)" -``` - -:::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::: challenge - -### Challenge 5 — Full workshop teardown - -If you are done with all episodes, perform a complete cleanup: - -1. Stop or delete your Workbench Instance. -2. Verify no endpoints are deployed. -3. Delete any completed training/tuning jobs you don't need. -4. Check your GCS bucket — remove any files you don't want to keep, or delete the bucket entirely. - -:::::::::::::::: solution - -```bash -# 1. 
Delete your Workbench Instance -gcloud workbench instances delete INSTANCE_NAME \ - --location=us-central1-a --quiet - -# 2. Confirm no endpoints remain -gcloud ai endpoints list --region=us-central1 -# (If any appear, undeploy models and delete them as shown above) - -# 3. Delete old training jobs -gcloud ai custom-jobs list --region=us-central1 -gcloud ai custom-jobs delete JOB_ID --region=us-central1 - -gcloud ai hp-tuning-jobs list --region=us-central1 -gcloud ai hp-tuning-jobs delete JOB_ID --region=us-central1 - -# 4. Remove your GCS bucket (WARNING: this deletes all data in the bucket) -gcloud storage rm -r gs://YOUR_BUCKET -``` - -After cleanup, check **Billing → Reports** one more time to confirm no services are still accumulating charges. - -:::::::::::::::: - -:::::::::::::::::::::::::::::::::::::::::::::::: - - -## End‑of‑session checklist - -Before you close your laptop, run through this quick checklist: - -1. **Workbench Instances** — stopped (or deleted if you're done for good). -2. **Training / HPT jobs** — no jobs stuck in `RUNNING`. -3. **Endpoints** — all models undeployed; unused endpoints deleted. -4. **GCS** — no large temporary files lingering; lifecycle policy in place. -5. **Budget alert** — set and sending to your email. - -> Bookmark **Billing → Reports** and check it at the start of each session. A 10‑second glance can save you from a surprise bill. - -::::::::::::::::::::::::::::::::::::: keypoints - -- **Check Billing → Reports** regularly — know what you're spending before it surprises you. -- **Endpoints** and **running notebooks** are the most common cost leaks; undeploy and stop first. -- **Set a budget alert** — it's the single most protective action you can take. -- Configure **idle shutdown** on Workbench Instances so forgotten notebooks auto‑stop. -- Keep storage tidy with **GCS lifecycle policies** and avoid duplicate datasets. -- Use **labels** on all resources so you can trace costs in billing reports. 
- -:::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/index.md b/index.md index 09aee515..e1800024 100644 --- a/index.md +++ b/index.md @@ -2,16 +2,16 @@ site: sandpaper::sandpaper_site --- -Already know how to train an ML model in Python but haven't used the cloud? This hands-on workshop gets you running ML/AI workloads on **Google Cloud Platform (GCP)** — no prior cloud experience required. By the end, you'll be able to move a local training workflow into GCP's **Vertex AI** platform and take advantage of cloud-scale hardware and managed services. +Already know how to train an ML model in Python but haven't used high-throughput computing? This hands-on workshop gets you running ML/AI workloads on the **Center for High Throughput Computing (CHTC)** at UW-Madison — no prior HTC experience required. By the end, you'll be able to move a local training workflow onto CHTC's **HTCondor** system and take advantage of shared GPUs, scalable job submission, and containerized environments. **What you'll learn:** -- **Cloud-based notebooks** — Set up a Vertex AI Workbench notebook as your development environment and cloud controller. -- **Data in the cloud** — Upload datasets to Cloud Storage and connect them to your training code. -- **Scalable model training** — Launch custom training jobs on cloud GPUs/CPUs with your own PyTorch (or other framework) code. -- **Hyperparameter tuning** — Run parallel tuning jobs in Vertex AI to efficiently search for optimal model settings. -- **RAG pipelines** — Build a retrieval-augmented generation pipeline using Google's Gemini models with grounding via Google Search. -- **Cost management** — Monitor spending, set budget alerts, and clean up resources to avoid surprise bills. +- **Connecting to CHTC** — Log in to a submit node and navigate the CHTC filesystem. +- **Data management** — Stage datasets for HTCondor jobs using `/home`, `/staging`, and SQUID. 
+- **Scalable model training** — Submit HTCondor jobs that run your PyTorch (or other framework) code on CPUs or GPUs. +- **Hyperparameter tuning** — Use HTCondor's `queue` mechanisms and DAGMan to run parallel tuning sweeps. +- **RAG pipelines** — Build a retrieval-augmented generation pipeline using open-source or API-based models on CHTC. +- **Resource etiquette** — Follow best practices for shared infrastructure, manage disk usage, and monitor your jobs. #### Prerequisites @@ -22,7 +22,6 @@ This workshop assumes you have a **fundamental ML/AI background**. Specifically, - **Training a model** — you've trained at least one model in any framework (scikit-learn, PyTorch, TensorFlow, XGBoost, etc.). - **Command line basics** — navigating directories, running commands in a terminal. -No prior GCP or cloud experience is required — that's what this workshop teaches. +No prior CHTC or HTCondor experience is required — that's what this workshop teaches. [workbench]: https://carpentries.github.io/sandpaper-docs - diff --git a/instructors/instructor-notes.md b/instructors/instructor-notes.md index 6bc85cbd..8ab1c0b0 100644 --- a/instructors/instructor-notes.md +++ b/instructors/instructor-notes.md @@ -5,69 +5,66 @@ title: 'Instructor Notes' ## Target Learner Profiles ### Alex — Graduate Researcher in Biology -Alex is a second-year PhD student who trains random forest and XGBoost models on tabular genomics data using scikit-learn on their laptop. Their datasets are growing beyond what fits in RAM, and their advisor has suggested moving to cloud compute. Alex has basic Python skills and has heard of GCP but has never used it. They want to learn how to store data in the cloud, run training jobs without babysitting a notebook, and keep costs under control. +Alex is a second-year PhD student who trains random forest and XGBoost models on tabular genomics data using scikit-learn on their laptop. 
Their datasets are growing beyond what fits in RAM, and their advisor has suggested using CHTC. Alex has basic Python skills and has heard of HTCondor but has never used it. They want to learn how to run training jobs on shared hardware without babysitting a terminal session. ### Jordan — Data Scientist at a Research Lab -Jordan has 3 years of experience training deep learning models with PyTorch on a local GPU workstation. They are comfortable with the command line and Git. Their lab has GCP credits and wants to scale up hyperparameter tuning for a new project. Jordan needs to learn how to submit managed training jobs, attach GPUs, and compare tuning trial results without managing infrastructure manually. +Jordan has 3 years of experience training deep learning models with PyTorch on a local GPU workstation. They are comfortable with the command line and Git. Their lab wants to scale up hyperparameter tuning using CHTC's GPU Lab. Jordan needs to learn how to submit GPU jobs, run parallel tuning sweeps, and collect results across many HTCondor jobs. ### Sam — Postdoc Exploring LLMs for Literature Review -Sam is a postdoc in environmental science who wants to use retrieval-augmented generation (RAG) to extract information from research papers. They have intermediate Python skills and have used Jupyter notebooks extensively, but have no cloud experience. Sam is primarily interested in the RAG episode but needs the foundational GCP knowledge from earlier episodes to set up their environment and manage costs responsibly. +Sam is a postdoc in environmental science who wants to use retrieval-augmented generation (RAG) to extract information from research papers. They have intermediate Python skills and have used Jupyter notebooks extensively, but have no HTC experience. Sam is primarily interested in the RAG episode but needs the foundational CHTC knowledge from earlier episodes to set up their environment and submit jobs. 
## Before the Workshop ### Account setup (1–2 weeks prior) -- Confirm whether you are using a **shared GCP project** or asking learners to use **Free Tier** accounts. -- If using a shared project (the standard approach at UW-Madison via RCI and ML+X), the recommended onboarding procedure is: - 1. **Create or reuse a Google Group** (e.g., `mlm-workshop-2025@googlegroups.com`) that has the necessary IAM roles on the shared project (at minimum: `Vertex AI User`, `Storage Object Admin`, `Compute Viewer`). - 2. **Add learner Google accounts to the group** — either by collecting emails in advance via a registration form or by adding them during a pre-workshop session. - 3. **Allow time for IAM propagation.** After adding a member to a Google Group, it can take **5–15 minutes** (occasionally up to an hour) for GCP to recognize the new membership and grant access. Plan accordingly: - - **Ideal:** Add all learners **the day before** the workshop so access is ready by start time. - - **If adding day-of:** Do it at least 15–30 minutes before the first hands-on episode (episode 02). Use the introduction episode to fill time while permissions propagate. - 4. **Verify access** by having at least one test account confirm they can see the shared project in the Cloud Console before the workshop begins. - 5. **After the workshop**, you can remove learners from the group (or delete it) to revoke access cleanly without touching individual IAM bindings. -- Verify GPU quota in the workshop region (`us-central1`). Request increases for `NVIDIA_TESLA_T4` if needed — quota requests can take 1–3 business days. -- Send a pre-workshop email with setup instructions (GCP access, data download). +- Confirm that all participants have CHTC accounts. Submit bulk account requests to CHTC if needed — requests can take several business days to process. +- Alternatively, coordinate with CHTC staff to provide temporary workshop accounts. 
+- Verify that participants can SSH into a submit node (e.g., `ap2002.chtc.wisc.edu`). Off-campus participants may need the UW VPN. +- Verify GPU Lab access: submit a test GPU job to confirm quota and availability. +- Send a pre-workshop email with setup instructions (SSH access, VPN if needed). ### Test run -- Walk through all episodes end-to-end on the shared project at least once. GCP UI changes frequently — confirm that screenshots and console paths still match. -- Verify that the Vertex AI prebuilt container URIs in Episodes 04, 05, and 06 are still available (container tags get deprecated). +- Walk through all episodes end-to-end on a CHTC submit node at least once. CHTC configurations and container availability can change. +- Verify that the Docker images used in episodes are accessible from CHTC (e.g., `python:3.10`, `pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime`). - Confirm that `data.zip` and `pdfs_bundle.zip` download correctly from the GitHub repository. +- Test that jobs complete within expected timeframes, including GPU jobs. ## During the Workshop ### Pacing and timing -The lesson is designed for roughly **5 hours** of instruction (including short breaks). Suggested time allocation: +The lesson is designed for roughly **4–5 hours** of instruction (including short breaks). 
Suggested time allocation: | Episode | Teaching + Exercises | Notes | |---------|---------------------|-------| | 01 Introduction | 12 min | Keep brief; learners are eager to get hands-on | -| 02 Notebooks as Controllers | 30 min | First console interaction; VM creation takes 3–5 min — fill with discussion | -| 03 Data Storage & Access | 50 min | Merged bucket creation + notebook data access; first notebook coding | -| 04 Training (XGBoost) | 40 min | Vertex AI job takes 2–5 min; use wait time for Q&A | -| 05 Training (PyTorch + GPU) | 30 min | GPU jobs may take longer; discuss CPU vs GPU during wait | -| 06 Hyperparameter Tuning | 50 min | Start with 1 trial; exercises have learners scale up | +| 02 Connecting to CHTC | 30 min | First SSH connection; may need VPN troubleshooting | +| 03 Data Management | 45 min | Filesystem navigation, data staging, first file transfers | +| 04 Training (XGBoost) | 40 min | First HTCondor job; queue wait times vary — fill with discussion | +| 05 Training (PyTorch + GPU) | 30 min | GPU jobs may queue longer; discuss CPU vs GPU during wait | +| 06 Hyperparameter Tuning | 50 min | Many parallel jobs; exercises scale up complexity | | 07 RAG | 30 min | Can be shortened to a demo if running behind | -| 08 CLI Workflows (bonus) | 15 min | Optional; skip if short on time | -| 09 Resource Cleanup | 40 min | Critical — do not skip. Learners must clean up resources | +| 08 Advanced Workflows (bonus) | 25 min | Optional; skip if short on time | +| 09 Resource Management | 35 min | Important — reinforce good citizenship habits | | **Total** | **~297 min** | **~5 hours including breaks** | ### Common issues -- **"I can't see the project"**: If a learner was just added to the Google Group, IAM propagation may still be in progress. Have them wait 5–15 minutes, try an incognito/private browser window, and confirm they are logged into the correct Google account (not a personal Gmail if the group expects a university account). 
-- **Bucket permission errors**: The most common blocker. Have the `gcloud storage buckets add-iam-policy-binding` commands ready to paste into Cloud Shell. -- **VM creation stuck**: If a Workbench Instance gets stuck in "Provisioning" for >5 min, try a different zone in the same region. -- **GPU quota exceeded**: If T4 quota is unavailable, fall back to CPU-only training for Episodes 05–06. The lesson works without GPUs — it just takes longer. -- **Numpy version mismatch**: The PyTorch kernel sometimes has a numpy 2.x conflict. The fix (`pip install --upgrade --force-reinstall "numpy<2"`) is included in Episode 05. -- **Idle shutdown confusion**: Some learners may find their VM stopped mid-workshop. Remind them to increase idle timeout or restart the instance. +- **"Can't connect via SSH"**: Most common issue. Check: (1) correct username (NetID), (2) correct submit node hostname, (3) VPN connected if off-campus, (4) CHTC account is active. +- **Jobs stuck in Idle**: This is normal — CHTC is a shared resource. Jobs wait for matching machines. GPU jobs may wait longer during peak times. Use this as a teaching moment about resource sharing. +- **Jobs go to Held state**: Usually a submit file error (wrong Docker image, missing input files, resource request too large). Check with `condor_q -hold` to see the hold reason. +- **Docker image pull failures**: Some images may be large and slow to pull on first use. Use smaller base images when possible (e.g., `python:3.10-slim`). +- **Disk quota exceeded in /home**: Remind learners that /home has a ~20 GB quota. Large outputs should go to /staging or be cleaned up. +- **GPU jobs take longer than expected**: GPU Lab queue times vary. Have a backup plan — the CPU versions of all exercises produce the same results. ### Tips -- Encourage learners to **add labels/tags to every resource** from the start. This is easy to skip but essential for cost tracking in shared accounts. 
-- Remind learners that **Vertex AI training jobs take 2–5 minutes just for provisioning** before training begins. This is normal, not an error. -- When walking through Episode 09 (cleanup), verify as a group that no endpoints are left deployed and no VMs are still running. This is now the final episode, so a full teardown is appropriate. -- Episode 09 includes a "Check your spend" section — use this as a live demo so learners can see the Billing Reports dashboard. Walk through the budget alert setup (Challenge 1) together as a class if time permits. -- For the RAG episode, have a backup plan in case the Gemini API is temporarily rate-limited. You can demo from pre-computed outputs. +- Encourage learners to **check job status frequently** with `condor_q` and `condor_watch_q`. This builds good habits. +- Remind learners that the **submit node is shared** — no heavy computation, no large downloads to /home. +- When jobs are queued, use the wait time for discussion questions and challenges. +- For the HP tuning episode, start with a small number of jobs (3–5) to validate the pipeline before scaling up. +- Demonstrate `condor_q -hold` and `condor_rm` early — learners will need these. +- For the RAG episode, if using API-based models, ensure API keys are set up in advance. If using open-source models, ensure the Docker images are tested. ## After the Workshop -- Verify all shared project resources are cleaned up (notebooks stopped, endpoints deleted, buckets with only intentional data remaining). -- Review billing dashboard for any unexpected charges. +- Remind learners to clean up their /home directories (old job outputs, logs, etc.). +- Verify no forgotten jobs are still running: `condor_q -all` filtered by workshop users. - Collect feedback from learners on pacing and difficulty level. +- Share links to CHTC documentation and support channels for continued learning. 
diff --git a/learners/compute-for-ML.md b/learners/compute-for-ML.md index c300b078..afb54750 100644 --- a/learners/compute-for-ML.md +++ b/learners/compute-for-ML.md @@ -2,123 +2,110 @@ title: Compute for ML --- -This page provides guidance for selecting compute configurations in Google Cloud Platform (GCP) for machine learning workloads. While instance size is an important factor, effective performance depends on how you pair a machine type with optional GPU accelerators. - -All pricing estimates are based on public rates for `us-central1` as of October 2025. Actual cost depends on sustained-use discounts, attached GPU quotas, and whether your project has promotional or educational credits. +This page provides guidance for selecting compute configurations on CHTC for machine learning workloads. CHTC resources are free for UW-Madison researchers, so the focus is on choosing the right hardware for your workload rather than minimizing cost. ### Reference Docs -- [Compute Engine VM Instance Pricing (applies to notebook backends)](https://cloud.google.com/compute/vm-instance-pricing) -- [Compute Engine GPU Pricing](https://cloud.google.com/compute/gpus-pricing) -- [All Compute Pricing Overview](https://cloud.google.com/compute/all-pricing) +- [CHTC GPU Lab Guide](https://chtc.cs.wisc.edu/uw-research-computing/gpu-lab) +- [CHTC Machine Learning Guide](https://chtc.cs.wisc.edu/uw-research-computing/machine-learning-htc) +- [HTCondor Manual — Submitting Jobs](https://htcondor.readthedocs.io/en/latest/users-manual/submitting-a-job.html) ### Key Terms -- **vCPU**: A *virtual CPU* represents one logical core allocated from a physical CPU. Two vCPUs typically correspond to one physical core on GCP hardware. More vCPUs allow for greater parallelism — useful when loading data, performing CPU-heavy preprocessing, or running multi-threaded operations. In GCP machine types, memory (RAM) generally scales with vCPUs — doubling vCPUs usually doubles available memory. 
-- **Memory (GiB)**: System RAM available to the VM. Higher RAM supports larger batch sizes, data caching, and in-memory preprocessing, reducing disk I/O overhead. -- **GPU (Graphics Processing Unit)**: Specialized hardware for parallel tensor operations used in deep learning model training and inference. -- **Machine type**: Defines CPU and RAM resources; determines how many vCPUs and how much memory your instance has. -- **Machine family**: A group of machine types optimized for a specific balance of performance, memory, and cost (e.g., `n2-standard-8`). -- **Accelerator**: Optional hardware (such as GPUs or TPUs) that can be attached to certain VM families to speed up training and inference. -- **Accelerator count**: Defines how many GPUs are attached to a single VM. Most training jobs begin with `accelerator_count=1`. Increasing the count (for example, to 2, 4, or 8) enables multi-GPU training, but it also requires proportional increases in CPU, memory, and disk I/O to feed data efficiently to all GPUs. Performance scaling is rarely linear — expect diminishing returns beyond 2–4 GPUs unless your model and batch sizes are very large. -- **Region**: The physical location of your compute resources (e.g., `us-central1`). Pricing and GPU availability can vary by region. +- **CPU**: A general-purpose processor. Most ML preprocessing, feature engineering, and tree-based models (XGBoost, random forests) run efficiently on CPUs. +- **GPU (Graphics Processing Unit)**: Specialized hardware for parallel tensor operations used in deep learning model training and inference. CHTC's GPU Lab provides access to high-end NVIDIA GPUs. +- **Memory (RAM)**: System memory available to your job. Higher RAM supports larger batch sizes, data caching, and in-memory preprocessing. +- **Disk**: Local scratch storage on the execute node. Your input files are transferred here, and outputs are written here during the job. +- **Execute node**: The machine where your HTCondor job runs. 
You request resources (CPUs, memory, disk, GPUs) in your submit file, and HTCondor matches your job to a machine that meets those requirements. ### Key Concepts -- **Machine type vs. GPU**: The `machine_type` defines CPU and RAM resources — it is not a GPU by itself. You can attach a GPU by adding `accelerator_type` and `accelerator_count` (for example, `NVIDIA_L4` or `NVIDIA_TESLA_T4`). Only specialized machine families like `A2` include GPUs automatically. -- **Full names and syntax**: Most machine types follow the pattern `--`. For example: - - `n2-standard-8`: N2 family, standard series, 8 vCPUs, 32 GB RAM - - `c2-standard-8`: C2 family, standard series, 8 vCPUs, 32 GB RAM (CPU-optimized) - - **Exception — GPU families (A2, A3, A4):** These use `--g` instead. The trailing number is the **GPU count**, not the vCPU count. For example, `a2-highgpu-1g` means 1× A100 GPU (with 12 vCPUs and 85 GB RAM bundled automatically). Similarly, `a3-highgpu-8g` means 8× H100 GPUs. CPU and memory are fixed per configuration — you don't choose them independently. -- **RAM requirements**: Minimum RAM should be at least 1.5× dataset size unless your workflow uses batching. -- **Free tier**: Some smaller instance types (for example, `e2-micro`) may qualify for the [GCP Free Tier](https://cloud.google.com/free). Check usage limits before running persistent notebooks. - -### Machine Families Overview - -Different machine families are optimized for different workloads. -Costs below are approximate per-hour rates for instances with 8 vCPUs in the `us-central1` region. - -| Family | Optimized For | Example Machine Type | Approx. 
Cost/hr | Typical Model or Dataset Scale | Notes | -|---------|----------------|----------------------|-----------------|-------------------------------|-------| -| `E2` | General purpose | `e2-standard-8` | ~ `$0.25` | Small jobs or lightweight scripts | Cheapest option; slower CPUs | -| `N1` | Balanced compute (older gen) | `n1-standard-8` | ~ `$0.35` | Small to mid-sized ML (<100M params) | Broad GPU compatibility | -| `N2` | Balanced compute (newer gen) | `n2-standard-8` | ~ `$0.38` | Mid-sized ML and RAG pipelines (100M–500M params) | Common choice for notebooks | -| `C2` | Compute optimized | `c2-standard-8` | ~ `$0.45` | CPU-heavy preprocessing or feature extraction | High single-thread performance | -| `C3` | Next-gen compute optimized | `c3-standard-8` | ~ `$0.50` | High-performance CPU-only workloads | Faster I/O and networking | -| `A2` | GPU (A100) | `a2-highgpu-1g` | ~ `$2.93` (with 1×A100) | Large DL models (0.5B–10B params) | Fixed GPU counts, quota required | -| `A3` | GPU (H100) | `a3-highgpu-8g` | ~ `$32.00` (with 8×H100) | Transformer-scale models (10B–70B params) | High throughput, limited quota | -| `A4` | GPU (B200) | `a4-highgpu-4g` | ~ `$36.00` (with 4×B200) | Foundation models (70B+ params) | Highest-end, limited availability | -| `T2A` / `T2D` | Arm or AMD CPUs | `t2a-standard-8` | ~ `$0.20` | Low-cost inference or lightweight workloads | No GPU support | - -**Cost notes:** -- Prices vary by region and storage/network configuration. -- `N2` instances are a typical choice for cost-effective ML/AI workloads. -- `A2–A4` families include GPUs by default; all others require attaching GPUs manually. - -### Attaching GPUs vs. Using GPU Families - -Attaching a GPU to a standard CPU family (`n1`, `n2`, or `c2`) is the most flexible and cost-efficient setup for research and medium-scale workloads. 
-Dedicated GPU families like `A2`, `A3`, and `A4` are designed for very large or multi-GPU training but come with higher fixed costs and quota requirements. - -| Approach | Best For | Pros | Cons | -|-----------|-----------|------|------| -| Attach GPU to Standard VM (`n1`/`n2` + `NVIDIA_L4`/`T4`) | Fine-tuning, RAG pipelines, and large-scale inference with models up to ~500M–1B params | Cheaper, flexible CPU/GPU balance, reusable for notebooks and jobs | Not ideal for multi-GPU scaling | -| Use GPU Machine Family (`A2`/`A3`/`A4`) | Multi-GPU training or high-throughput inference with models >1B params | High throughput, optimized GPU interconnects | Expensive, quota-restricted, fixed GPU count | - -For large-scale RAG deployments using very large models (e.g., 7B–70B parameters), `A2` or `A3` instances may be required to hold the model in GPU memory during inference. -However, when using model sharding or quantized models under 20–40 GB total, attached L4 GPUs on `n2` machines remain cost-effective. - -### Typical GPU Options for Attached Configurations - -| GPU Type | CUDA Version | Approx. 
Price/hr | Model Size Range | Dataset Scale | System RAM (Recommended) | Typical Use | -|-----------|--------------|------------------|------------------|----------------|---------------------------|--------------| -| `NVIDIA_TESLA_T4` | CUDA 11.x–12.x | ~ `$0.35` | ≤100 M params | ≤10 GB | ≥16 GB | Entry GPU for CNNs, small transformers | -| `NVIDIA_L4` | CUDA 12.x | ~ `$0.60` | ≤500 M–1 B params | ≤50 GB | ≥32 GB | Moderate training, RAG inference, fine-tuning | -| `NVIDIA_TESLA_V100` | CUDA 11.x | ~ `$2.48` | 0.5 B–2 B params | ≤100 GB | ≥64 GB | High-performance deep learning | -| `NVIDIA_A100_40GB` | CUDA 11.x–12.x | ~ `$2.93` | 2 B–10 B params | ≤200 GB | ≥128 GB | Research-scale model training | -| `NVIDIA_H100` | CUDA 12.x | ~ `$4.00` | 10 B–70 B params | ≤500 GB | ≥256 GB | Transformer and LLM training/inference | -| `NVIDIA_B200` | CUDA 12.x | ~ `$5.00+` | >70 B params | ≥1 TB | ≥512 GB | Foundation-model or multi-node workloads | - -### Example Workload Choices - -- **RAG with LLMs:** Retrieval-augmented generation pipelines rely mainly on CPU and memory for vector retrieval and embedding operations, with moderate GPU usage during inference. Recommended: `n2-standard-8` + `NVIDIA_L4` for typical RAG; move to `a2-highgpu-1g` or `a3-highgpu` if the model exceeds 1B parameters or GPU memory limits. -- **Training a 100M-parameter neural network:** This model size fits comfortably on a single mid-tier GPU and benefits from faster GPU memory bandwidth. Recommended: `n1-standard-8` + `NVIDIA_TESLA_T4` for affordability, or `NVIDIA_L4` if training time matters more than cost. -- **Multi-GPU or LLM fine-tuning (billions of parameters):** Large models (1B–70B parameters) often require multiple A100, H100, or B200 GPUs in parallel. Recommended: `a2-highgpu-2g` (2×A100) or larger depending on model size and parallelism. Cost note: Fine-tuning billion-parameter models can easily exceed `$200`–`$500` per hour of GPU time. 
Even short fine-tunes may consume hundreds of dollars in credits. Plan carefully, monitor utilization, and test your pipeline with smaller models first. - -### Example Configurations - -| Dataset Size | Recommended Notebook Instance | vCPU | Memory (GiB) | GPU / Accelerator | Price/hr (USD) | Typical Use | -|---------------|--------------------------------|------|----------------|-------------------|----------------|--------------| -| < 1 GB | `e2-micro` (Free Tier) | 2 | 1 | None | Free Tier | Lightweight code tests | -| < 1 GB | `n2-standard-4` | 4 | 16 | None | ~ `$0.17` | Preprocessing, regression, small models | -| < 1 GB | `n1-standard-8` + `NVIDIA_TESLA_T4` | 8 | 30 | 1× T4 | ~ `$0.55` | Entry GPU runs, small CNNs | -| 10 GB | `c2-standard-8` | 8 | 32 | None | ~ `$0.34` | CPU-heavy ML tasks | -| 10 GB | `n2-standard-8` + `NVIDIA_L4` | 8 | 32 | 1× L4 | ~ `$0.75` | Moderate deep learning workloads | -| 50 GB | `a2-highgpu-2g` (2× A100) | 24 | 170 | 2× A100 | ~ `$5.90` | Multi-GPU training, large-model inference | -| 100 GB | `a3-highgpu-8g` (8× H100) | 128 | 512 | 8× H100 | ~ `$32.00` | Transformer or LLM fine-tuning | -| 1 TB+ | `a4-highgpu-4g` (4× B200) | 96 | 768 | 4× B200 | ~ `$36.00` | Foundation-model scale training | - -### General Notes - -- For small datasets, CPUs are often faster to start and cheaper to run. -- When moving from CPU to GPU training, keep the same script and simply change: - - `container_uri` to a GPU-enabled image (for example, `pytorch-gpu.*`) - - Add both `accelerator_type` and `accelerator_count` in your `CustomTrainingJob`. For example: - ```python - job.run( - machine_type="n2-standard-8", - accelerator_type="NVIDIA_L4", - accelerator_count=1, - base_output_dir=ARTIFACTS, - ) - ``` - - Increasing `accelerator_count` (e.g., 2–4) enables parallel training but requires larger datasets and batch sizes to avoid idle GPUs. - - -### Summary - -1. Choose the `machine_type` for CPU and memory resources. -2. 
Attach a GPU with `accelerator_type` and `accelerator_count` if needed. -3. Only `A2`, `A3`, and `A4` families include GPUs automatically. -4. For most research training jobs, `n1-standard-8` + `NVIDIA_TESLA_T4` or `NVIDIA_L4` is a practical and affordable starting point. -5. Fine-tuning or large-scale inference with billion-parameter models can be extremely expensive; validate your workflow with smaller models first. +- **Right-sizing matters even when it's free.** Over-requesting resources (e.g., 8 CPUs when you need 1, or 32 GB RAM when you need 4 GB) means your job waits longer in the queue because fewer machines can match your request. Always start small and scale up based on actual usage. +- **Check actual resource usage** after your first job completes. The job log file reports actual memory and disk usage. Use this to refine future requests. +- **GPU ≠ always faster.** For small datasets (< 10,000 rows) or simple models (< 1M parameters), CPU training is often faster end-to-end because GPU jobs have provisioning and data transfer overhead. + +### Available GPUs on CHTC + +CHTC's GPU Lab includes several GPU types. You can request specific hardware using `require_gpus` in your submit file. 
+ +| GPU Type | VRAM | Best For | require_gpus filter | +|----------|------|----------|-------------------| +| NVIDIA T4 | 16 GB | Entry-level deep learning, small transformers | `(GlobalMemoryMb >= 15000)` | +| NVIDIA L40 | 48 GB | Medium models, RAG inference, fine-tuning | `(GlobalMemoryMb >= 40000)` | +| NVIDIA A100 (40 GB) | 40 GB | Large deep learning, multi-billion param models | `(GlobalMemoryMb >= 35000)` | +| NVIDIA A100 (80 GB) | 80 GB | Very large models, large batch training | `(GlobalMemoryMb >= 75000)` | +| NVIDIA H100 | 80 GB | Transformer-scale training, LLM fine-tuning | `(GlobalMemoryMb >= 75000)` | +| NVIDIA H200 | 141 GB | Very large models (10B–70B parameters), usually with quantization | `(GlobalMemoryMb >= 130000)` | + +### GPU Job Runtime Limits + +CHTC GPU Lab jobs have runtime limits based on a job length category you specify: + +| Category | Max Runtime | Submit file setting | +|----------|------------|-------------------| +| Short | 12 hours | `+GPUJobLength = "short"` | +| Medium | 24 hours | `+GPUJobLength = "medium"` | +| Long | 7 days | `+GPUJobLength = "long"` | + +**Tip:** Short jobs get scheduled faster because more machines accept them. Start with "short" and only increase if your training genuinely needs more time. For very long training runs, implement checkpointing so you can resume if a job is interrupted. 
+ +### Example Resource Requests + +| Workload | CPUs | Memory | Disk | GPUs | Notes | +|----------|------|--------|------|------|-------| +| XGBoost on Titanic (< 1 GB data) | 1 | 2 GB | 1 GB | 0 | CPU is sufficient | +| Small neural network (< 10M params) | 1 | 4 GB | 2 GB | 0 | CPU often faster for small models | +| Medium neural network (10M–500M params) | 1 | 8 GB | 4 GB | 1 (T4 or L40) | GPU speeds up training significantly | +| Large transformer fine-tuning (500M–10B) | 4 | 32 GB | 20 GB | 1 (A100 40/80 GB) | Need large VRAM for model weights | +| Very large model (10B–70B) | 8 | 64 GB | 50 GB | 1 (H100 or H200) | Quantization may be needed | +| Hyperparameter sweep (many small jobs) | 1 | 4 GB | 2 GB | 0 or 1 | Submit many jobs in parallel | + +### Example Submit File Snippets + +**CPU-only job:** +``` +request_cpus = 1 +request_memory = 4GB +request_disk = 2GB +``` + +**Single GPU job (any GPU with >= 16 GB VRAM):** +``` +request_cpus = 1 +request_gpus = 1 +require_gpus = (GlobalMemoryMb >= 15000) +request_memory = 8GB +request_disk = 4GB ++WantGPULab = true ++GPUJobLength = "short" +``` + +**Single GPU job (any GPU with >= 40 GB VRAM, e.g., A100, L40, or larger):** +``` +request_cpus = 4 +request_gpus = 1 +require_gpus = (GlobalMemoryMb >= 35000) +request_memory = 32GB +request_disk = 20GB ++WantGPULab = true ++GPUJobLength = "medium" +``` + +### When Does Model Size Justify a GPU? + +| Model scale | Parameters | Example models | Recommended hardware | +|-------------|-----------|----------------|---------------------| +| Small | < 10M | Logistic regression, small CNNs, XGBoost | CPU only | +| Medium | 10M–500M | ResNets, BERT-base, mid-sized transformers | T4 or L40 | +| Large | 500M–10B | GPT-2, LLaMA-7B, fine-tuning large transformers | A100 (40/80 GB) | +| Very large | 10B–70B | LLaMA-70B, Mixtral | H100 or H200 | +| Frontier | 70B+ | GPT-4-scale, multi-expert models | Cloud (multi-node) | + +### General Guidelines + +1. **Start small.** Request minimal resources for your first run. 
Check the job log to see actual usage, then adjust. +2. **CPU first, GPU second.** Only add a GPU when your model and data are large enough to benefit from it. +3. **Match your container to your hardware.** Use a CUDA-enabled Docker image (e.g., `pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime`) for GPU jobs. +4. **Use short job lengths.** Short jobs get scheduled faster. Only request longer runtimes if genuinely needed. +5. **Implement checkpointing** for training runs longer than a few hours. This protects against job eviction and lets you resume training across multiple job submissions. +6. **Don't over-request.** Requesting 8 CPUs and 64 GB RAM for a job that uses 1 CPU and 2 GB RAM wastes shared resources and increases your queue wait time. diff --git a/learners/reference.md b/learners/reference.md index ee3383ce..f5c59b58 100644 --- a/learners/reference.md +++ b/learners/reference.md @@ -4,81 +4,97 @@ title: Reference ## Glossary -This glossary covers the GCP and ML terms used in this workshop. Refer back here whenever you hit an unfamiliar term during the episodes. +This glossary covers the CHTC, HTCondor, and ML terms used in this workshop. Refer back here whenever you hit an unfamiliar term during the episodes. -### Cloud Computing Basics +### High-Throughput Computing Basics -Virtual Machine (VM) -: A software-based computer that runs on Google's Compute Engine infrastructure. Each Vertex AI Workbench notebook is backed by a Compute Engine VM. +High-Throughput Computing (HTC) +: A computing paradigm focused on running many independent tasks over time, maximizing total throughput rather than speed of a single task. CHTC is designed for this model. -Instance -: A running VM in the cloud. In GCP, instances are defined by machine families (e.g., N2, C2, A2, A3) and can be customized for CPU, memory, and GPU needs. +HTCondor +: A workload management system developed at UW-Madison that schedules and manages jobs across a pool of distributed computing resources. 
It matches job requirements to available machines automatically. -Container -: A lightweight, isolated environment that packages code and dependencies together. Vertex AI training jobs run inside containers built from prebuilt Docker images. +Submit Node (Access Point) +: The server you SSH into to write code, prepare data, and submit jobs. It is a shared resource and should not be used for heavy computation. Also called an "access point" in newer HTCondor documentation. -Docker -: The most common container platform. Many GCP ML environments (like TensorFlow or PyTorch training images) are Docker containers hosted on Artifact Registry. +Execute Node +: A machine in the HTCondor pool where your job actually runs. HTCondor assigns one automatically based on your resource request. + +ClassAd +: HTCondor's attribute-value system for describing jobs and machines. Job ClassAds specify requirements (CPUs, memory, GPUs); machine ClassAds advertise available resources. HTCondor matches them to schedule work. + +Job Universe +: The execution environment for a job. Common universes include `docker` (runs in a container), `vanilla` (runs directly on the execute node), and `container` (newer container support). -### GCP Services +### CHTC Infrastructure -Google Cloud Console -: The web-based interface for managing GCP resources, available at [console.cloud.google.com](https://console.cloud.google.com/). This is where you create buckets, launch notebooks, monitor training jobs, check billing, and manage permissions. +CHTC (Center for High Throughput Computing) +: A research computing center at UW-Madison that provides free, large-scale computing resources to the campus community using HTCondor. -Compute Engine (GCE) -: The core infrastructure service that provides customizable VMs. Workbench notebooks and training jobs run on Compute Engine under the hood. 
+GPU Lab +: CHTC's dedicated pool of GPU-equipped machines, including NVIDIA A100 (40/80 GB), H100 (80 GB), H200 (141 GB), L40, and T4 GPUs. Access requires `+WantGPULab = true` in your submit file. -Vertex AI -: Google's unified ML platform — training jobs, tuning, notebooks, deployment, and more. The main service used throughout this workshop. +OSPool (Open Science Pool) +: A national shared computing pool that CHTC users can opt into for additional capacity by adding `+WantGlideIn = true` to a submit file. (The related `+WantFlocking = true` setting instead allows jobs to run on other UW–Madison campus HTCondor pools.) -Cloud Storage (GCS) -: GCP's object storage service for datasets, models, and artifacts. The direct counterpart to AWS S3. +### CHTC Storage -Bucket -: A top-level container in Cloud Storage that holds files (objects). Accessed via URIs like `gs://your-bucket-name/path/to/file.csv`. +/home +: Your home directory on the submit node (~20 GB quota). Used for code, submit files, and small input/output files. Files here are available on the submit node but must be explicitly transferred to jobs. -GCS URI -: The unique path referencing an object in a Cloud Storage bucket. Example: `gs://ml-project-dataset/train.csv`. +/staging +: A larger storage area for datasets and outputs that are too big for /home. Files are transferred to/from jobs using HTCondor's file transfer mechanism or accessed via staging protocols. -Persistent Disk (PD) -: Block storage attached to VMs, including Workbench notebooks. Retains data between VM reboots — used for local datasets, checkpoints, or outputs. +SQUID +: A web proxy cache for distributing large, read-only files to many jobs efficiently. Files placed on SQUID are served via HTTP, avoiding repeated file transfers. -Cloud Shell -: A browser-based terminal built into the Google Cloud Console (click the **>\_** icon in the top-right toolbar). It comes with `gcloud` pre-installed and already authenticated to your project. 
+### HTCondor Job Management -### Access and Billing +Submit File (.sub) +: A configuration file that tells HTCondor what to run, what resources to request, what files to transfer, and where to write output. Submitted with `condor_submit`. -IAM (Identity and Access Management) -: GCP's permission system. Defines who (user, service account, or group) can access which resources and at what privilege level. +condor_submit +: The command to submit a job (or batch of jobs) described by a submit file. -Service Account -: A special Google identity used by applications to access GCP resources. Your Workbench notebook uses a service account to read from Cloud Storage and launch training jobs. +condor_q +: The command to check the status of your submitted jobs (Idle, Running, Held, Completed). -Quotas and Limits -: Default usage caps (e.g., max GPUs per region). Quotas can be increased through the console — understanding them helps prevent job failures. +condor_rm +: The command to remove (cancel) one or more of your jobs from the queue. -Billing Alerts -: GCP's Budgets & Alerts feature tracks project spending and sends notifications when costs exceed thresholds. +condor_status +: The command to view available resources in the HTCondor pool (machines, GPUs, etc.). -### Vertex AI Workbench and ML/AI Workflows +condor_history +: The command to view information about your previously completed jobs. -Vertex AI Workbench -: A managed Jupyter notebook environment on Compute Engine. Used to run experiments and coordinate ML/AI workflows interactively. +condor_watch_q +: An interactive, auto-refreshing version of `condor_q` that updates in real time. -Controller -: In this workshop, the notebook acts as the controller — it configures and submits training, tuning, and evaluation jobs via the Vertex AI SDK rather than running heavy computation locally. +DAGMan (Directed Acyclic Graph Manager) +: HTCondor's built-in workflow manager for running multi-step jobs with dependencies. 
Defined in `.dag` files and submitted with `condor_submit_dag`. + +### Container and Environment Terms + +Docker +: A container platform for packaging code and dependencies together. CHTC's HTCondor supports running jobs inside Docker containers pulled from Docker Hub or other registries. -Vertex AI Custom Job -: A managed training job that runs your code on dedicated Compute Engine instances. Equivalent to an AWS SageMaker Training Job. +Apptainer (formerly Singularity) +: An alternative container runtime commonly used in HPC environments. CHTC supports Apptainer containers as well. -Hyperparameter Tuning Job -: A Vertex AI service that searches for the best model configuration by running multiple trials with different hyperparameter sets. +Container Image +: A pre-built package containing an operating system, libraries, and tools. Specified in your submit file (e.g., `docker_image = pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime`). -Model Registry -: Stores trained models for versioning, deployment, and comparison across experiments. +### ML/AI Workflow Terms -Endpoint -: A deployed model that serves predictions through Vertex AI Prediction. +Training Job +: A compute task that fits a model to data. On CHTC, this is an HTCondor job that runs your training script inside a container on an execute node. + +Hyperparameter Tuning +: The process of searching for optimal model configuration by running multiple training jobs with different settings. On CHTC, this leverages HTCondor's `queue` mechanism to submit many jobs in parallel. + +Checkpointing +: Saving model state periodically during training so that a job can be resumed if interrupted. Important for long-running jobs on CHTC where runtime limits apply. ### Retrieval-Augmented Generation (RAG) @@ -86,25 +102,18 @@ Retrieval-Augmented Generation (RAG) : A pattern where an LLM answers questions by first retrieving relevant passages from a corpus, then generating a response grounded in those passages. 
This reduces hallucination and allows citation of sources. Chunking -: The process of breaking a large document into smaller, overlapping text segments so that each segment can be independently embedded and retrieved. Common strategies include fixed-character, sentence-level, and paragraph-level chunking. +: The process of breaking a large document into smaller, overlapping text segments so that each segment can be independently embedded and retrieved. Embedding -: A dense numerical vector (array of floats) that represents the semantic meaning of a piece of text. Texts with similar meanings produce vectors that are close together in the embedding space, enabling search by meaning rather than exact keywords. - -Vector Similarity / Cosine Similarity -: A measure of how similar two embedding vectors are. Cosine similarity ranges from -1 (opposite) to 1 (identical direction). In RAG, it's used to rank which corpus chunks are most relevant to a user's query. - -Nearest Neighbors -: An algorithm that finds the data points (embeddings) closest to a given query point in vector space. Used in RAG to retrieve the top-k most relevant chunks for a user's question. - -Grounding -: The practice of constraining an LLM's response to information present in the retrieved context, rather than allowing it to generate answers from its general training data. Grounding reduces hallucination and improves factual accuracy. +: A dense numerical vector (array of floats) that represents the semantic meaning of a piece of text. Texts with similar meanings produce vectors that are close together in the embedding space. -Task Type (Embedding) -: A parameter passed to embedding models like `gemini-embedding-001` that tells the model to optimize its output for a specific use case. Common values: `RETRIEVAL_DOCUMENT` (for corpus text being indexed) and `RETRIEVAL_QUERY` (for user questions being searched). +Cosine Similarity +: A measure of how similar two embedding vectors are. 
Ranges from -1 (opposite) to 1 (identical direction). Used to rank which corpus chunks are most relevant to a query. ## Additional Resources -- [Compute for ML](compute-for-ML.html) — guide to choosing machine types and GPUs -- [UW-Madison Cloud Resources](uw-madison-cloud-resources.html) — discounts, credits, and campus support for UW researchers -- [Using a GitHub PAT in Vertex AI](github-pat.html) — pushing/pulling code from Workbench notebooks +- [Compute for ML](compute-for-ML.html) — guide to choosing hardware and GPUs on CHTC +- [UW-Madison CHTC Resources](uw-madison-chtc-resources.html) — CHTC support, GPU Lab, and campus computing options +- [Using a GitHub PAT](github-pat.html) — pushing/pulling code from CHTC +- [CHTC Guides](https://chtc.cs.wisc.edu/uw-research-computing/guides) — official CHTC documentation +- [HTCondor Manual](https://htcondor.readthedocs.io/en/latest/) — complete HTCondor reference diff --git a/learners/setup.md b/learners/setup.md index d02e8058..1ec77c1c 100644 --- a/learners/setup.md +++ b/learners/setup.md @@ -6,58 +6,77 @@ title: Setup Before attending this workshop, you'll need to complete a few setup steps to ensure you can follow along smoothly. The main requirements are: -1. **GCP Access** – Use the **shared Google Cloud project** provided by RCI and ML+X (standard for UW-Madison workshops) or sign up for a personal GCP Free Tier account. -2. **Titanic Dataset** – Download the required CSV files in advance. -3. **(Optional) Google Cloud Skills Boost** — For a broader overview of GCP, visit the [Getting Started with Google Cloud Fundamentals](https://www.cloudskillsboost.google/paths/8) course. +1. **CHTC Account** — Request a CHTC account if you don't already have one. +2. **SSH Access** — Ensure you can SSH into a CHTC submit node. +3. **Titanic Dataset** — The required CSV files are included in the workshop repository. 4. **(Optional) GitHub Account** — Only needed if you want to push your work back to a fork. 
See the [GitHub PAT guide](github-pat.html) for details. Details on each step are outlined below. -### 2. GCP Access +### 1. CHTC Account -There are two ways to get access to GCP for this lesson. Please wait for a pre-workshop email from the instructor to confirm which option to choose. +You need an active CHTC account to participate in this workshop. There are two scenarios: -#### Option 1) Shared Google Cloud Project (UW-Madison workshops) +#### Option A) You already have a CHTC account -When this workshop is taught at UW-Madison (e.g., Machine Learning Marathon, Research Bazaar), the instructors provide access to a shared GCP project courtesy of **RCI (Research Cyberinfrastructure)** and **ML+X**. You do not need to set up your own account or billing. +If you've used CHTC before, you're all set. Verify you can log in by running: -**How access works:** The instructors will add your Google account to a **Google Group** that has the necessary permissions on the shared project. Once you're added, GCP will recognize your membership and grant access — this can take **5–15 minutes** to propagate. If possible, the instructors will add you the day before the workshop so everything is ready at start time. +```bash +ssh YOUR_NETID@ap2002.chtc.wisc.edu +``` -What to expect: +If you can't connect, contact [chtc@cs.wisc.edu](mailto:chtc@cs.wisc.edu) for help. -* During the lesson, you will log in with your Google account credentials and select the shared GCP project. -* If you can't see the project right away, wait a few minutes for permissions to propagate. Try refreshing the page or opening an **incognito/private browser window**. Make sure you're logged into the correct Google account (the one the instructors added to the group). -* This setup ensures that all participants have a consistent environment and avoids unexpected billing for attendees. -* Please use shared credits responsibly — they are limited and reused for future training events. 
- * Stay within the provided exercises and avoid launching additional compute-heavy workloads (e.g., training large language models). - * Do not enable additional APIs or services unless instructed. +#### Option B) You need a new account -#### Option 2) GCP Free Tier — Skip If Using Shared Project +1. Visit the [CHTC account request page](https://chtc.cs.wisc.edu/uw-research-computing/form). +2. Fill out the request form. Mention that you're attending an ML workshop if asked about your use case. +3. **Submit your request at least 1 week before the workshop** — account creation requires manual review by CHTC staff. +4. Once approved, you'll receive an email with login instructions. -If the instructors aren't providing a shared account environment, please follow these instructions: +::::::::::::::::::::::::::::::::::::: callout -1. Go to the [GCP Free Tier page](https://cloud.google.com/free) and click **Get started for free**. -2. Complete the signup process. The Free Tier includes a `$300` credit valid for 90 days and ongoing free usage for some smaller services. -3. Once your account is ready, log in to the [Google Cloud Console](https://console.cloud.google.com/). -4. During the lesson, we will enable only a few APIs (Compute Engine, Cloud Storage, and Notebooks). +### Workshop-specific accounts -Following the lesson should cost well under `$15` total if you are using your own credits. +When this workshop is taught at UW-Madison (e.g., Machine Learning Marathon, Research Bazaar), the instructors may provide temporary shared accounts or coordinate bulk account creation with CHTC. Wait for a pre-workshop email from the instructor to confirm the setup process. -### 3. Download the Data +:::::::::::::::::::::::::::::::::::::::::::::::: -For this workshop, you will need the **Titanic dataset**, which can be used to train a classifier predicting survival. +### 2. SSH Access -1. 
Please download the following zip file (Right-click → Save as): - [data.zip](https://raw.githubusercontent.com/qualiaMachine/Intro_GCP_for_ML/main/data/data.zip) +You'll need an SSH client to connect to CHTC: -2. Extract the zip folder contents (Right-click → Extract all on Windows; double-click on macOS). +- **macOS/Linux**: Use the built-in Terminal app. SSH is pre-installed. +- **Windows**: Use [Windows Terminal](https://aka.ms/terminal) (Windows 10+, SSH built-in), [PuTTY](https://www.chiark.greenend.org.uk/~sgtatham/putty/), or [MobaXterm](https://mobaxterm.mobatek.net/). -3. Save the two data files (train and test) somewhere easy to access, for example: - - `~/Downloads/data/titanic_train.csv` - - `~/Downloads/data/titanic_test.csv` +Test your connection before the workshop: -In Episode 3, you will create a Cloud Storage bucket and upload this data to use with your notebook. +```bash +ssh YOUR_NETID@ap2002.chtc.wisc.edu +``` -### 4. (Optional) Google Cloud Skills Boost — Getting Started with Google Cloud Fundamentals +You'll need to authenticate with your UW-Madison NetID and password. If you're off-campus, you may need to use the [UW-Madison VPN](https://it.wisc.edu/services/wiscvpn/) first. -If you want a broader introduction to GCP before the workshop, consider exploring the [Getting Started with Google Cloud Fundamentals](https://www.cloudskillsboost.google/paths/8) self-paced learning path. It covers the basics of the Google Cloud environment, including project structure, billing, IAM (Identity and Access Management), and common services like Compute Engine, Cloud Storage, and BigQuery. This step is optional but recommended for those that want a broader overview of GCP before diving into ML/AI use-cases. +### 3. Workshop Data + +The Titanic dataset and other workshop files are included in the lesson repository. 
During the workshop, you'll clone the repo directly on the CHTC submit node: + +```bash +git clone https://github.com/qualiaMachine/Intro_GCP_for_ML.git +``` + +The repository contains: +- `data/data.zip` — Titanic dataset (titanic_train.csv, titanic_test.csv) +- `data/pdfs_bundle.zip` — Research papers for the RAG episode +- `scripts/` — Training scripts (train_xgboost.py, train_nn.py) +- `submit_files/` — HTCondor submit file examples + +### 4. (Optional) Familiarize Yourself with CHTC + +If you want a broader introduction to CHTC before the workshop, explore: + +- [CHTC Getting Started Guide](https://chtc.cs.wisc.edu/uw-research-computing/guides) +- [HTCondor Quick Start Tutorial](https://htcondor.readthedocs.io/en/latest/getting-htcondor/using-htcondor-first-time.html) +- [CHTC Hello World Example](https://chtc.cs.wisc.edu/uw-research-computing/helloworld) + +This is optional but recommended for those who want to get familiar with the command line and job submission before the workshop. diff --git a/learners/uw-madison-chtc-resources.md b/learners/uw-madison-chtc-resources.md new file mode 100644 index 00000000..6ceabb32 --- /dev/null +++ b/learners/uw-madison-chtc-resources.md @@ -0,0 +1,75 @@ +--- +title: UW-Madison CHTC Resources +--- + +This page collects UW-Madison-specific CHTC and research computing resources, contacts, and related services relevant to ML/AI researchers. It is meant as a companion to the workshop material and a starting point for learners who want to continue using CHTC after the workshop. + +## About CHTC + +The [Center for High Throughput Computing (CHTC)](https://chtc.cs.wisc.edu/) is a research computing center at UW-Madison that provides large-scale computing resources to the campus community at **no cost** to UW researchers. CHTC is home to the [HTCondor project](https://htcondor.org/), the workload management system used to schedule and manage jobs. 
+ +### Key resources + +- **High Throughput Computing (HTC)** — Thousands of CPU cores for running many independent jobs in parallel. Ideal for hyperparameter sweeps, data preprocessing, and embarrassingly parallel workloads. +- **GPU Lab** — Hundreds of GPUs including NVIDIA A100 (40/80 GB), H100 (80 GB), H200 (141 GB), L40, and T4. Supports ML model training, fine-tuning, and inference. +- **High Performance Computing (HPC)** — A dedicated cluster for tightly coupled parallel workloads (MPI, multi-node training). Contact CHTC if your workload requires this. +- **Large-scale data staging** — `/staging` and SQUID provide storage for datasets too large for `/home`. + +### How to get an account + +1. Visit the [CHTC Account Request Form](https://chtc.cs.wisc.edu/uw-research-computing/form). +2. Fill out the form describing your research and computing needs. +3. CHTC staff will review your request and set up your account (typically within a few business days). + +### Getting help + +CHTC offers multiple support channels: + +- **Email**: [chtc@cs.wisc.edu](mailto:chtc@cs.wisc.edu) — the best way to get help with specific issues. +- **Office hours**: CHTC holds weekly drop-in office hours. Check the [CHTC website](https://chtc.cs.wisc.edu/) for the current schedule. +- **Research facilitation**: CHTC facilitators can help you design your workflow, optimize job submissions, and troubleshoot issues. Request a consultation via email. +- **Documentation**: [CHTC Guides](https://chtc.cs.wisc.edu/uw-research-computing/guides) cover everything from getting started to advanced workflows. + +## CHTC vs. cloud computing + +CHTC and cloud platforms (AWS, GCP, Azure) serve different needs. 
Here's a quick comparison: + +| Factor | CHTC | Cloud (GCP/AWS/Azure) | +|--------|------|----------------------| +| **Cost** | Free for UW researchers | Pay per hour | +| **GPU access** | Shared queue; wait times during peak periods | On-demand (subject to quota) | +| **Hardware variety** | A100, H100, H200, L40, T4 | Latest GPUs immediately available | +| **Scaling** | Hundreds of parallel jobs | Essentially unlimited | +| **Multi-GPU / NVLink** | Limited multi-GPU support | Available on demand | +| **Software environment** | Docker containers | Managed containers + cloud services | +| **Managed services** | None (you run your own code) | Managed training, tuning, deployment | +| **Data storage** | /home, /staging, SQUID | Cloud storage (S3, GCS) | +| **Runtime limits** | 12 hrs / 24 hrs / 7 days | No limits (pay for what you use) | + +**Recommendation:** Start with CHTC — it's free and covers most research ML/AI workloads. Move to cloud when you need managed services, unlimited scaling, or hardware beyond what CHTC offers. + +## Other on-campus compute options + +### BadgerCompute + +[BadgerCompute](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/BadgerCompute.html) is a lightweight, NetID-authenticated Jupyter notebook service available to UW-Madison users. It is suitable for quick prototyping and small-scale work. + +### Google Colab + +[Google Colab](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/GoogleColab.html) provides free cloud-based Jupyter notebooks with optional GPU access. Useful for quick experiments and teaching. + +### Cloud platforms + +If you need cloud computing, UW-Madison has institutional contracts with AWS, GCP, and Azure that provide negotiated pricing and reduced overhead on grants. See the [UW Cloud Services page](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/UW-Cloud-Services.html) for details. 
+ +## Community and training + +- **ML+X Community** — Join [ML+X](https://uw-madison-datascience.github.io/ML-X-Nexus/) for monthly meetings on machine learning and AI at UW-Madison. Contact [endemann@wisc.edu](mailto:endemann@wisc.edu) or join the `#ml-community` channel in the [Data Science Hub Slack](https://hub.datascience.wisc.edu/). +- **CHTC Training** — CHTC periodically offers workshops on HTCondor, GPU computing, and research computing workflows. Check the [CHTC website](https://chtc.cs.wisc.edu/) for upcoming events. +- **RCI** — The [Research Cyberinfrastructure](https://it.wisc.edu/about/division-of-information-technology/research-cyberinfrastructure/) team can help with architecture design and comparing compute options. Email [rci@g-groups.wisc.edu](mailto:rci@g-groups.wisc.edu). + +## Related resources + +- [Intro to GCP for ML & AI](https://uw-madison-datascience.github.io/ML-X-Nexus/Learn/Workshops/Intro-GCP.html) — Hands-on workshop covering Vertex AI, model training/tuning, and RAG on Google Cloud. +- [Intro to AWS SageMaker for Predictive ML/AI](https://uw-madison-datascience.github.io/ML-X-Nexus/Learn/Workshops/Intro-Amazon_SageMaker.html) — Workshop covering ML workflows in AWS SageMaker. +- [CHTC on Nexus](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/CHTC.html) — Overview of CHTC resources and how to get started. diff --git a/learners/uw-madison-cloud-resources.md b/learners/uw-madison-cloud-resources.md deleted file mode 100644 index d667683b..00000000 --- a/learners/uw-madison-cloud-resources.md +++ /dev/null @@ -1,149 +0,0 @@ ---- -title: UW-Madison Cloud Resources ---- - -This page collects UW-Madison-specific cloud computing resources, contacts, and funding opportunities relevant to ML/AI researchers. It is meant as a companion to the workshop material and a starting point for learners who want to continue using cloud resources after the workshop. 
- -Much of this information is drawn from the [ML+X Nexus UW Cloud Services page](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/UW-Cloud-Services.html) — check there for the most up-to-date version. - -## Cloud platforms at UW-Madison - -UW-Madison has institutional contracts with three public cloud vendors: - -- **Amazon Web Services (AWS)** — [Service page](https://it.wisc.edu/services/amazon-web-services/) | [Pricing & billing FAQ](https://kb.wisc.edu/data/page.php?id=65532) -- **Google Cloud Platform (GCP)** — [Service page](https://it.wisc.edu/services/google-cloud-platform/) | [Pricing](https://kb.wisc.edu/100173) | [Requesting a project](https://kb.wisc.edu/data/100171) -- **Microsoft Azure** — [Service page](https://it.wisc.edu/services/microsoft-azure/) | [Pricing](https://kb.wisc.edu/69212) - -These services are managed by the [UW Public Cloud Team](https://kb.wisc.edu/page.php?id=109785), a cross-disciplinary group of operations, cybersecurity, and research cyberinfrastructure (RCI) professionals. - -## Why use a UW-provisioned account? - -A self-provisioned cloud account (one you create directly with Google or AWS) is a personal agreement between you and the vendor — it is **not** covered by UW-Madison's institutional contracts. Going through the UW Public Cloud Team gives you: - -- **Negotiated pricing** via [Internet2 NET+](https://internet2.edu/cloud/cloud-solutions-community/net-plus/) agreements. GCP accounts include a [network egress waiver](https://kb.wisc.edu/100173) (up to 15% of your total bill); Azure accounts receive ~3.5% off retail pricing. -- **Lower overhead on grants** — Cloud expenses normally carry 55.5% F&A overhead. With a UW cloud account that drops to **26%**, saving ~ `$2,950` per `$10,000` spent. See the [Cloud Computing Pilot](https://rsp.wisc.edu/proposalprep/cloudComputeInfo.cfm). -- **NIH STRIDES discounts** — Additional pricing reductions for NIH-funded researchers, layered on top of UW rates. 
See [STRIDES at UW-Madison](https://kb.wisc.edu/109813). -- **Business Associates Agreement (BAA)** — UW's contracts include a BAA that governs vendor access to your data, which is critical for HIPAA-regulated health data. -- **Security monitoring** — UW accounts benefit from Security Command Center monitoring with alerts escalated to the UW Cybersecurity Operations Team (CSOC). -- **Baseline security configuration** — Accounts come pre-configured to meet [CIS benchmark](https://www.cisecurity.org/cis-benchmarks) standards with NetID authentication built in. -- **Dedicated support** — Email [cloud-services@cio.wisc.edu](mailto:cloud-services@cio.wisc.edu), attend [office hours](https://kb.wisc.edu/101516), or schedule a consultation. - -For the full breakdown, see [Why Should I Use a UW Madison Public Cloud Account?](https://kb.wisc.edu/page.php?id=109785) on the UW KnowledgeBase. - -## How to request a UW cloud account - -1. **Get a DoIT Billing Customer ID** to tie cloud usage to a funding source. -2. **Fill out the [UW-Madison Cloud Account Request Form](https://kb.wisc.edu/sbsedirbs/page.php?id=104090)** — covers AWS, GCP, and Azure. -3. **For sensitive/restricted data** — complete a [Cybersecurity risk assessment](https://kb.wisc.edu/115296) before processing HIPAA, FERPA, or other regulated data. - -## Research funding and credits - -### Reduced F&A on grants (Cloud Computing Pilot) - -The [Cloud Computing Pilot](https://rsp.wisc.edu/proposalprep/cloudComputeInfo.cfm) reduces overhead from 55.5% to 26% on cloud expenses when using a UW-provisioned account. This applies to new proposals and awards. Costs paid via purchasing card or personal accounts are charged the full rate. RSP provides [budget templates](https://rsp.wisc.edu/proposalprep/cloudComputeInfo.cfm) for proposals. - -### NIH STRIDES Initiative - -NIH-funded researchers get negotiated pricing on GCP, AWS, and Azure services through the [STRIDES Initiative](https://kb.wisc.edu/109813). 
Discounts are provided via program resellers (Carahsoft for GCP, Four Points Technology for AWS) and vary by service — exact rates are shared through [STRIDES price tables](https://cloud.nih.gov/) rather than published publicly. The UW cloud team can transition accounts in or out of STRIDES at any time with no data migration. Contact [STRIDES@nih.gov](mailto:STRIDES@nih.gov) for pricing details. - -### Google Cloud Research Credits - -Google offers up to **`$5,000` in cloud credits** for faculty, postdoctoral, and non-profit researchers (up to `$1,000` for PhD students). - -- [Apply for Google Cloud Research Credits](https://edu.google.com/intl/ALL_us/programs/credits/research/) -- Applications accepted on a rolling basis; decisions typically take 6–8 weeks. - -### AWS Cloud Credit for Research - -AWS offers promotional credits for academic researchers through its [Cloud Credit for Research](https://aws.amazon.com/government-education/research-and-technical-computing/cloud-credit-for-research/) program. - -- **Students**: up to **`$5,000`** in AWS credits. -- **Faculty and staff**: award amounts vary by proposal (no fixed cap). -- Applications are reviewed on a rolling basis; typical review cycles are 90–120 days. -- Credits are valid for 1 year from issuance or until fully used. - -### Azure Research Credits - -Microsoft offers several credit programs for academic researchers: - -- **[Azure for Students](https://azure.microsoft.com/en-us/free/students/)**: **`$100`** in credits (12 months), no credit card required. -- **[Azure Research Credits](https://www.microsoft.com/en-us/azure-academic-research/)**: Open to faculty and researchers for proof-of-concept, migration, or tool-building projects. -- **[Internet2 Azure Accelerator Program](https://internet2.edu/)**: Up to **`$5,000`** in credits for research and education proposals (365-day expiration). 
- -### Grants for social impact & sustainability research - -The major cloud providers also offer larger grants for research focused on public good — sustainability, environmental science, public health, education, and underserved communities: - -- **Google**: The [Google.org Impact Challenge: AI for Science](https://opportunitydesk.org/2026/02/25/google-org-impact-challenge-ai-for-science-2026/) awards $500K–$3M for projects using AI to tackle scientific challenges, with a focus on climate resilience and environmental science. -- **AWS**: The [AWS Imagine Grant](https://aws.amazon.com/government-education/nonprofits/aws-imagine-grant-program/) provides up to `$200K` in unrestricted funding plus AWS credits to nonprofits and research organizations working on social impact. -- **Microsoft**: The [AI for Good Lab](https://www.microsoft.com/en-us/research/academic-program/ai-for-good-lab-open-call/) runs open calls awarding Azure credits and scientific collaboration for projects in sustainability, public health, education, and human rights. Microsoft also offers free access to petabytes of environmental data through the [Planetary Computer](https://planetarycomputer.microsoft.com/). - -### Free cloud training - -Each platform offers free, self-paced training to help you get started: - -- **GCP**: UW-Madison has a limited number of seats for [Google Cloud Skills Boost](https://www.cloudskillsboost.google/) — contact the Public Cloud Team at [cloud-services@cio.wisc.edu](mailto:cloud-services@cio.wisc.edu) to request access. -- **AWS**: [AWS Skill Builder](https://skillbuilder.aws/) offers 600+ free courses covering compute, ML, and more. -- **Azure**: [Microsoft Learn](https://learn.microsoft.com/en-us/training/azure/) provides free, structured learning paths for Azure services. - -## Data protection and compliance - -Cloud eligibility depends on your data classification: - -| Data type | Cloud eligible? 
| Requirements | -|-----------|----------------|--------------| -| Public / Internal | Yes | Standard UW cloud account | -| Sensitive | Yes, with assessment | [Cybersecurity risk assessment](https://kb.wisc.edu/115296) required | -| Restricted (HIPAA, etc.) | Yes, with assessment | Risk assessment + risk executive approval + HIPAA-eligible services | - -Key compliance resources: - -- [Data classification policy](https://kb.wisc.edu/itpolicy/page.php?id=59205) -- [Data elements allowed in public cloud](https://kb.wisc.edu/100124) -- [GCP for sensitive and restricted data](https://kb.wisc.edu/115296) -- [Shared responsibility model](https://kb.wisc.edu/data/page.php?id=115300) -- [HIPAA Security Program](https://it.wisc.edu/about/division-of-information-technology/enterprise-information-security-services/office-of-cybersecurity/hipaa-security-program/) -- SMPH researchers using Azure: contact [platformx-support@mailplus.wisc.edu](mailto:platformx-support@mailplus.wisc.edu) about [Platform X](https://it.wisc.edu/services/microsoft-azure/) for HIPAA workloads. - -## On-campus compute alternatives - -Cloud is not the only option. UW-Madison offers several on-campus resources that are **free for UW researchers**: - -### Center for High Throughput Computing (CHTC) - -[CHTC](https://chtc.cs.wisc.edu/) is UW-Madison's core research computing center, providing access to 20,000+ CPU cores and hundreds of GPUs (including A100s) at no cost to UW researchers. Key features: - -- **GPU Lab** — Supports up to dozens of concurrent GPU jobs per user, including 40 GB and 80 GB A100s, with runtimes from hours to seven days. -- **Research facilitation** — Personalized consultations, online guides, and drop-in office hours to help you get started. -- **HTCondor** — CHTC's job scheduler lets you submit large batches of independent training runs (e.g., hyperparameter sweeps) across many machines. 
- -CHTC is a strong choice for researchers who need GPU access but do not need cloud-specific services like managed APIs or cloud storage. - -For more details, see the [CHTC page on Nexus](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/CHTC.html). - -### BadgerCompute - -[BadgerCompute](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/BadgerCompute.html) is a lightweight, NetID-authenticated Jupyter notebook service available to UW-Madison users. It is suitable for quick prototyping and small-scale work without spinning up cloud resources. - -### Google Colab - -[Google Colab](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/GoogleColab.html) provides free cloud-based Jupyter notebooks with optional GPU access. It is not a UW service, but it is a useful option for quick experiments and teaching. - -## Getting help - -- **Office hours** — The RCI and Public Cloud Team hold drop-in hours on **Thursdays, 2–3:15 PM** via [Zoom](https://kb.wisc.edu/101516). Open to the entire UW community. -- **Cloud Community** — Join the [UW Cloud Community](https://it.wisc.edu/research-ci/building-cloud-community-at-uw-madison/) group — they meet every other month to share cloud computing experiences and tips. -- **Email** — [cloud-services@cio.wisc.edu](mailto:cloud-services@cio.wisc.edu) -- **Public Cloud KnowledgeBase** — [kb.wisc.edu](https://kb.wisc.edu/page.php?id=109785) — FAQs, pricing info, and how-to guides. -- **ML+X Community** — Join [ML+X](https://uw-madison-datascience.github.io/ML-X-Nexus/) for monthly meetings on machine learning and AI at UW-Madison. Contact [endemann@wisc.edu](mailto:endemann@wisc.edu) or join the `#ml-community` channel in the [Data Science Hub Slack](https://hub.datascience.wisc.edu/). 
-- **RCI** — The [Research Cyberinfrastructure](https://it.wisc.edu/about/division-of-information-technology/research-cyberinfrastructure/) team can help with architecture design, cost estimates, and comparing cloud vs. on-premises options. Email [rci@g-groups.wisc.edu](mailto:rci@g-groups.wisc.edu). - -## Related resources - -- [Intro to GCP for ML & AI](https://uw-madison-datascience.github.io/ML-X-Nexus/Learn/Workshops/Intro-GCP.html) — Hands-on workshop covering Vertex AI, model training/tuning, and RAG with Gemini on GCP. -- [Intro to AWS SageMaker for Predictive ML/AI](https://uw-madison-datascience.github.io/ML-X-Nexus/Learn/Workshops/Intro-Amazon_SageMaker.html) — Workshop covering ML workflows in AWS SageMaker. -- [Google Colab](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/GoogleColab.html) — Free cloud-based Jupyter notebooks with GPU access. -- [Center for High Throughput Computing (CHTC)](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/CHTC.html) — Free on-campus HPC/HTC resources for UW researchers. -- [BadgerCompute](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/Compute/BadgerCompute.html) — UW-Madison's lightweight, NetID-authenticated Jupyter service. -- [UW Generative AI Services & Policies](https://uw-madison-datascience.github.io/ML-X-Nexus/Toolbox/GenAI/GenAI-at-UW-Madison.html) — Overview of UW-vetted AI tools including pay-as-you-go cloud AI services. -- [Introduction to AWS for Researchers (RCI)](https://researchci.it.wisc.edu/introduction-to-aws-for-researchers/) — RCI's guide for getting started with AWS. diff --git a/links.md b/links.md index c8fcef18..b683fb68 100644 --- a/links.md +++ b/links.md @@ -4,10 +4,9 @@ any links that you are not going to use. 
--> [carpentries-workbench]: https://carpentries.github.io/sandpaper-docs/ -[vertex-ai-docs]: https://cloud.google.com/vertex-ai/docs -[gcs-pricing]: https://cloud.google.com/storage/pricing -[compute-pricing]: https://cloud.google.com/compute/vm-instance-pricing -[vertex-training-containers]: https://cloud.google.com/vertex-ai/docs/training/pre-built-containers -[vertex-model-garden]: https://cloud.google.com/vertex-ai/docs/start/explore-models -[github-pat-docs]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens - +[chtc-home]: https://chtc.cs.wisc.edu/ +[chtc-gpu-lab]: https://chtc.cs.wisc.edu/uw-research-computing/gpu-lab +[chtc-file-avail]: https://chtc.cs.wisc.edu/uw-research-computing/file-avail-largedata +[chtc-docker]: https://chtc.cs.wisc.edu/uw-research-computing/docker-jobs +[htcondor-manual]: https://htcondor.readthedocs.io/en/latest/ +[htcondor-dagman]: https://htcondor.readthedocs.io/en/latest/automated-workflows/index.html diff --git a/profiles/learner-profiles.md b/profiles/learner-profiles.md index cb35d163..28ad3618 100644 --- a/profiles/learner-profiles.md +++ b/profiles/learner-profiles.md @@ -4,12 +4,12 @@ title: Learner Profiles ### Alex — Graduate Researcher in Biology -Alex is a second-year PhD student who trains random forest and XGBoost models on tabular genomics data using scikit-learn on their laptop. Their datasets are growing beyond what fits in RAM, and their advisor has suggested moving to cloud compute. Alex has basic Python skills and has heard of GCP but has never used it. They want to learn how to store data in the cloud, run training jobs without babysitting a notebook, and keep costs under control. +Alex is a second-year PhD student who trains random forest and XGBoost models on tabular genomics data using scikit-learn on their laptop. Their datasets are growing beyond what fits in RAM, and their advisor has suggested using CHTC. 
Alex has basic Python skills and has heard of HTCondor but has never used it. They want to learn how to run training jobs on shared hardware without babysitting a terminal session. ### Jordan — Data Scientist at a Research Lab -Jordan has 3 years of experience training deep learning models with PyTorch on a local GPU workstation. They are comfortable with the command line and Git. Their lab has GCP credits and wants to scale up hyperparameter tuning for a new project. Jordan needs to learn how to submit managed training jobs, attach GPUs, and compare tuning trial results without managing infrastructure manually. +Jordan has 3 years of experience training deep learning models with PyTorch on a local GPU workstation. They are comfortable with the command line and Git. Their lab wants to scale up hyperparameter tuning using CHTC's GPU Lab. Jordan needs to learn how to submit GPU jobs, run parallel tuning sweeps, and collect results across many HTCondor jobs. ### Sam — Postdoc Exploring LLMs for Literature Review -Sam is a postdoc in environmental science who wants to use retrieval-augmented generation (RAG) to extract information from research papers. They have intermediate Python skills and have used Jupyter notebooks extensively, but have no cloud experience. Sam is primarily interested in the RAG episode but needs the foundational GCP knowledge from earlier episodes to set up their environment and manage costs responsibly. +Sam is a postdoc in environmental science who wants to use retrieval-augmented generation (RAG) to extract information from research papers. They have intermediate Python skills and have used Jupyter notebooks extensively, but have no HTC experience. Sam is primarily interested in the RAG episode but needs the foundational CHTC knowledge from earlier episodes to set up their environment and submit jobs. 
"""Aggregate hyperparameter tuning results from multiple HTCondor jobs.

This script reads metrics_*.json files from a results directory and prints
a summary table sorted by validation accuracy. Used in Episode 06.

Usage:
    python aggregate_results.py --results-dir results/
"""

import argparse
import json
import os
import sys

# Placeholder printed for any metric a trial did not report.
_MISSING = "N/A"


def _is_number(value):
    """Return True for real int/float values (bool is deliberately excluded)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _fmt(value, spec=""):
    """Format ``value`` with ``spec`` if it is numeric, else return it as text.

    Applying a float spec such as ".4f" to the "N/A" placeholder raises
    ValueError, so non-numeric values bypass the spec entirely.
    """
    if _is_number(value):
        return format(value, spec)
    return _MISSING if value is None else str(value)


def _accuracy_key(result):
    """Sort key: validation accuracy, with missing/NaN values ranked last."""
    value = result.get("final_val_accuracy")
    # `value == value` is False only for NaN, which would corrupt the sort order.
    if _is_number(value) and value == value:
        return float(value)
    return float("-inf")


def _load_results(results_dir):
    """Load every metrics_*.json object in ``results_dir`` (sorted by name).

    Unreadable, malformed, or non-object files are skipped with a warning on
    stderr so one bad trial does not hide the rest of the sweep.
    """
    results = []
    for fname in sorted(os.listdir(results_dir)):
        if not (fname.startswith("metrics_") and fname.endswith(".json")):
            continue
        path = os.path.join(results_dir, fname)
        try:
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:  # ValueError covers JSONDecodeError
            print(f"Warning: skipping {fname}: {exc}", file=sys.stderr)
            continue
        if not isinstance(data, dict):
            print(f"Warning: skipping {fname}: expected a JSON object", file=sys.stderr)
            continue
        data["file"] = fname
        results.append(data)
    return results


def main():
    """Parse CLI arguments, then print the results table and the best trial.

    Exits with status 1 when the results directory does not exist or holds
    no usable metrics_*.json files.
    """
    parser = argparse.ArgumentParser(description="Aggregate HP tuning results")
    parser.add_argument(
        "--results-dir",
        type=str,
        default="results",
        help="Directory containing metrics_*.json files",
    )
    args = parser.parse_args()

    if not os.path.isdir(args.results_dir):
        print(f"Results directory not found: {args.results_dir}")
        sys.exit(1)

    results = _load_results(args.results_dir)
    if not results:
        print(f"No metrics_*.json files found in {args.results_dir}")
        sys.exit(1)

    # Sort by validation accuracy (descending); the sort is stable, so ties
    # keep their filename order.
    results.sort(key=_accuracy_key, reverse=True)

    # Print summary table
    print(f"{'File':<40} {'Val Acc':>8} {'Val Loss':>9} {'LR':>10} {'Patience':>9} {'Best Epoch':>11}")
    print("-" * 90)
    for r in results:
        columns = [
            str(r["file"]).ljust(40),
            _fmt(r.get("final_val_accuracy"), ".4f").rjust(8),
            _fmt(r.get("final_val_loss"), ".4f").rjust(9),
            _fmt(r.get("learning_rate"), ".6f").rjust(10),
            _fmt(r.get("patience")).rjust(9),
            _fmt(r.get("best_epoch")).rjust(11),
        ]
        print(" ".join(columns))

    best = results[0]
    print(f"\nBest trial: {best['file']}")
    print(f"  Validation accuracy: {_fmt(best.get('final_val_accuracy'), '.4f')}")
    print(f"  Learning rate: {_fmt(best.get('learning_rate'))}")
    print(f"  Patience: {_fmt(best.get('patience'))}")


if __name__ == "__main__":
    main()
"""Evaluate a trained XGBoost model on the Titanic test set.

Usage:
    python evaluate_model.py
    python evaluate_model.py --model xgboost-model --test titanic_test.csv
    python evaluate_model.py --output evaluation_results.json --threshold 0.5
"""

import argparse
import json

import pandas as pd


def preprocess_data(df):
    """Turn a raw Titanic DataFrame into a feature frame and a label series.

    Works on a copy, so the caller's DataFrame is left untouched. Missing
    ``Age`` values are filled with the column median and missing ``Embarked``
    values with the column mode; identifier-like columns are dropped when
    present; ``Sex`` and ``Embarked`` are mapped to integer codes.

    NOTE(review): this is meant to mirror the preprocessing in
    train_xgboost.py, which is not visible here — keep the two in sync.

    Args:
        df: DataFrame with at least ``Age``, ``Embarked``, ``Sex`` and
            ``Survived`` columns.

    Returns:
        Tuple ``(X, y)``: ``X`` is ``df`` without ``Survived`` and the dropped
        columns, ``y`` is the ``Survived`` column.

    Raises:
        KeyError: if a required column (e.g. ``Survived``) is missing.
    """
    df = df.copy()
    df["Age"] = df["Age"].fillna(df["Age"].median())
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
    df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")
    # Values outside these mappings become NaN.
    df["Sex"] = df["Sex"].map({"male": 1, "female": 0})
    df["Embarked"] = df["Embarked"].map({"S": 0, "C": 1, "Q": 2})
    X = df.drop(columns=["Survived"])
    y = df["Survived"]
    return X, y


def main():
    """Load the model, score the test CSV, and write accuracy to a JSON file."""
    parser = argparse.ArgumentParser(description="Evaluate XGBoost model")
    parser.add_argument("--model", type=str, default="xgboost-model")
    parser.add_argument("--test", type=str, default="titanic_test.csv")
    parser.add_argument(
        "--output",
        type=str,
        default="evaluation_results.json",
        help="Path of the JSON file to write evaluation results to",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.5,
        help="Predictions strictly above this value are counted as class 1",
    )
    args = parser.parse_args()

    # Third-party dependencies are imported lazily so preprocess_data() can be
    # imported and tested with only pandas installed.
    import joblib
    import xgboost as xgb
    from sklearn.metrics import accuracy_score

    # Load model
    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files produced by your own training jobs.
    model = joblib.load(args.model)

    # Load and preprocess test data
    test_df = pd.read_csv(args.test)
    X_test, y_test = preprocess_data(test_df)

    # Predict
    dtest = xgb.DMatrix(X_test)
    y_pred = model.predict(dtest)
    y_pred_binary = (y_pred > args.threshold).astype(int)

    # Compute accuracy
    acc = accuracy_score(y_test, y_pred_binary)
    print(f"Test accuracy: {acc:.4f}")

    # Save results
    results = {"test_accuracy": float(acc), "test_samples": int(len(y_test))}
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to {args.output}")


if __name__ == "__main__":
    main()
"""Preprocess Titanic dataset into .npz files for PyTorch training.

This script reads the raw training CSV, applies preprocessing (encoding,
imputation, scaling), and saves train/val arrays as compressed .npz files.

Usage:
    python preprocess_data.py
    python preprocess_data.py --train titanic_train.csv --test titanic_test.csv
"""

import argparse

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Columns fed to the network, in the order they are written to the arrays.
FEATURE_COLUMNS = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]


def main():
    """Build train/validation arrays from the training CSV and save them.

    Writes ``train_data.npz`` (keys ``X_train``, ``y_train``) and
    ``val_data.npz`` (keys ``X_val``, ``y_val``) to the current directory.
    """
    parser = argparse.ArgumentParser(description="Preprocess Titanic data to .npz")
    parser.add_argument("--train", type=str, default="titanic_train.csv")
    parser.add_argument(
        "--test",
        type=str,
        default="titanic_test.csv",
        help="Accepted for compatibility; this script does not read the test CSV",
    )
    args = parser.parse_args()

    df = pd.read_csv(args.train)

    # Encode categorical features. Label encoding only assigns integer ids to
    # categories, so fitting it on the full frame leaks no statistics.
    df["Sex"] = LabelEncoder().fit_transform(df["Sex"])
    df["Embarked"] = df["Embarked"].fillna("S")
    df["Embarked"] = LabelEncoder().fit_transform(df["Embarked"])

    # Split BEFORE computing medians or scaling statistics so the validation
    # rows never influence what the training data looks like.
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    train_df = train_df.copy()
    val_df = val_df.copy()

    # Impute with medians learned from the training split only.
    for column in ("Age", "Fare"):
        median = train_df[column].median()
        train_df[column] = train_df[column].fillna(median)
        val_df[column] = val_df[column].fillna(median)

    # Fit the scaler on the training split, then reuse it for validation.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_df[FEATURE_COLUMNS].values)
    X_val = scaler.transform(val_df[FEATURE_COLUMNS].values)
    y_train = train_df["Survived"].values
    y_val = val_df["Survived"].values

    # savez_compressed matches the "compressed .npz" contract in the module
    # docstring; np.load reads both compressed and uncompressed archives.
    np.savez_compressed("train_data.npz", X_train=X_train, y_train=y_train)
    np.savez_compressed("val_data.npz", X_val=X_val, y_val=y_val)

    print(f"Train shape: {X_train.shape}, Val shape: {X_val.shape}")
    print("Saved: train_data.npz, val_data.npz")


if __name__ == "__main__":
    main()
HTCondor submit file for model evaluation step +# Used in the DAGMan workflow example (Episode 08) + +universe = docker +docker_image = python:3.10-slim + +executable = run_evaluate.sh + +transfer_input_files = ../scripts/evaluate_model.py, run_evaluate.sh, requirements_xgboost.txt, ../data/titanic_test.csv, xgboost-model +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +log = logs/evaluate_$(Cluster).log +output = logs/evaluate_$(Cluster).out +error = logs/evaluate_$(Cluster).err + +request_cpus = 1 +request_memory = 2GB +request_disk = 1GB + +queue 1 diff --git a/submit_files/hp_tune.sub b/submit_files/hp_tune.sub new file mode 100644 index 00000000..d4654f0d --- /dev/null +++ b/submit_files/hp_tune.sub @@ -0,0 +1,34 @@ +# HTCondor submit file for hyperparameter tuning on CHTC +# Episode 06: Hyperparameter Tuning with HTCondor +# +# This submit file launches one job per line in params.txt. +# Each line should contain: learning_rate,patience,min_delta + +universe = docker +docker_image = pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime + +executable = run_nn.sh +arguments = --train train_data.npz --val val_data.npz --epochs 500 --learning_rate $(lr) --patience $(pat) --min_delta $(delta) + +# Transfer input files to the execute node +transfer_input_files = ../scripts/train_nn.py, run_nn.sh, train_data.npz, val_data.npz + +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +# Remap output files to include job identifiers +transfer_output_remaps = "model.pt = results/model_$(Cluster)_$(Process).pt; metrics.json = results/metrics_$(Cluster)_$(Process).json; eval_history.csv = results/history_$(Cluster)_$(Process).csv" + +# Output, error, and log files +log = logs/tune_$(Cluster)_$(Process).log +output = logs/tune_$(Cluster)_$(Process).out +error = logs/tune_$(Cluster)_$(Process).err + +# Resource requests +request_cpus = 1 +request_memory = 4GB +request_disk = 2GB + +# Submit one job per line in params.txt +# Format: learning_rate, patience, 
min_delta +queue lr,pat,delta from params.txt diff --git a/submit_files/params.txt b/submit_files/params.txt new file mode 100644 index 00000000..1daf7354 --- /dev/null +++ b/submit_files/params.txt @@ -0,0 +1,6 @@ +0.001, 10, 0.001 +0.005, 15, 0.0005 +0.0001, 20, 0.0001 +0.01, 5, 0.001 +0.0005, 10, 0.0005 +0.002, 15, 0.0001 diff --git a/submit_files/preprocess.sub b/submit_files/preprocess.sub new file mode 100644 index 00000000..bdc9588c --- /dev/null +++ b/submit_files/preprocess.sub @@ -0,0 +1,21 @@ +# HTCondor submit file for data preprocessing step +# Used in the DAGMan workflow example (Episode 08) + +universe = docker +docker_image = python:3.10-slim + +executable = run_preprocess.sh + +transfer_input_files = ../scripts/preprocess_data.py, run_preprocess.sh, ../data/titanic_train.csv, ../data/titanic_test.csv +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +log = logs/preprocess_$(Cluster).log +output = logs/preprocess_$(Cluster).out +error = logs/preprocess_$(Cluster).err + +request_cpus = 1 +request_memory = 2GB +request_disk = 1GB + +queue 1 diff --git a/submit_files/requirements_xgboost.txt b/submit_files/requirements_xgboost.txt new file mode 100644 index 00000000..5fcddce2 --- /dev/null +++ b/submit_files/requirements_xgboost.txt @@ -0,0 +1,4 @@ +xgboost==2.1.0 +pandas +scikit-learn +joblib diff --git a/submit_files/run_evaluate.sh b/submit_files/run_evaluate.sh new file mode 100755 index 00000000..337fbfe0 --- /dev/null +++ b/submit_files/run_evaluate.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# Wrapper script for model evaluation on CHTC +set -euo pipefail + +pip install --quiet -r requirements_xgboost.txt +python evaluate_model.py + +echo "Evaluation complete." 
diff --git a/submit_files/run_nn.sh b/submit_files/run_nn.sh new file mode 100755 index 00000000..b87a8c12 --- /dev/null +++ b/submit_files/run_nn.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Wrapper script for PyTorch neural network training on CHTC +# This script runs the training script inside a PyTorch Docker container. + +set -euo pipefail + +# Run the training script, passing all arguments through +python train_nn.py "$@" + +echo "Training complete. Output files:" +ls -la diff --git a/submit_files/run_preprocess.sh b/submit_files/run_preprocess.sh new file mode 100755 index 00000000..7834ef3a --- /dev/null +++ b/submit_files/run_preprocess.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Wrapper script for data preprocessing on CHTC +set -euo pipefail + +pip install --quiet pandas scikit-learn numpy +python preprocess_data.py + +echo "Preprocessing complete. Output files:" +ls -la *.npz diff --git a/submit_files/run_xgboost.sh b/submit_files/run_xgboost.sh new file mode 100755 index 00000000..882ab7e7 --- /dev/null +++ b/submit_files/run_xgboost.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Wrapper script for XGBoost training on CHTC +# This script installs dependencies and runs the training script. + +set -euo pipefail + +# Install Python dependencies +pip install --quiet -r requirements_xgboost.txt + +# Run the training script, passing all arguments through +python train_xgboost.py "$@" + +echo "Training complete. 
Output files:" +ls -la diff --git a/submit_files/train_nn_cpu.sub b/submit_files/train_nn_cpu.sub new file mode 100644 index 00000000..2b6a048d --- /dev/null +++ b/submit_files/train_nn_cpu.sub @@ -0,0 +1,26 @@ +# HTCondor submit file for PyTorch CPU training on CHTC +# Episode 05: Training Models on CHTC (CPU fallback) + +universe = docker +docker_image = pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime + +executable = run_nn.sh +arguments = --train train_data.npz --val val_data.npz --epochs 500 --learning_rate 0.001 --patience 50 + +# Transfer input files to the execute node +transfer_input_files = ../scripts/train_nn.py, run_nn.sh, train_data.npz, val_data.npz + +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +# Output, error, and log files +log = logs/nn_cpu_$(Cluster).log +output = logs/nn_cpu_$(Cluster).out +error = logs/nn_cpu_$(Cluster).err + +# Resource requests (CPU only — no GPU) +request_cpus = 1 +request_memory = 4GB +request_disk = 2GB + +queue 1 diff --git a/submit_files/train_nn_gpu.sub b/submit_files/train_nn_gpu.sub new file mode 100644 index 00000000..110bda58 --- /dev/null +++ b/submit_files/train_nn_gpu.sub @@ -0,0 +1,32 @@ +# HTCondor submit file for PyTorch GPU training on CHTC +# Episode 05: Training Models on CHTC (GPUs) + +universe = docker +docker_image = pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime + +executable = run_nn.sh +arguments = --train train_data.npz --val val_data.npz --epochs 500 --learning_rate 0.001 --patience 50 + +# Transfer input files to the execute node +transfer_input_files = ../scripts/train_nn.py, run_nn.sh, train_data.npz, val_data.npz + +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +# Output, error, and log files +log = logs/nn_gpu_$(Cluster).log +output = logs/nn_gpu_$(Cluster).out +error = logs/nn_gpu_$(Cluster).err + +# Resource requests +request_cpus = 1 +request_gpus = 1 +require_gpus = (GlobalMemoryMb >= 15000) +request_memory = 8GB +request_disk = 4GB + +# GPU Lab settings 
++WantGPULab = true ++GPUJobLength = "short" + +queue 1 diff --git a/submit_files/train_xgboost.sub b/submit_files/train_xgboost.sub new file mode 100644 index 00000000..c6fe8954 --- /dev/null +++ b/submit_files/train_xgboost.sub @@ -0,0 +1,26 @@ +# HTCondor submit file for XGBoost training on CHTC +# Episode 04: Training Models on CHTC + +universe = docker +docker_image = python:3.10-slim + +executable = run_xgboost.sh +arguments = --train titanic_train.csv --max_depth 3 --eta 0.1 --subsample 0.8 --colsample_bytree 0.8 --num_round 100 + +# Transfer input files to the execute node +transfer_input_files = ../scripts/train_xgboost.py, ../data/titanic_train.csv, run_xgboost.sh, requirements_xgboost.txt + +should_transfer_files = YES +when_to_transfer_output = ON_EXIT + +# Output, error, and log files +log = logs/xgb_$(Cluster).log +output = logs/xgb_$(Cluster).out +error = logs/xgb_$(Cluster).err + +# Resource requests — start small, scale up based on actual usage +request_cpus = 1 +request_memory = 2GB +request_disk = 1GB + +queue 1 diff --git a/submit_files/workflow.dag b/submit_files/workflow.dag new file mode 100644 index 00000000..28b62bc4 --- /dev/null +++ b/submit_files/workflow.dag @@ -0,0 +1,16 @@ +# DAGMan workflow: preprocess → train → evaluate +# Episode 08: Advanced HTCondor Workflows +# +# Submit with: condor_submit_dag workflow.dag +# Monitor with: condor_q -dag + +JOB preprocess preprocess.sub +JOB train train_xgboost.sub +JOB evaluate evaluate.sub + +# Define dependencies +PARENT preprocess CHILD train +PARENT train CHILD evaluate + +# Retry training up to 2 times if it fails +RETRY train 2