Skip to content

Commit ebf5224

Browse files
pseudo-rnd-thoughts, Mark Towers
authored and committed
[Release / Train] Fix Train tutorials (ray-project#63225)
## Description The tutorials under `doc/source/train/tutorials` were failing with `ImportError: Dataset requires pyarrow >= 17.0.0, but 14.0.2 is installed. Reinstall with 'pip install -U "pyarrow"'.` This PR fixes the tutorials by updating every pinned `pyarrow` version to 17.0.0 and, at the same time, removing unnecessary import statements. --------- Signed-off-by: Mark Towers <mark@anyscale.com> Co-authored-by: Mark Towers <mark@anyscale.com>
1 parent 414ab4f commit ebf5224

8 files changed

Lines changed: 24 additions & 57 deletions

doc/source/train/tutorials/ci/py_scripts/01_02_03_intro_to_ray_train.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,15 +17,13 @@
1717
"torch==2.8.0",
1818
"torchvision==0.23.0",
1919
"matplotlib==3.10.6",
20-
"pyarrow==14.0.2",
20+
"pyarrow==17.0.0",
2121
]
2222
)
2323

2424
# 01. Imports
2525

2626
# --- Standard library: file IO, paths, timestamps, temp dirs, cleanup ---
27-
import csv # Simple CSV logging for metrics in single-GPU section
28-
import datetime # Timestamps for run directories / filenames
2927
import os # Filesystem utilities (paths, env vars)
3028
import tempfile # Ephemeral dirs for checkpoint staging with ray.train.report()
3129
import shutil # Cleanup of artifacts (later cells)

doc/source/train/tutorials/ci/py_scripts/04a_vision_pattern.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
"torch==2.8.0",
1313
"torchvision==0.23.0",
1414
"matplotlib==3.10.6",
15-
"pyarrow==14.0.2",
15+
"pyarrow==17.0.0",
1616
"datasets==2.19.2",
1717
])
1818

@@ -23,15 +23,12 @@
2323
# ————————————————————————
2424
import os
2525
import io
26-
import tempfile
2726
import shutil # file I/O and temp dirs
28-
import json # reading/writing configs
2927
import random, uuid # randomness and unique IDs
3028

3129
# ————————————————————————
3230
# Core Data & Storage Libraries
3331
# ————————————————————————
34-
import pandas as pd # tabular data handling
3532
import numpy as np # numerical ops
3633
import pyarrow as pa # in-memory columnar format
3734
import pyarrow.parquet as pq # reading/writing Parquet files

doc/source/train/tutorials/ci/py_scripts/04b_tabular_workload_pattern.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -11,18 +11,14 @@
1111
sys.executable, "-m", "pip", "install", "--no-cache-dir",
1212
"matplotlib==3.10.6",
1313
"scikit-learn==1.7.2",
14-
"pyarrow==14.0.2",
14+
"pyarrow==17.0.0",
1515
"xgboost-cpu==3.0.5",
1616
"seaborn==0.13.2",
1717
])
1818

1919
# 01. Imports
2020
import os
2121
import shutil
22-
import json
23-
import uuid
24-
import tempfile
25-
import random
2622
import numpy as np
2723
import pandas as pd
2824
import matplotlib.pyplot as plt
@@ -36,7 +32,10 @@
3632
import ray
3733
import ray.data as rd
3834
from ray.data import ActorPoolStrategy
39-
from ray.train import RunConfig, ScalingConfig, CheckpointConfig, FailureConfig, get_dataset_shard, get_checkpoint, get_context
35+
from ray.train import (
36+
RunConfig, ScalingConfig, CheckpointConfig, FailureConfig,
37+
get_dataset_shard, get_checkpoint, get_context
38+
)
4039
from ray.train.xgboost import XGBoostTrainer, RayTrainReportCallback
4140

4241
# 02. Load the UCI Cover type dataset (~580k rows, 54 features)

doc/source/train/tutorials/ci/py_scripts/04c_time_series_workload_pattern.py

Lines changed: 3 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -11,22 +11,14 @@
1111
sys.executable, "-m", "pip", "install", "--no-cache-dir",
1212
"torch==2.8.0",
1313
"matplotlib==3.10.6",
14-
"pyarrow==14.0.2",
14+
"pyarrow==17.0.0",
1515
"datasets==2.19.2",
1616
])
1717

1818
# 01. Imports
1919
import os
20-
import io
2120
import math
22-
import uuid
2321
import shutil
24-
import random
25-
import requests
26-
import sys
27-
from pathlib import Path
28-
from datetime import datetime, timedelta
29-
from datasets import load_dataset
3022

3123
import numpy as np
3224
import pandas as pd
@@ -39,14 +31,12 @@
3931
from torch.utils.data import Dataset, DataLoader
4032
import torch.optim as optim
4133

42-
import ray
4334
import ray.data as rdata
44-
import ray.train as train
4535
from ray.train import (
4636
ScalingConfig, RunConfig, FailureConfig,
47-
CheckpointConfig, Checkpoint, get_checkpoint, get_context
37+
CheckpointConfig, Checkpoint
4838
)
49-
from ray.train.torch import prepare_model, prepare_data_loader, TorchTrainer
39+
from ray.train.torch import prepare_model, TorchTrainer
5040

5141
# 02. Load NYC taxi passenger counts (30-min) from GitHub raw – no auth, ~1 MB
5242

doc/source/train/tutorials/ci/py_scripts/04d1_generative_cv_pattern.py

Lines changed: 3 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# 00. Runtime setup
22
import os
3-
import sys
43
import subprocess
4+
import sys
55

66
# Non-secret env var (safe to set here)
77
os.environ["RAY_TRAIN_V2_ENABLED"] = "1"
@@ -12,29 +12,25 @@
1212
"torch==2.8.0",
1313
"torchvision==0.23.0",
1414
"matplotlib==3.10.6",
15-
"pyarrow==14.0.2",
15+
"pyarrow==17.0.0",
1616
"datasets==2.19.2",
1717
"lightning==2.5.5",
1818
])
1919

2020
# 01. Imports
2121

2222
# Standard libraries
23-
import os
2423
import io
25-
import json
2624
import shutil
2725
import tempfile
2826
import numpy as np
2927
import matplotlib.pyplot as plt
30-
import pandas as pd
3128
from PIL import Image
3229

3330
# Ray
3431
import ray, ray.data
35-
from ray.train import ScalingConfig, get_context, RunConfig, FailureConfig, CheckpointConfig, Checkpoint, get_checkpoint
32+
from ray.train import ScalingConfig, RunConfig, FailureConfig, CheckpointConfig
3633
from ray.train.torch import TorchTrainer
37-
from ray.train.lightning import RayLightningEnvironment
3834

3935
# PyTorch / Lightning
4036
import lightning.pytorch as pl
@@ -43,11 +39,7 @@
4339

4440
# Dataset
4541
from datasets import load_dataset
46-
import pyarrow as pa
47-
import pyarrow.parquet as pq
48-
from tqdm import tqdm
4942
from torchvision.transforms import Compose, Resize, CenterCrop
50-
import random
5143

5244
# 02. Load 10% of food101 (~7,500 images)
5345
hf_ds = load_dataset("ethz/food101", split="train[:10%]")
@@ -237,7 +229,6 @@ def train_loop(config):
237229
message="barrier.*using the device under current context",
238230
)
239231
import os
240-
import torch
241232
import lightning.pytorch as pl
242233
from ray.train import get_checkpoint, get_context
243234
from ray.train.lightning import (
@@ -372,7 +363,6 @@ def sample_image(model, steps=50, device="cpu"):
372363
# 14. Generate and display samples
373364

374365
import glob
375-
from ray.train import Checkpoint
376366

377367
assert best_ckpt is not None, "Checkpoint is missing. Did training run and complete?"
378368

doc/source/train/tutorials/ci/py_scripts/04d2_policy_learning_pattern.py

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# 00. Runtime setup
22
import os
3-
import sys
43
import subprocess
4+
import sys
55

66
# Non-secret env var
77
os.environ["RAY_TRAIN_V2_ENABLED"] = "1"
@@ -12,7 +12,7 @@
1212
"torch==2.8.0",
1313
"matplotlib==3.10.6",
1414
"lightning==2.5.5",
15-
"pyarrow==14.0.2",
15+
"pyarrow==17.0.0",
1616
])
1717

1818
# 01. Imports
@@ -21,18 +21,14 @@
2121
import os
2222
import shutil
2323
import glob
24-
import json
25-
import uuid
2624
import numpy as np
2725
import matplotlib.pyplot as plt
28-
import pandas as pd
2926
import gymnasium as gym
3027

3128
# Ray libraries for distributed data and training
3229
import ray
3330
import ray.data
34-
from ray.train.lightning import RayLightningEnvironment
35-
from ray.train import ScalingConfig, RunConfig, FailureConfig, CheckpointConfig, get_context, get_checkpoint, report, Checkpoint
31+
from ray.train import ScalingConfig, RunConfig, FailureConfig, CheckpointConfig
3632
from ray.train.torch import TorchTrainer
3733

3834
# PyTorch Lightning and base PyTorch for model definition and training
@@ -148,7 +144,7 @@ def configure_optimizers(self):
148144
# 05. Ray Train Lightning-native training loop
149145

150146
def train_loop(config):
151-
import os, tempfile, torch, warnings
147+
import os, tempfile, warnings
152148
import lightning.pytorch as pl
153149
from ray.train import get_checkpoint, get_context
154150
from ray.train.lightning import (

doc/source/train/tutorials/ci/py_scripts/04e_rec_sys_workload_pattern.py

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# 00. Runtime setup
22
import os
3-
import sys
43
import subprocess
4+
import sys
55

66
# Non-secret env var
77
os.environ["RAY_TRAIN_V2_ENABLED"] = "1"
@@ -11,35 +11,32 @@
1111
sys.executable, "-m", "pip", "install", "--no-cache-dir",
1212
"torch==2.8.0",
1313
"matplotlib==3.10.6",
14-
"pyarrow==14.0.2",
14+
"pyarrow==17.0.0",
1515
])
1616

1717
# 01. Imports
1818

1919
# Standard libraries
2020
import os
21-
import uuid
22-
import json
2321
import pandas as pd
2422
import numpy as np
2523
import matplotlib.pyplot as plt
26-
import zipfile
2724
import shutil
28-
import tempfile
2925

3026
# PyTorch
3127
import torch
3228
from torch import nn
3329
import torch.nn.functional as F
3430

3531
# Ray
36-
import ray
3732
import ray.data
38-
from ray.train import ScalingConfig, RunConfig, CheckpointConfig, FailureConfig, Checkpoint, get_checkpoint, get_context, get_dataset_shard, report
33+
from ray.train import (
34+
ScalingConfig, RunConfig, CheckpointConfig, FailureConfig,
35+
Checkpoint, get_checkpoint, get_context, get_dataset_shard, report
36+
)
3937
from ray.train.torch import TorchTrainer, prepare_model
4038

4139
# Other
42-
from tqdm import tqdm
4340

4441
import subprocess
4542
# 02. Load MovieLens 100K Dataset and store in /mnt/cluster_storage/ as CSV + Parquet

release/ray_release/byod/byod_ray_train_workloads.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ pip3 install --no-cache-dir \
1010
"torch==2.8.0" \
1111
"torchvision==0.23.0" \
1212
"matplotlib==3.10.6" \
13-
"pyarrow==14.0.2" \
13+
"pyarrow==17.0.0" \
1414
"datasets==2.19.2" \
1515
"lightning==2.5.5" \
1616
"scikit-learn==1.7.2" \

0 commit comments

Comments
 (0)