Skip to content

Commit 2910da9

Browse files
committed
Initial commit
1 parent 49d0821 commit 2910da9

14 files changed

Lines changed: 1144 additions & 0 deletions

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
/.vscode/
2+
**/__pycache__/
3+
/output/*
4+
!output/
5+
6+
*.o

README.md

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
2+
# cmdstan-cuda-example
3+
4+
5+
## About this repository
6+
7+
A simple example of offloading computations in a Stan model to the GPU. Uses the beta negative binomial distribution and its gradient as an example.
8+
9+
10+
## Preliminaries
11+
12+
This repository is written for CmdStan version 2.35.0. For python users, here's install script. Installation scripts via other interfaces are similar to this one.
13+
14+
```python
15+
from cmdstanpy import cmdstan_path, set_cmdstan_path, install_cmdstan
16+
install_cmdstan(version="2.35.0", verbose=True, progress=True, cores=4)
17+
set_cmdstan_path(os.path.join(os.path.expanduser('~'), '.cmdstan', 'cmdstan-2.35.0'))
18+
cmdstan_path()
19+
```
20+
21+
Ensure that NVIDIA GPU drivers are correctly installed.
22+
23+
```sh
24+
nvidia-smi
25+
```
26+
27+
Ensure that CUDA Toolkit are correctly installed and is accessible via PATH
28+
29+
```sh
30+
nvcc --version
31+
echo $CUDA_HOME
32+
```
33+
34+
35+
## Quick start
36+
37+
Clone this repository, rename it to `cmdstan-cuda`, and move it to the root directory of CmdStan.
38+
In most cases, this will be located at: `$HOME/.cmdstan/cmdstan-2.35.0`.
39+
40+
```sh
41+
git clone https://github.com/MLGlobalHealth/cmdstan-cuda-example.git cmdstan-cuda
42+
CMDSTAN_HOME="$HOME/.cmdstan/cmdstan-2.35.0"
43+
mv cmdstan-cuda "$CMDSTAN_HOME/"
44+
```
45+
46+
Execute `main.py`, and go to the `output` and `model` folders to have a look.
47+
48+
The `output` folder should contain a comparison histogram.
49+
50+
The `model` folder should contain the saved results, logs, and diagnostics.
51+
52+
## Profiling
53+
54+
To find the critical point on your current machine where CPU and GPU performance are equal, run:
55+
56+
```sh
57+
python prof/prof.py
58+
```
59+
60+
Then go to the `output` folder to check. It should contain CSV output of the runtime test and comparative plots.
61+
62+
To view the details and timeline of CUDA API calls by CmdStan model, run:
63+
64+
```sh
65+
nsys profile --stats=true --force-overwrite=true -o report ./prof_cu_bnb.sh
66+
```
67+
68+
Could use NVIDIA Nsight Systems provided to open the generated `report.nsys-rep` file.
69+
70+
71+
72+
73+
74+

main.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import os
2+
import subprocess
3+
import numpy as np
4+
import pandas as pd
5+
import arviz as az
6+
import plotly.express as px
7+
from pathlib import Path
8+
from datetime import datetime
9+
from cmdstanpy import write_stan_json
10+
11+
# ===============================
12+
# config
13+
# ===============================
14+
15+
args = {}
16+
args['seed'] = 1234
17+
args['main'] = Path(__file__)
18+
args['cwd'] = args['main'].parent
19+
args['src'] = args['cwd'] / 'src'
20+
args['hpp'] = args['src'] / 'headers.hpp'
21+
args['cmdstan'] = Path(os.path.expanduser("~")) / '.cmdstan' / 'cmdstan-2.35.0'
22+
args['stan_dir'] = args['cwd'] / 'model'
23+
args['stanc_args'] = {"include-paths": [str(args['src'])]}
24+
args['cpp_options'] = {"LDFLAGS": f"{str(args['src'])}/bnb/bnb_lpmf.o -L{os.environ['CUDA_HOME']}/lib64 -lcudart"}
25+
26+
import sys
27+
sys.path.append(str(args['cwd']))
28+
from utils import stan_model, beta_neg_binomial_rng
29+
30+
data = beta_neg_binomial_rng(r=6, alpha=4, beta=0.5, y_max=500, size=10000, seed=args['seed'])
31+
stan_data_0 = {
32+
'N': len(data),
33+
'y': data,
34+
}
35+
write_stan_json(args['cwd'] / 'output/data.json', stan_data_0)
36+
37+
# ===============================
38+
# execution
39+
# ===============================
40+
41+
command = ["nvcc", "-O3", "-lineinfo", "-c", str(args['src'])+"/bnb/bnb_lpmf.cu", "-o", str(args['src'])+"/bnb/bnb_lpmf.o"]
42+
result = subprocess.run(command, capture_output=True, text=True)
43+
print("stdout:", result.stdout)
44+
print("stderr:", result.stderr)
45+
46+
47+
model = stan_model(args['stan_dir'] / 'model_bnb.stan')
48+
model.compile(user_header=args['hpp'],
49+
stanc_options=args['stanc_args'],
50+
# cpp_options=args['cpp_options']
51+
)
52+
model.sample(stan_data_0,
53+
**{"chains": 4, "iter_warmup": 1000, "iter_sampling": 1000,
54+
"show_console": False, "seed": args['seed'], "refresh": 20,})
55+
model.diagnose()
56+
model.save()
57+
58+
59+
model_cu = stan_model(args['stan_dir'] / 'model_cu_bnb.stan')
60+
model_cu.compile(user_header=args['hpp'],
61+
stanc_options=args['stanc_args'],
62+
cpp_options=args['cpp_options'])
63+
model_cu.sample(stan_data_0,
64+
**{"chains": 4, "iter_warmup": 1000, "iter_sampling": 1000,
65+
"show_console": False, "seed": args['seed'], "refresh": 20,})
66+
model_cu.diagnose()
67+
model_cu.save()
68+
69+
# ===============================
70+
# comparison
71+
# ===============================
72+
params = ['r', 'a', 'b']
73+
df_list = []
74+
75+
for param in params:
76+
param_cpu = model.fit.stan_variable(param).flatten()
77+
param_gpu = model_cu.fit.stan_variable(param).flatten()
78+
79+
df_param = pd.DataFrame({
80+
'value': list(param_cpu) + list(param_gpu),
81+
'type': ['CPU'] * len(param_cpu) + ['GPU'] * len(param_gpu),
82+
'parameter': [param] * (len(param_cpu) + len(param_gpu))
83+
})
84+
85+
df_list.append(df_param)
86+
87+
df_all_params = pd.concat(df_list, ignore_index=True)
88+
89+
fig = px.histogram(df_all_params, x='value', color='type', facet_col='parameter',
90+
barmode='group', opacity=0.7,
91+
labels={'value': 'Posterior Value', 'type': 'Model Type', 'parameter': 'Parameter'},
92+
title='Comparison of Posterior Distributions for CPU and GPU Models')
93+
94+
fig.write_image(args['cwd'] / 'output/posterior_comparison.pdf')
95+
96+

model/model_bnb.stan

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
functions {
2+
#include headers.stan
3+
}
4+
data {
5+
int<lower=0> N;
6+
array[N] int<lower=0> y;
7+
}
8+
transformed data {
9+
10+
}
11+
parameters {
12+
real<lower=0> r;
13+
real<lower=0> a;
14+
real<lower=0> b;
15+
}
16+
model {
17+
r ~ normal(0, 1);
18+
a ~ normal(0, 1);
19+
b ~ normal(0, 1);
20+
target += beta_neg_binomial_lpmf(y | r, a, b);
21+
}
22+
generated quantities {
23+
vector[N] log_lik;
24+
for (i in 1:N) {
25+
log_lik[i] = beta_neg_binomial_lpmf(y[i] | r, a, b);
26+
}
27+
}
28+

model/model_cu_bnb.stan

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
functions {
2+
#include headers.stan
3+
}
4+
data {
5+
int<lower=0> N;
6+
array[N] int<lower=0> y;
7+
}
8+
transformed data {
9+
10+
}
11+
parameters {
12+
real<lower=0> r;
13+
real<lower=0> a;
14+
real<lower=0> b;
15+
}
16+
model {
17+
r ~ normal(0, 1);
18+
a ~ normal(0, 1);
19+
b ~ normal(0, 1);
20+
target += cu_beta_neg_binomial_lpmf(y | r, a, b);
21+
}
22+
generated quantities {
23+
vector[N] log_lik;
24+
for (i in 1:N) {
25+
log_lik[i] = beta_neg_binomial_lpmf(y[i] | r, a, b);
26+
}
27+
}
28+

prof/prof.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import os
2+
import time
3+
import subprocess
4+
from pathlib import Path
5+
import numpy as np
6+
import pandas as pd
7+
import plotly.graph_objects as go
8+
from plotly.subplots import make_subplots
9+
10+
# ===============================
11+
# config
12+
# ===============================
13+
14+
args = {}
15+
args['seed'] = 1234
16+
args['main'] = Path(__file__)
17+
args['cwd'] = args['main'].parent.parent
18+
args['src'] = args['cwd'] / 'src'
19+
args['hpp'] = args['src'] / 'headers.hpp'
20+
args['cmdstan'] = Path(os.path.expanduser("~")) / '.cmdstan' / 'cmdstan-2.35.0'
21+
args['stanc_args'] = {"include-paths": [str(args['src'])]}
22+
args['cpp_options'] = {"LDFLAGS": f"{str(args['src'])}/bnb/bnb_lpmf.o -L{os.environ['CUDA_HOME']}/lib64 -lcudart"}
23+
args['stan_dir'] = args['cwd'] / 'model'
24+
25+
import sys
26+
sys.path.append(str(args['cwd']))
27+
from utils import stan_model, beta_neg_binomial_rng
28+
29+
30+
# ===============================
31+
# execution
32+
# ===============================
33+
34+
command = ["nvcc", "-O3", "-lineinfo", "-c", str(args['src'])+"/bnb/bnb_lpmf.cu", "-o", str(args['src'])+"/bnb/bnb_lpmf.o"]
35+
result = subprocess.run(command, capture_output=True, text=True)
36+
print("stdout:", result.stdout)
37+
print("stderr:", result.stderr)
38+
39+
data_sizes = [4000, 6000, 8000, 10000, 12000, 14000, 18000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000]
40+
results = []
41+
42+
def run_and_measure_time(model, data_size):
43+
data = beta_neg_binomial_rng(r=6, alpha=4, beta=0.5, y_max=500, size=data_size, seed=args['seed'])
44+
stan_data = {
45+
'N': len(data),
46+
'y': data,
47+
}
48+
start_time = time.time()
49+
model.sample(stan_data,
50+
**{"chains": 4, "iter_warmup": 500, "iter_sampling": 500,
51+
"show_console": False, "seed": args['seed'], "refresh": 20})
52+
elapsed_time = time.time() - start_time
53+
return elapsed_time
54+
55+
cpu_model = stan_model(args['stan_dir'] / 'model_bnb.stan')
56+
cpu_model.compile(user_header=args['hpp'],
57+
stanc_options=args['stanc_args'])
58+
gpu_model = stan_model(args['stan_dir'] / 'model_cu_bnb.stan')
59+
gpu_model.compile(user_header=args['hpp'],
60+
stanc_options=args['stanc_args'],
61+
cpp_options=args['cpp_options'])
62+
63+
for size in data_sizes:
64+
# Test CPU version
65+
cpu_time = run_and_measure_time(cpu_model, size)
66+
results.append({'Data Size': size, 'Model Type': 'CPU', 'Time (seconds)': cpu_time})
67+
# Test GPU version
68+
gpu_time = run_and_measure_time(gpu_model, size)
69+
results.append({'Data Size': size, 'Model Type': 'GPU', 'Time (seconds)': gpu_time})
70+
71+
df = pd.DataFrame(results)
72+
df.to_csv(args['cwd'] / 'output' / 'model_performance.csv', index=False)
73+
74+
75+
76+
# ===============================
77+
# comparison
78+
# ===============================
79+
80+
df_cpu = df[df['Model Type'] == 'CPU'].set_index('Data Size')
81+
df_gpu = df[df['Model Type'] == 'GPU'].set_index('Data Size')
82+
df_ratio = (df_cpu['Time (seconds)'] / df_gpu['Time (seconds)']).reset_index()
83+
df_ratio.columns = ['Data Size', 'CPU/GPU Ratio']
84+
85+
fig = make_subplots(rows=1, cols=2,
86+
horizontal_spacing=0.05,
87+
vertical_spacing=0.03,
88+
subplot_titles=('Time (seconds)', 'CPU/GPU Time Ratio')
89+
)
90+
91+
fig.add_trace(
92+
go.Scatter(x=df[df['Model Type'] == 'CPU']['Data Size'], y=df[df['Model Type'] == 'CPU']['Time (seconds)'], mode='lines', name='CPU'),
93+
row=1, col=1
94+
)
95+
fig.add_trace(
96+
go.Scatter(x=df[df['Model Type'] == 'CPU']['Data Size'], y=df[df['Model Type'] == 'GPU']['Time (seconds)'], mode='lines', name='GPU'),
97+
row=1, col=1
98+
)
99+
100+
fig.add_trace(
101+
go.Scatter(x=df_ratio['Data Size'], y=df_ratio['CPU/GPU Ratio'], mode='lines', name='CPU/GPU'),
102+
row=1, col=2
103+
)
104+
105+
# fig.update_yaxes(title_text='Time (seconds)', row=1, col=1)
106+
# fig.update_yaxes(title_text='CPU/GPU Ratio', row=1, col=2)
107+
fig.update_xaxes(title_text='Data Size', row=1, col=1)
108+
fig.update_xaxes(title_text='Data Size', row=1, col=2)
109+
110+
fig.update_yaxes(zeroline=False,)
111+
fig.update_xaxes(dtick="20000")
112+
fig.update_layout(title_text='',
113+
showlegend=True,
114+
height=500,
115+
width=1200)
116+
fig.write_image(args['cwd'] / 'output' / 'model_performance.pdf')
117+
118+
119+

prof/prof_cu_bnb.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
2+
CMDSTAN_HOME="$HOME/.cmdstan/cmdstan-2.35.0"
3+
$CMDSTAN_HOME/cmdstan-cuda/model/model_cu_bnb \
4+
random seed=1234 \
5+
data file=$CMDSTAN_HOME/cmdstan-cuda/output/data.json \
6+
output file=$CMDSTAN_HOME/cmdstan-cuda/output/model_cu_bnb-$(date +"%Y%m%d%H%M%S").csv \
7+
refresh=20 \
8+
method=sample \
9+
num_samples=100 \
10+
num_warmup=100 \
11+
algorithm=hmc \
12+
adapt engaged=1

0 commit comments

Comments
 (0)