Skip to content

Commit 97500ac

Browse files
feat(plotnine): implement survival-kaplan-meier (#2453)
## Implementation: `survival-kaplan-meier` - plotnine Implements the **plotnine** version of `survival-kaplan-meier`. **File:** `plots/survival-kaplan-meier/implementations/plotnine.py` --- :robot: *[impl-generate workflow](https://github.com/MarkusNeusinger/pyplots/actions/runs/20584329185)* --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 8ce3a74 commit 97500ac

2 files changed

Lines changed: 202 additions & 0 deletions

File tree

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
""" pyplots.ai
2+
survival-kaplan-meier: Kaplan-Meier Survival Plot
3+
Library: plotnine 0.15.2 | Python 3.13.11
4+
Quality: 91/100 | Created: 2025-12-29
5+
"""
6+
7+
import numpy as np
8+
import pandas as pd
9+
from plotnine import (
10+
aes,
11+
element_line,
12+
element_text,
13+
geom_point,
14+
geom_ribbon,
15+
geom_step,
16+
ggplot,
17+
labs,
18+
scale_color_manual,
19+
scale_fill_manual,
20+
scale_x_continuous,
21+
scale_y_continuous,
22+
theme,
23+
theme_minimal,
24+
)
25+
26+
27+
# Seed for reproducibility
28+
np.random.seed(42)
29+
30+
31+
# Generate survival data for two treatment groups
32+
def generate_survival_data(n, hazard_rate, group_name):
33+
"""Generate time-to-event data with censoring."""
34+
# Exponential distribution for survival times
35+
times = np.random.exponential(1 / hazard_rate, n)
36+
# Random censoring (30% of observations)
37+
censor_times = np.random.uniform(0, np.percentile(times, 80), n)
38+
censored = times > censor_times
39+
observed_times = np.where(censored, censor_times, times)
40+
events = (~censored).astype(int)
41+
return pd.DataFrame({"time": observed_times, "event": events, "group": group_name})
42+
43+
44+
# Create two groups with different hazard rates
45+
group_a = generate_survival_data(80, hazard_rate=0.02, group_name="Treatment A")
46+
group_b = generate_survival_data(80, hazard_rate=0.035, group_name="Treatment B")
47+
data = pd.concat([group_a, group_b], ignore_index=True)
48+
49+
50+
# Compute Kaplan-Meier survival estimates
51+
def kaplan_meier(df):
52+
"""Calculate Kaplan-Meier survival estimates with confidence intervals."""
53+
df = df.sort_values("time").reset_index(drop=True)
54+
n = len(df)
55+
times = [0]
56+
survival = [1.0]
57+
ci_lower = [1.0]
58+
ci_upper = [1.0]
59+
var_sum = 0 # For Greenwood's formula
60+
61+
at_risk = n
62+
current_survival = 1.0
63+
64+
# Process each unique event time
65+
unique_times = df[df["event"] == 1]["time"].unique()
66+
unique_times.sort()
67+
68+
for t in unique_times:
69+
# Number at risk just before time t
70+
at_risk = (df["time"] >= t).sum()
71+
# Number of events at time t
72+
events = ((df["time"] == t) & (df["event"] == 1)).sum()
73+
74+
if at_risk > 0:
75+
# Survival probability at this step
76+
current_survival *= (at_risk - events) / at_risk
77+
# Greenwood's formula for variance
78+
if at_risk > events:
79+
var_sum += events / (at_risk * (at_risk - events))
80+
81+
times.append(t)
82+
survival.append(current_survival)
83+
84+
# 95% confidence interval (log transformation)
85+
if current_survival > 0 and var_sum > 0:
86+
se = current_survival * np.sqrt(var_sum)
87+
z = 1.96
88+
log_surv = np.log(current_survival)
89+
log_se = se / current_survival
90+
ci_lower.append(np.exp(log_surv - z * log_se))
91+
ci_upper.append(np.exp(log_surv + z * log_se))
92+
else:
93+
ci_lower.append(current_survival)
94+
ci_upper.append(current_survival)
95+
96+
# Extend to max time
97+
max_time = df["time"].max()
98+
times.append(max_time)
99+
survival.append(survival[-1])
100+
ci_lower.append(ci_lower[-1])
101+
ci_upper.append(ci_upper[-1])
102+
103+
return pd.DataFrame(
104+
{"time": times, "survival": survival, "ci_lower": np.clip(ci_lower, 0, 1), "ci_upper": np.clip(ci_upper, 0, 1)}
105+
)
106+
107+
108+
# Calculate KM estimates for each group
109+
km_a = kaplan_meier(data[data["group"] == "Treatment A"])
110+
km_a["group"] = "Treatment A"
111+
112+
km_b = kaplan_meier(data[data["group"] == "Treatment B"])
113+
km_b["group"] = "Treatment B"
114+
115+
km_data = pd.concat([km_a, km_b], ignore_index=True)
116+
117+
# Get censored observations for tick marks
118+
censored = data[data["event"] == 0].copy()
119+
# Add survival probability at censoring time for each censored observation
120+
censored_marks = []
121+
for _, row in censored.iterrows():
122+
group = row["group"]
123+
t = row["time"]
124+
km_group = km_a if group == "Treatment A" else km_b
125+
# Find survival at this time
126+
surv = km_group[km_group["time"] <= t]["survival"].iloc[-1]
127+
censored_marks.append({"time": t, "survival": surv, "group": group})
128+
129+
censored_df = pd.DataFrame(censored_marks)
130+
131+
# Define colors (Python Blue and a complementary color)
132+
colors = {"Treatment A": "#306998", "Treatment B": "#FFD43B"}
133+
134+
# Create the plot
135+
plot = (
136+
ggplot()
137+
# Confidence interval ribbons
138+
+ geom_ribbon(km_data, aes(x="time", ymin="ci_lower", ymax="ci_upper", fill="group"), alpha=0.2)
139+
# Survival step curves
140+
+ geom_step(km_data, aes(x="time", y="survival", color="group"), size=1.5)
141+
# Censored observation marks (vertical ticks)
142+
+ geom_point(censored_df, aes(x="time", y="survival", color="group"), shape="|", size=4, stroke=1.5)
143+
# Colors
144+
+ scale_color_manual(values=colors)
145+
+ scale_fill_manual(values=colors)
146+
# Axis scales
147+
+ scale_y_continuous(limits=(0, 1.05), breaks=[0, 0.25, 0.5, 0.75, 1.0], labels=["0%", "25%", "50%", "75%", "100%"])
148+
+ scale_x_continuous(limits=(0, None))
149+
# Labels
150+
+ labs(
151+
title="survival-kaplan-meier · plotnine · pyplots.ai",
152+
x="Time (months)",
153+
y="Survival Probability",
154+
color="Treatment Group",
155+
fill="Treatment Group",
156+
)
157+
# Theme
158+
+ theme_minimal()
159+
+ theme(
160+
figure_size=(16, 9),
161+
text=element_text(size=14),
162+
axis_title=element_text(size=20),
163+
axis_text=element_text(size=16),
164+
plot_title=element_text(size=24),
165+
legend_title=element_text(size=18),
166+
legend_text=element_text(size=16),
167+
legend_position=(0.85, 0.85),
168+
panel_grid_minor=element_line(alpha=0.2),
169+
panel_grid_major=element_line(alpha=0.3),
170+
)
171+
)
172+
173+
# Save
174+
plot.save("plot.png", dpi=300, verbose=False)
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
library: plotnine
2+
specification_id: survival-kaplan-meier
3+
created: '2025-12-29T22:47:28Z'
4+
updated: '2025-12-29T22:50:20Z'
5+
generated_by: claude-opus-4-5-20251101
6+
workflow_run: 20584329185
7+
issue: 0
8+
python_version: 3.13.11
9+
library_version: 0.15.2
10+
preview_url: https://storage.googleapis.com/pyplots-images/plots/survival-kaplan-meier/plotnine/plot.png
11+
preview_thumb: https://storage.googleapis.com/pyplots-images/plots/survival-kaplan-meier/plotnine/plot_thumb.png
12+
preview_html: null
13+
quality_score: 91
14+
review:
15+
strengths:
16+
- Excellent implementation of Kaplan-Meier survival curves using plotnine native
17+
grammar of graphics
18+
- 'All spec requirements met: step functions, confidence intervals, censored marks,
19+
group comparison'
20+
- Custom KM estimator with proper Greenwood formula for variance estimation
21+
- Strong colorblind-accessible color contrast between treatment groups (blue vs
22+
yellow)
23+
- Professional appearance with appropriate text sizing for 16:9 format
24+
- Proper handling of censored observations displayed as tick marks on curves
25+
weaknesses:
26+
- Code uses helper functions instead of flat KISS structure (imports → data → plot
27+
→ save)
28+
- Legend position overlaps with data region

0 commit comments

Comments
 (0)