Skip to content

Commit d0b8bde

Browse files
committed
server: Add RollupPlanTrigger for auto query plan capture
To enable add: - name: AUTOPLAN_ROLLUP_ENABLED value: "true" Rule 1: never-captured (every M10 rollup) Rule 2: stale plan (every M60 rollup) Plan-capable metrics that had activity in the last hour and whose most recent captured plan is older than {@code autoplan.rollup.stalePlanDays} Rule 3: regression (every M60 rollup) mean execution time over the last hour has increased by at least 1.5 times compared to their mean over the prior days
1 parent b9f87f3 commit d0b8bde

6 files changed

Lines changed: 535 additions & 2 deletions

File tree

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package org.ebean.monitor.rollup;
2+
3+
/**
4+
* A plan-capable metric whose recent mean execution time has regressed
5+
* significantly compared to its historical baseline.
6+
*
7+
* @param app application name
8+
* @param key metric hash / key
9+
* @param label human-readable label (coalesce(tags->>'label', name))
10+
* @param recentMeanMicros mean execution time over the recent window (microseconds)
11+
* @param baselineMeanMicros mean execution time over the baseline window (microseconds)
12+
*/
13+
public record RegressionPlanMetric(
14+
String app,
15+
String key,
16+
String label,
17+
long recentMeanMicros,
18+
long baselineMeanMicros
19+
) {}
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
package org.ebean.monitor.rollup;
2+
3+
import io.avaje.config.Config;
4+
import jakarta.inject.Inject;
5+
import jakarta.inject.Singleton;
6+
import org.ebean.monitor.v1.model.MissingPlanMetric;
7+
import org.ebean.monitor.v1.web.V1QueryService;
8+
import org.slf4j.Logger;
9+
import org.slf4j.LoggerFactory;
10+
11+
import java.time.Instant;
12+
import java.time.ZoneOffset;
13+
import java.util.List;
14+
import java.util.concurrent.ConcurrentHashMap;
15+
import java.util.concurrent.ConcurrentMap;
16+
import java.util.function.IntFunction;
17+
import java.util.function.LongSupplier;
18+
19+
/**
20+
* Rollup-triggered automatic query-plan capture.
21+
*
22+
* <p>Called by {@link RollupService} after each M1 rollup. Evaluates rules
23+
* against freshly-rolled-up data and pushes capture requests into the delivery
24+
* queue (via {@link V1QueryService#autoPushCapture}) so the next forwarder poll
25+
* collects the plans.
26+
*
27+
* <h3>Rule: never-captured (every M10 rollup)</h3>
28+
* <p>Plan-capable metrics that had activity in the last 10 minutes and have
29+
* <em>never</em> had a query plan captured are automatically requested.
30+
*
31+
* <h3>Rule: stale plan (every M60 rollup)</h3>
32+
* <p>Plan-capable metrics that had activity in the last hour and whose most
33+
* recent captured plan is older than {@code autoplan.rollup.stalePlanDays}
34+
* days are automatically re-requested.
35+
*
36+
* <h3>Rule: regression (every M60 rollup)</h3>
37+
* <p>Plan-capable metrics whose mean execution time over the last hour has
38+
* increased by at least {@code autoplan.rollup.regressionRatio} times compared
39+
* to their mean over the prior {@code autoplan.rollup.regressionBaselineDays}
40+
* days are requested. Requires a minimum baseline mean of
41+
* {@code autoplan.rollup.regressionMinMicros} to filter noise from very fast queries.
42+
*
43+
* <p>All three rules share a per-(app, hash) cooldown to prevent duplicate requests.
44+
*
45+
* <h3>Configuration</h3>
46+
* <ul>
47+
* <li>{@code autoplan.rollup.enabled} — master switch, default {@code false}</li>
48+
* <li>{@code autoplan.rollup.neverCapturedLimit} — max metrics per M10 sweep, default 20</li>
49+
* <li>{@code autoplan.rollup.stalePlanLimit} — max metrics per M60 sweep, default 10</li>
50+
* <li>{@code autoplan.rollup.stalePlanDays} — plan age threshold in days, default 7</li>
51+
* <li>{@code autoplan.rollup.regressionLimit} — max metrics per M60 sweep, default 10</li>
52+
* <li>{@code autoplan.rollup.regressionRatioPct} — mean ratio × 100 (e.g. 150 = 1.5×), default 150</li>
53+
* <li>{@code autoplan.rollup.regressionMinMicros} — min baseline mean to avoid noise, default 10000</li>
54+
* <li>{@code autoplan.rollup.regressionMinCount} — min calls in each window, default 5</li>
55+
* <li>{@code autoplan.rollup.regressionBaselineDays} — prior-window length in days, default 7</li>
56+
* <li>{@code autoplan.rollup.cooldownMinutes} — per-(app,hash) re-request cooldown, default 180</li>
57+
* </ul>
58+
*/
59+
@Singleton
60+
public final class RollupPlanTrigger {
61+
62+
private static final Logger log = LoggerFactory.getLogger(RollupPlanTrigger.class);
63+
64+
private final boolean enabled;
65+
private final int neverCapturedLimit;
66+
private final int stalePlanLimit;
67+
private final int regressionLimit;
68+
private final long cooldownMillis;
69+
70+
private final IntFunction<List<MissingPlanMetric>> neverCapturedQuery;
71+
private final IntFunction<List<MissingPlanMetric>> stalePlanQuery;
72+
private final IntFunction<List<RegressionPlanMetric>> regressionQuery;
73+
private final CaptureRequester captureRequester;
74+
private final LongSupplier clock;
75+
76+
private final ConcurrentMap<String, Long> recentlyRequested = new ConcurrentHashMap<>();
77+
78+
@Inject
79+
public RollupPlanTrigger(V1QueryService queryService) {
80+
this.clock = System::currentTimeMillis;
81+
this.enabled = Config.getBool("autoplan.rollup.enabled", false);
82+
this.neverCapturedLimit = Config.getInt("autoplan.rollup.neverCapturedLimit", 20);
83+
this.stalePlanLimit = Config.getInt("autoplan.rollup.stalePlanLimit", 10);
84+
this.regressionLimit = Config.getInt("autoplan.rollup.regressionLimit", 10);
85+
var cooldownMinutes = Config.getLong("autoplan.rollup.cooldownMinutes", 180);
86+
this.cooldownMillis = cooldownMinutes * 60_000L;
87+
final long stalePlanDays = Config.getLong("autoplan.rollup.stalePlanDays", 7);
88+
final long regressionRatioPct = Config.getLong("autoplan.rollup.regressionRatioPct", 150); // 150 = 1.5x
89+
final long regressionMinMicros = Config.getLong("autoplan.rollup.regressionMinMicros", 10_000);
90+
final int regressionMinCount = Config.getInt("autoplan.rollup.regressionMinCount", 5);
91+
final int regressionBaselineDays = Config.getInt("autoplan.rollup.regressionBaselineDays", 7);
92+
this.neverCapturedQuery = limit -> queryService.topMissingPlans("total", 10L, null, null, null, limit, null);
93+
this.stalePlanQuery = limit -> queryService.topMissingPlans("total", null, 1L, stalePlanDays * 24 * 60, null, limit, null);
94+
this.regressionQuery = limit -> queryService.topRegressionPlans(1, regressionBaselineDays, regressionRatioPct / 100.0, regressionMinMicros, regressionMinCount, limit);
95+
this.captureRequester = queryService::autoPushCapture;
96+
if (enabled) {
97+
log.info("rollup autoplan enabled neverCapturedLimit={} stalePlanLimit={} stalePlanDays={} " +
98+
"regressionLimit={} regressionRatioPct={} regressionMinMicros={} cooldownMinutes={}",
99+
neverCapturedLimit, stalePlanLimit, stalePlanDays,
100+
regressionLimit, regressionRatioPct, regressionMinMicros, cooldownMinutes);
101+
}
102+
}
103+
104+
/** Package-private constructor for testing with stubs. */
105+
RollupPlanTrigger(IntFunction<List<MissingPlanMetric>> neverCapturedQuery,
106+
IntFunction<List<MissingPlanMetric>> stalePlanQuery,
107+
IntFunction<List<RegressionPlanMetric>> regressionQuery,
108+
CaptureRequester captureRequester,
109+
LongSupplier clock) {
110+
this.neverCapturedQuery = neverCapturedQuery;
111+
this.stalePlanQuery = stalePlanQuery;
112+
this.regressionQuery = regressionQuery;
113+
this.captureRequester = captureRequester;
114+
this.clock = clock;
115+
this.enabled = Config.getBool("autoplan.rollup.enabled", false);
116+
this.neverCapturedLimit = Config.getInt("autoplan.rollup.neverCapturedLimit", 20);
117+
this.stalePlanLimit = Config.getInt("autoplan.rollup.stalePlanLimit", 10);
118+
this.regressionLimit = Config.getInt("autoplan.rollup.regressionLimit", 10);
119+
this.cooldownMillis = Config.getLong("autoplan.rollup.cooldownMinutes", 180) * 60_000L;
120+
}
121+
122+
/**
123+
* Evaluate auto-capture rules for the given rollup event time.
124+
* Called by {@link RollupService} after each M1 rollup.
125+
*/
126+
void onRollup(Instant eventTime) {
127+
if (!enabled) return;
128+
int minute = eventTime.atZone(ZoneOffset.UTC).getMinute();
129+
if (minute % 10 == 0) {
130+
triggerNeverCaptured();
131+
if (minute % 60 == 0) {
132+
triggerStalePlans();
133+
triggerRegressionPlans();
134+
}
135+
}
136+
}
137+
138+
private void triggerNeverCaptured() {
139+
int pushed = pushMissingCandidates(neverCapturedQuery.apply(neverCapturedLimit));
140+
if (pushed > 0) {
141+
log.info("autoplan rollup: requested {} never-captured plan capture(s)", pushed);
142+
}
143+
}
144+
145+
private void triggerStalePlans() {
146+
int pushed = pushMissingCandidates(stalePlanQuery.apply(stalePlanLimit));
147+
if (pushed > 0) {
148+
log.info("autoplan rollup: requested {} stale-plan recapture(s)", pushed);
149+
}
150+
}
151+
152+
private void triggerRegressionPlans() {
153+
int pushed = pushRegressionCandidates(regressionQuery.apply(regressionLimit));
154+
if (pushed > 0) {
155+
log.info("autoplan rollup: requested {} regression-detected plan capture(s)", pushed);
156+
}
157+
}
158+
159+
private int pushMissingCandidates(List<MissingPlanMetric> candidates) {
160+
if (candidates.isEmpty()) return 0;
161+
final long now = clock.getAsLong();
162+
int pushed = 0;
163+
for (MissingPlanMetric m : candidates) {
164+
if (tryPush(m.app(), m.key(), m.label(), now)) pushed++;
165+
}
166+
return pushed;
167+
}
168+
169+
private int pushRegressionCandidates(List<RegressionPlanMetric> candidates) {
170+
if (candidates.isEmpty()) return 0;
171+
final long now = clock.getAsLong();
172+
int pushed = 0;
173+
for (RegressionPlanMetric m : candidates) {
174+
if (tryPush(m.app(), m.key(), m.label(), now)) {
175+
log.debug("autoplan rollup regression: app={} key={} recentMean={}us baselineMean={}us",
176+
m.app(), m.key(), m.recentMeanMicros(), m.baselineMeanMicros());
177+
pushed++;
178+
}
179+
}
180+
return pushed;
181+
}
182+
183+
/** Checks cooldown and pushes the capture request. Returns true if the request was pushed. */
184+
private boolean tryPush(String app, String key, String label, long now) {
185+
final String cooldownKey = app + "|" + key;
186+
final Long prev = recentlyRequested.get(cooldownKey);
187+
if (prev != null && (now - prev) < cooldownMillis) return false;
188+
recentlyRequested.put(cooldownKey, now);
189+
captureRequester.request(app, key, label);
190+
return true;
191+
}
192+
193+
/** Functional interface for pushing + recording a capture request. Kept narrow for testability. */
194+
@FunctionalInterface
195+
interface CaptureRequester {
196+
void request(String appName, String metricKey, String metricLabel);
197+
}
198+
}

server/src/main/java/org/ebean/monitor/rollup/RollupService.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@ public class RollupService implements Runnable {
3535
private boolean active;
3636

3737
private final Database database;
38+
private final RollupPlanTrigger rollupPlanTrigger;
3839

39-
public RollupService(Database database) {
40+
public RollupService(Database database, RollupPlanTrigger rollupPlanTrigger) {
4041
this.database = database;
42+
this.rollupPlanTrigger = rollupPlanTrigger;
4143
}
4244

4345
@PostConstruct
@@ -98,6 +100,7 @@ private void performRollup() {
98100
if (lastRollupTime == null || lastRollupTime.isBefore(currentRollupTime)) {
99101
final Rollup rollup = new Rollup(database, currentRollupTime);
100102
rollup.rollup();
103+
rollupPlanTrigger.onRollup(currentRollupTime);
101104
} else {
102105
log.info("skipping already existing rollup");
103106
}

server/src/main/java/org/ebean/monitor/v1/web/V1QueryService.java

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.ebean.monitor.v1.model.MetricTimeBucket;
2929
import org.ebean.monitor.v1.model.MetricTimeseries;
3030
import org.ebean.monitor.v1.model.MissingPlanMetric;
31+
import org.ebean.monitor.rollup.RegressionPlanMetric;
3132
import org.ebean.monitor.v1.model.PendingResponse;
3233
import org.ebean.monitor.v1.model.PendingPlan;
3334
import org.ebean.monitor.v1.model.PlanChange;
@@ -669,6 +670,75 @@ private static MissingPlanMetric toMissingPlanMetric(ResultSet rs, long windowMi
669670
.build();
670671
}
671672

673+
// ---------------------------------------------------------------------------
674+
// Regression plan detection
675+
// ---------------------------------------------------------------------------
676+
677+
/**
678+
* Returns plan-capable metrics whose mean execution time over the last
679+
* {@code recentHours} hours has regressed by at least {@code ratio} times
680+
* compared to their mean over the prior {@code baselineDays} days.
681+
*
682+
* <p>A minimum baseline mean of {@code minMicros} prevents noise from very
683+
* fast queries with large percentage swings. A minimum of {@code minCount}
684+
* calls in both windows filters single-execution spikes.
685+
*/
686+
public List<RegressionPlanMetric> topRegressionPlans(int recentHours, int baselineDays,
687+
double ratio, long minMicros,
688+
int minCount, int limit) {
689+
final Instant now = Instant.now();
690+
final Instant recentFrom = now.minus(Duration.ofHours(recentHours));
691+
final Instant baselineFrom = recentFrom.minus(Duration.ofDays(baselineDays));
692+
final String sql = """
693+
WITH recent AS (
694+
SELECT metric_id,
695+
SUM(total) / NULLIF(SUM(count), 0) AS mean
696+
FROM ebean_insight.timed_m60
697+
WHERE event_time > :recentFrom
698+
GROUP BY metric_id
699+
HAVING SUM(count) >= :minCount
700+
),
701+
baseline AS (
702+
SELECT metric_id,
703+
SUM(total) / NULLIF(SUM(count), 0) AS mean
704+
FROM ebean_insight.timed_m60
705+
WHERE event_time > :baselineFrom AND event_time <= :recentFrom
706+
GROUP BY metric_id
707+
HAVING SUM(count) >= :minCount
708+
)
709+
SELECT m.id AS metric_id,
710+
a.name AS app_name,
711+
COALESCE(m.tags ->> 'label', m.name) AS label,
712+
m.key AS key,
713+
r.mean AS recent_mean,
714+
b.mean AS baseline_mean
715+
FROM recent r
716+
JOIN baseline b ON b.metric_id = r.metric_id
717+
JOIN ebean_insight.app_metric m ON m.id = r.metric_id
718+
JOIN ebean_insight.app a ON a.id = m.app_id
719+
WHERE m.plan_capable = true
720+
AND b.mean > :minMicros
721+
AND r.mean > b.mean * :ratio
722+
ORDER BY (r.mean::double precision / NULLIF(b.mean, 1)) DESC, m.name ASC
723+
LIMIT :limit
724+
""";
725+
return DB.sqlQuery(sql)
726+
.setParameter("recentFrom", recentFrom)
727+
.setParameter("baselineFrom", baselineFrom)
728+
.setParameter("minCount", minCount)
729+
.setParameter("minMicros", minMicros)
730+
.setParameter("ratio", ratio)
731+
.setParameter("limit", limit)
732+
.mapTo((rs, _) -> new RegressionPlanMetric(
733+
rs.getString("app_name"),
734+
rs.getString("key"),
735+
rs.getString("label"),
736+
rs.getLong("recent_mean"),
737+
rs.getLong("baseline_mean")
738+
))
739+
.findList();
740+
}
741+
672742
// ---------------------------------------------------------------------------
673743
// Plans
674744
// ---------------------------------------------------------------------------
@@ -738,6 +808,24 @@ private void recordCaptureRequest(DApp app, @Nullable String envName, DAppMetric
738808
.save();
739809
}
740810

811+
/**
812+
* Push and record a capture request from an internal trigger (e.g. rollup-based
813+
* auto-capture). Bypasses HTTP-layer validation; the caller must have already
814+
* determined the metric is plan-capable. Uses {@code ANY_ENV}: delivered to
815+
* whichever forwarder polls first, regardless of environment.
816+
*/
817+
public void autoPushCapture(String appName, String metricKey, String metricLabel) {
818+
final DApp app = findApp(appName);
819+
if (app == null) {
820+
return;
821+
}
822+
messageService.pushMessage(appName, MessageService.ANY_ENV, "qp:" + metricKey);
823+
new DCaptureRequest(app, metricKey)
824+
.setLabel(metricLabel)
825+
.setRequestedAt(Instant.now())
826+
.save();
827+
}
828+
741829
public List<PendingPlan> listPendingPlans(@Nullable String app, @Nullable String env,
742830
@Nullable String hash, @Nullable String label) {
743831
final Instant from = Instant.now().minus(Duration.ofMinutes(PENDING_STALE_MINUTES));

0 commit comments

Comments
 (0)