Skip to content

Commit b9e46d8

Browse files
authored
Refocus forecast output on effective-token predictions by removing yield and episode metrics (#34750)
1 parent b094538 commit b9e46d8

2 files changed

Lines changed: 2 additions & 150 deletions

File tree

pkg/cli/forecast.go

Lines changed: 2 additions & 149 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
package cli
22

33
// This file implements the `forecast` command, which samples a workflow's recent
4-
// GitHub Actions run history and projects forward effective token usage and yield
5-
// on a per-week or per-month basis.
4+
// GitHub Actions run history and projects forward effective token usage (including
5+
// Monte Carlo probability distributions) on a per-week or per-month basis.
66
//
77
// Workflow metadata (trigger types, concurrency, experiments) is read from the
88
// workflow's Markdown frontmatter so that projections account for how often the
@@ -60,27 +60,6 @@ var (
6060
}
6161
)
6262

63-
// ForecastEpisodeSummary contains episode-level aggregate metrics derived from
64-
// run history without downloading artifacts. Episodes are reconstructed from the
65-
// fields available in the GitHub Actions run list (event type, head SHA, branch).
66-
// Dispatch and workflow_call linkages that require aw_info.json are not available
67-
// in this lightweight analysis, so the episode count is a lower-bound estimate.
68-
type ForecastEpisodeSummary struct {
69-
// SampledEpisodes is the number of distinct episodes detected in the sampled
70-
// run history. Each "episode" represents one logical task execution, which may
71-
// span multiple runs when a workflow dispatches sub-workflows.
72-
SampledEpisodes int `json:"sampled_episodes"`
73-
// RunsPerEpisode is the average number of runs per episode (SampledRuns /
74-
// SampledEpisodes). Values > 1 indicate orchestrator-style workflows that
75-
// dispatch multiple sub-workflows per task.
76-
RunsPerEpisode float64 `json:"runs_per_episode"`
77-
// AvgEffectiveTokensPerEpisode is the mean effective-token count per episode.
78-
AvgEffectiveTokensPerEpisode int `json:"avg_effective_tokens_per_episode"`
79-
// ObservedEpisodesPerPeriod is the projected number of episodes in the forecast
80-
// period, scaled from the observed episode frequency.
81-
ObservedEpisodesPerPeriod float64 `json:"observed_episodes_per_period"`
82-
}
83-
8463
// ForecastWorkflowResult contains the projected metrics for a single workflow.
8564
type ForecastWorkflowResult struct {
8665
// WorkflowID is the short identifier of the workflow (basename without .md).
@@ -97,8 +76,6 @@ type ForecastWorkflowResult struct {
9776

9877
// SuccessRate is the fraction of sampled runs that completed successfully (0–1).
9978
SuccessRate float64 `json:"success_rate"`
100-
// Yield is the effective throughput: success rate × observed runs per period.
101-
Yield float64 `json:"yield"`
10279

10380
// Average per-run metrics (from completed runs).
10481
AvgEffectiveTokens int `json:"avg_effective_tokens"`
@@ -107,10 +84,6 @@ type ForecastWorkflowResult struct {
10784
// Projected totals for the period.
10885
ProjectedEffectiveTokens int `json:"projected_effective_tokens"`
10986

110-
// EpisodeAnalysis contains episode-level metrics derived from the sampled runs.
111-
// Nil when no completed runs were available to analyze.
112-
EpisodeAnalysis *ForecastEpisodeSummary `json:"episode_analysis,omitempty"`
113-
11487
// MonteCarlo contains the probability distribution of projected effective-token
11588
// counts derived from a Monte Carlo simulation (10 000 trials).
11689
// Nil when no completed runs were available.
@@ -559,9 +532,6 @@ func forecastWorkflow(ctx context.Context, workflowName, startDate string, confi
559532
// scaled to the projection period.
560533
result.ObservedRunsPerPeriod = float64(n) / float64(config.Days) * float64(periodDays)
561534

562-
// Effective throughput (yield) accounts for the success rate.
563-
result.Yield = result.ObservedRunsPerPeriod * result.SuccessRate
564-
565535
// Projected token usage (point estimate using simple means).
566536
result.ProjectedEffectiveTokens = int(math.Round(result.ObservedRunsPerPeriod * float64(result.AvgEffectiveTokens)))
567537

@@ -573,10 +543,6 @@ func forecastWorkflow(ctx context.Context, workflowName, startDate string, confi
573543
// Populate experiment variant fractions from run history when metadata has variants.
574544
result.ExperimentVariants = computeVariantFractions(result.ExperimentVariants, completed)
575545

576-
// Build lightweight episode analysis from the completed runs using the fields
577-
// available in the GitHub Actions run list (no artifact download required).
578-
result.EpisodeAnalysis = buildForecastEpisodeSummary(completed, config.Days, periodDays)
579-
580546
return result, nil
581547
}
582548

@@ -753,75 +719,6 @@ func extractWorkflowIDFromName(name string) string {
753719
return name
754720
}
755721

756-
// workflowRunToRunData converts a WorkflowRun (sourced from the GitHub Actions API)
757-
// to a RunData using the fields available without artifact downloads. Fields that
758-
// require aw_info.json (AwContext, Repository, Ref, SHA, Actor, RunAttempt, …) are
759-
// left as zero values; the episode engine degrades gracefully when they are absent.
760-
func workflowRunToRunData(r WorkflowRun) RunData {
761-
return RunData{
762-
RunID: r.DatabaseID,
763-
Number: r.Number,
764-
WorkflowName: r.WorkflowName,
765-
WorkflowPath: r.WorkflowPath,
766-
Status: r.Status,
767-
Conclusion: r.Conclusion,
768-
URL: r.URL,
769-
Event: r.Event,
770-
Branch: r.HeadBranch,
771-
HeadSHA: r.HeadSha,
772-
DisplayTitle: r.DisplayTitle,
773-
CreatedAt: r.CreatedAt,
774-
StartedAt: r.StartedAt,
775-
UpdatedAt: r.UpdatedAt,
776-
TokenUsage: r.TokenUsage,
777-
EffectiveTokens: r.EffectiveTokens,
778-
EstimatedCost: r.EstimatedCost,
779-
}
780-
}
781-
782-
// buildForecastEpisodeSummary derives episode-level metrics from a slice of
783-
// completed WorkflowRun objects using the lightweight episode engine. Returns nil
784-
// when no runs are provided.
785-
//
786-
// Because only GitHub API fields are available (no aw_info.json artifacts), the
787-
// episode engine can link runs via workflow_run event SHA/branch matching but
788-
// cannot detect dispatch or workflow_call lineage. The resulting episode count is
789-
// therefore a lower-bound estimate for orchestrator-style workflows.
790-
func buildForecastEpisodeSummary(runs []WorkflowRun, historyDays, periodDays int) *ForecastEpisodeSummary {
791-
if len(runs) == 0 {
792-
return nil
793-
}
794-
795-
runData := make([]RunData, 0, len(runs))
796-
for _, r := range runs {
797-
runData = append(runData, workflowRunToRunData(r))
798-
}
799-
800-
// buildEpisodeData returns (episodes, edges); edges are not needed for
801-
// the lightweight forecast summary so they are intentionally discarded.
802-
episodes, _ := buildEpisodeData(runData, nil)
803-
numEpisodes := len(episodes)
804-
if numEpisodes == 0 {
805-
return nil
806-
}
807-
808-
var totalEpisodeET int
809-
for _, ep := range episodes {
810-
totalEpisodeET += ep.TotalEffectiveTokens
811-
}
812-
813-
avgETPerEpisode := totalEpisodeET / numEpisodes
814-
runsPerEpisode := float64(len(runs)) / float64(numEpisodes)
815-
observedEpisodesPerPeriod := float64(numEpisodes) / float64(historyDays) * float64(periodDays)
816-
817-
return &ForecastEpisodeSummary{
818-
SampledEpisodes: numEpisodes,
819-
RunsPerEpisode: runsPerEpisode,
820-
AvgEffectiveTokensPerEpisode: avgETPerEpisode,
821-
ObservedEpisodesPerPeriod: observedEpisodesPerPeriod,
822-
}
823-
}
824-
825722
// loadCachedEffectiveTokens looks up a locally-cached RunSummary for the given
826723
// run ID and returns the TotalEffectiveTokens from its TokenUsage summary.
827724
// Returns 0 when no cache exists or the cache does not contain token data.
@@ -953,7 +850,6 @@ type forecastTableRow struct {
953850
Workflow string `json:"workflow" console:"header:Workflow"`
954851
Runs int `json:"runs" console:"header:Sampled Runs"`
955852
SuccessRate string `json:"success_rate" console:"header:Success Rate"`
956-
Yield string `json:"yield" console:"header:Yield/Period"`
957853
AvgEffectiveTokens string `json:"avg_effective_tokens" console:"header:Avg ET"`
958854
ProjectedTokens string `json:"projected_tokens" console:"header:Proj. ET (P50)"`
959855
ETRange string `json:"et_range" console:"header:80% CI (P10–P90)"`
@@ -992,7 +888,6 @@ func renderForecastTable(output ForecastResult, config ForecastConfig) error {
992888
Workflow: wf.WorkflowID + unreliableMark,
993889
Runs: wf.SampledRuns,
994890
SuccessRate: formatForecastPercent(wf.SuccessRate, wf.SampledRuns > 0),
995-
Yield: fmt.Sprintf("%.1f", wf.Yield),
996891
AvgEffectiveTokens: formatForecastTokens(wf.AvgEffectiveTokens),
997892
ProjectedTokens: projETStr,
998893
ETRange: etRangeStr,
@@ -1004,18 +899,6 @@ func renderForecastTable(output ForecastResult, config ForecastConfig) error {
1004899
fmt.Fprint(os.Stderr, console.RenderStruct(rows))
1005900
fmt.Fprintln(os.Stderr, "")
1006901

1007-
// Show episode analysis when any workflow has multi-run episodes.
1008-
anyMultiRunEpisodes := false
1009-
for _, wf := range output.Workflows {
1010-
if wf.EpisodeAnalysis != nil && wf.EpisodeAnalysis.RunsPerEpisode > 1.0 {
1011-
anyMultiRunEpisodes = true
1012-
break
1013-
}
1014-
}
1015-
if anyMultiRunEpisodes {
1016-
printEpisodeBreakdown(output.Workflows)
1017-
}
1018-
1019902
// Show experiment variant details when present.
1020903
for _, wf := range output.Workflows {
1021904
if len(wf.ExperimentVariants) > 0 {
@@ -1039,36 +922,6 @@ func renderForecastTable(output ForecastResult, config ForecastConfig) error {
1039922
return nil
1040923
}
1041924

1042-
// printEpisodeBreakdown renders per-episode ET metrics for workflows that have
1043-
// multi-run episodes (i.e. orchestrator-style workflows dispatching sub-workflows).
1044-
func printEpisodeBreakdown(workflows []ForecastWorkflowResult) {
1045-
type episodeRow struct {
1046-
Workflow string `json:"workflow" console:"header:Workflow"`
1047-
Episodes int `json:"episodes" console:"header:Episodes"`
1048-
RunsPerEpisode string `json:"runs_per_episode" console:"header:Runs/Episode"`
1049-
AvgETPerEpisode string `json:"avg_et_per_episode" console:"header:Avg ET/Episode"`
1050-
EpisodesPerPeriod string `json:"episodes_per_period" console:"header:Episodes/Period"`
1051-
}
1052-
1053-
fmt.Fprintln(os.Stderr, console.FormatInfoMessage("Episode analysis (runs grouped by logical task):"))
1054-
epRows := make([]episodeRow, 0, len(workflows))
1055-
for _, wf := range workflows {
1056-
ep := wf.EpisodeAnalysis
1057-
if ep == nil {
1058-
continue
1059-
}
1060-
epRows = append(epRows, episodeRow{
1061-
Workflow: wf.WorkflowID,
1062-
Episodes: ep.SampledEpisodes,
1063-
RunsPerEpisode: fmt.Sprintf("%.1f", ep.RunsPerEpisode),
1064-
AvgETPerEpisode: formatForecastTokens(ep.AvgEffectiveTokensPerEpisode),
1065-
EpisodesPerPeriod: fmt.Sprintf("%.1f", ep.ObservedEpisodesPerPeriod),
1066-
})
1067-
}
1068-
fmt.Fprint(os.Stderr, console.RenderStruct(epRows))
1069-
fmt.Fprintln(os.Stderr, "")
1070-
}
1071-
1072925
// printEvalBreakdown renders the backtesting comparison table.
1073926
func printEvalBreakdown(workflows []ForecastWorkflowResult) {
1074927
type evalRow struct {

pkg/cli/forecast_test.go

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -229,7 +229,6 @@ func TestRenderForecastTable_ZeroMonteCarloRangeRendersDash(t *testing.T) {
229229
WorkflowID: "smoke-copilot",
230230
SampledRuns: 1,
231231
SuccessRate: 1,
232-
Yield: 1,
233232
MonteCarlo: &ForecastMonteCarloSummary{
234233
P10ProjectedEffectiveTokens: 0,
235234
P50ProjectedEffectiveTokens: 0,

0 commit comments

Comments
 (0)