Refocus forecast output on effective-token predictions by removing yield and episode metrics (#34750)

Copilot · web-flow · commit b9e46d81ee12 · 2026-05-25T13:10:38.000-07:00
diff --git a/pkg/cli/forecast.go b/pkg/cli/forecast.go
@@ -1,8 +1,8 @@
 package cli
 
 // This file implements the `forecast` command, which samples a workflow's recent
-// GitHub Actions run history and projects forward effective token usage and yield
-// on a per-week or per-month basis.
+// GitHub Actions run history and projects effective-token usage forward (including
+// Monte Carlo probability distributions) on a per-week or per-month basis.
 //
 // Workflow metadata (trigger types, concurrency, experiments) is read from the
 // workflow's Markdown frontmatter so that projections account for how often the
@@ -60,27 +60,6 @@ var (
 	}
 )
 
-// ForecastEpisodeSummary contains episode-level aggregate metrics derived from
-// run history without downloading artifacts.  Episodes are reconstructed from the
-// fields available in the GitHub Actions run list (event type, head SHA, branch).
-// Dispatch and workflow_call linkages that require aw_info.json are not available
-// in this lightweight analysis, so the episode count is a lower-bound estimate.
-type ForecastEpisodeSummary struct {
-	// SampledEpisodes is the number of distinct episodes detected in the sampled
-	// run history.  Each "episode" represents one logical task execution, which may
-	// span multiple runs when a workflow dispatches sub-workflows.
-	SampledEpisodes int `json:"sampled_episodes"`
-	// RunsPerEpisode is the average number of runs per episode (SampledRuns /
-	// SampledEpisodes).  Values > 1 indicate orchestrator-style workflows that
-	// dispatch multiple sub-workflows per task.
-	RunsPerEpisode float64 `json:"runs_per_episode"`
-	// AvgEffectiveTokensPerEpisode is the mean effective-token count per episode.
-	AvgEffectiveTokensPerEpisode int `json:"avg_effective_tokens_per_episode"`
-	// ObservedEpisodesPerPeriod is the projected number of episodes in the forecast
-	// period, scaled from the observed episode frequency.
-	ObservedEpisodesPerPeriod float64 `json:"observed_episodes_per_period"`
-}
-
 // ForecastWorkflowResult contains the projected metrics for a single workflow.
 type ForecastWorkflowResult struct {
 	// WorkflowID is the short identifier of the workflow (basename without .md).
@@ -97,8 +76,6 @@ type ForecastWorkflowResult struct {
 
 	// SuccessRate is the fraction of sampled runs that completed successfully (0–1).
 	SuccessRate float64 `json:"success_rate"`
-	// Yield is the effective throughput: success rate × observed runs per period.
-	Yield float64 `json:"yield"`
 
 	// Average per-run metrics (from completed runs).
 	AvgEffectiveTokens int     `json:"avg_effective_tokens"`
@@ -107,10 +84,6 @@ type ForecastWorkflowResult struct {
 	// Projected totals for the period.
 	ProjectedEffectiveTokens int `json:"projected_effective_tokens"`
 
-	// EpisodeAnalysis contains episode-level metrics derived from the sampled runs.
-	// Nil when no completed runs were available to analyze.
-	EpisodeAnalysis *ForecastEpisodeSummary `json:"episode_analysis,omitempty"`
-
 	// MonteCarlo contains the probability distribution of projected effective-token
 	// counts derived from a Monte Carlo simulation (10 000 trials).
 	// Nil when no completed runs were available.
@@ -559,9 +532,6 @@ func forecastWorkflow(ctx context.Context, workflowName, startDate string, confi
 	// scaled to the projection period.
 	result.ObservedRunsPerPeriod = float64(n) / float64(config.Days) * float64(periodDays)
 
-	// Effective throughput (yield) accounts for the success rate.
-	result.Yield = result.ObservedRunsPerPeriod * result.SuccessRate
-
 	// Projected token usage (point estimate using simple means).
 	result.ProjectedEffectiveTokens = int(math.Round(result.ObservedRunsPerPeriod * float64(result.AvgEffectiveTokens)))
 
@@ -573,10 +543,6 @@ func forecastWorkflow(ctx context.Context, workflowName, startDate string, confi
 	// Populate experiment variant fractions from run history when metadata has variants.
 	result.ExperimentVariants = computeVariantFractions(result.ExperimentVariants, completed)
 
-	// Build lightweight episode analysis from the completed runs using the fields
-	// available in the GitHub Actions run list (no artifact download required).
-	result.EpisodeAnalysis = buildForecastEpisodeSummary(completed, config.Days, periodDays)
-
 	return result, nil
 }
 
@@ -753,75 +719,6 @@ func extractWorkflowIDFromName(name string) string {
 	return name
 }
 
-// workflowRunToRunData converts a WorkflowRun (sourced from the GitHub Actions API)
-// to a RunData using the fields available without artifact downloads.  Fields that
-// require aw_info.json (AwContext, Repository, Ref, SHA, Actor, RunAttempt, …) are
-// left as zero values; the episode engine degrades gracefully when they are absent.
-func workflowRunToRunData(r WorkflowRun) RunData {
-	return RunData{
-		RunID:           r.DatabaseID,
-		Number:          r.Number,
-		WorkflowName:    r.WorkflowName,
-		WorkflowPath:    r.WorkflowPath,
-		Status:          r.Status,
-		Conclusion:      r.Conclusion,
-		URL:             r.URL,
-		Event:           r.Event,
-		Branch:          r.HeadBranch,
-		HeadSHA:         r.HeadSha,
-		DisplayTitle:    r.DisplayTitle,
-		CreatedAt:       r.CreatedAt,
-		StartedAt:       r.StartedAt,
-		UpdatedAt:       r.UpdatedAt,
-		TokenUsage:      r.TokenUsage,
-		EffectiveTokens: r.EffectiveTokens,
-		EstimatedCost:   r.EstimatedCost,
-	}
-}
-
-// buildForecastEpisodeSummary derives episode-level metrics from a slice of
-// completed WorkflowRun objects using the lightweight episode engine.  Returns nil
-// when no runs are provided.
-//
-// Because only GitHub API fields are available (no aw_info.json artifacts), the
-// episode engine can link runs via workflow_run event SHA/branch matching but
-// cannot detect dispatch or workflow_call lineage.  The resulting episode count is
-// therefore a lower-bound estimate for orchestrator-style workflows.
-func buildForecastEpisodeSummary(runs []WorkflowRun, historyDays, periodDays int) *ForecastEpisodeSummary {
-	if len(runs) == 0 {
-		return nil
-	}
-
-	runData := make([]RunData, 0, len(runs))
-	for _, r := range runs {
-		runData = append(runData, workflowRunToRunData(r))
-	}
-
-	// buildEpisodeData returns (episodes, edges); edges are not needed for
-	// the lightweight forecast summary so they are intentionally discarded.
-	episodes, _ := buildEpisodeData(runData, nil)
-	numEpisodes := len(episodes)
-	if numEpisodes == 0 {
-		return nil
-	}
-
-	var totalEpisodeET int
-	for _, ep := range episodes {
-		totalEpisodeET += ep.TotalEffectiveTokens
-	}
-
-	avgETPerEpisode := totalEpisodeET / numEpisodes
-	runsPerEpisode := float64(len(runs)) / float64(numEpisodes)
-	observedEpisodesPerPeriod := float64(numEpisodes) / float64(historyDays) * float64(periodDays)
-
-	return &ForecastEpisodeSummary{
-		SampledEpisodes:              numEpisodes,
-		RunsPerEpisode:               runsPerEpisode,
-		AvgEffectiveTokensPerEpisode: avgETPerEpisode,
-		ObservedEpisodesPerPeriod:    observedEpisodesPerPeriod,
-	}
-}
-
 // loadCachedEffectiveTokens looks up a locally-cached RunSummary for the given
 // run ID and returns the TotalEffectiveTokens from its TokenUsage summary.
 // Returns 0 when no cache exists or the cache does not contain token data.
@@ -953,7 +850,6 @@ type forecastTableRow struct {
 	Workflow           string `json:"workflow"                console:"header:Workflow"`
 	Runs               int    `json:"runs"                    console:"header:Sampled Runs"`
 	SuccessRate        string `json:"success_rate"            console:"header:Success Rate"`
-	Yield              string `json:"yield"                   console:"header:Yield/Period"`
 	AvgEffectiveTokens string `json:"avg_effective_tokens"    console:"header:Avg ET"`
 	ProjectedTokens    string `json:"projected_tokens"        console:"header:Proj. ET (P50)"`
 	ETRange            string `json:"et_range"                console:"header:80% CI (P10–P90)"`
@@ -992,7 +888,6 @@ func renderForecastTable(output ForecastResult, config ForecastConfig) error {
 			Workflow:           wf.WorkflowID + unreliableMark,
 			Runs:               wf.SampledRuns,
 			SuccessRate:        formatForecastPercent(wf.SuccessRate, wf.SampledRuns > 0),
-			Yield:              fmt.Sprintf("%.1f", wf.Yield),
 			AvgEffectiveTokens: formatForecastTokens(wf.AvgEffectiveTokens),
 			ProjectedTokens:    projETStr,
 			ETRange:            etRangeStr,
@@ -1004,18 +899,6 @@ func renderForecastTable(output ForecastResult, config ForecastConfig) error {
 	fmt.Fprint(os.Stderr, console.RenderStruct(rows))
 	fmt.Fprintln(os.Stderr, "")
 
-	// Show episode analysis when any workflow has multi-run episodes.
-	anyMultiRunEpisodes := false
-	for _, wf := range output.Workflows {
-		if wf.EpisodeAnalysis != nil && wf.EpisodeAnalysis.RunsPerEpisode > 1.0 {
-			anyMultiRunEpisodes = true
-			break
-		}
-	}
-	if anyMultiRunEpisodes {
-		printEpisodeBreakdown(output.Workflows)
-	}
-
 	// Show experiment variant details when present.
 	for _, wf := range output.Workflows {
 		if len(wf.ExperimentVariants) > 0 {
@@ -1039,36 +922,6 @@ func renderForecastTable(output ForecastResult, config ForecastConfig) error {
 	return nil
 }
 
-// printEpisodeBreakdown renders per-episode ET metrics for workflows that have
-// multi-run episodes (i.e. orchestrator-style workflows dispatching sub-workflows).
-func printEpisodeBreakdown(workflows []ForecastWorkflowResult) {
-	type episodeRow struct {
-		Workflow          string `json:"workflow"               console:"header:Workflow"`
-		Episodes          int    `json:"episodes"               console:"header:Episodes"`
-		RunsPerEpisode    string `json:"runs_per_episode"       console:"header:Runs/Episode"`
-		AvgETPerEpisode   string `json:"avg_et_per_episode"     console:"header:Avg ET/Episode"`
-		EpisodesPerPeriod string `json:"episodes_per_period"    console:"header:Episodes/Period"`
-	}
-
-	fmt.Fprintln(os.Stderr, console.FormatInfoMessage("Episode analysis (runs grouped by logical task):"))
-	epRows := make([]episodeRow, 0, len(workflows))
-	for _, wf := range workflows {
-		ep := wf.EpisodeAnalysis
-		if ep == nil {
-			continue
-		}
-		epRows = append(epRows, episodeRow{
-			Workflow:          wf.WorkflowID,
-			Episodes:          ep.SampledEpisodes,
-			RunsPerEpisode:    fmt.Sprintf("%.1f", ep.RunsPerEpisode),
-			AvgETPerEpisode:   formatForecastTokens(ep.AvgEffectiveTokensPerEpisode),
-			EpisodesPerPeriod: fmt.Sprintf("%.1f", ep.ObservedEpisodesPerPeriod),
-		})
-	}
-	fmt.Fprint(os.Stderr, console.RenderStruct(epRows))
-	fmt.Fprintln(os.Stderr, "")
-}
-
 // printEvalBreakdown renders the backtesting comparison table.
 func printEvalBreakdown(workflows []ForecastWorkflowResult) {
 	type evalRow struct {
diff --git a/pkg/cli/forecast_test.go b/pkg/cli/forecast_test.go
@@ -229,7 +229,6 @@ func TestRenderForecastTable_ZeroMonteCarloRangeRendersDash(t *testing.T) {
 				WorkflowID:  "smoke-copilot",
 				SampledRuns: 1,
 				SuccessRate: 1,
-				Yield:       1,
 				MonteCarlo: &ForecastMonteCarloSummary{
 					P10ProjectedEffectiveTokens: 0,
 					P50ProjectedEffectiveTokens: 0,