Skip to content

Commit 94eb353

Browse files
committed
rewrite
1 parent 640bd44 commit 94eb353

16 files changed

Lines changed: 1705 additions & 1111 deletions

File tree

README.md

Lines changed: 92 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,69 @@
11
# terrascaler
22

3-
Terrascaler is an external gRPC cloud provider for Kubernetes Cluster Autoscaler.
4-
Instead of creating machines directly, it updates a configured integer value in a
5-
Terraform/OpenTofu repository through the GitLab API. The resulting GitLab commit
6-
can then trigger the repository's CI pipeline to run Terraform/OpenTofu and add
7-
workers.
3+
Terrascaler is a small Kubernetes autoscaler for clusters whose worker count is
4+
managed by Terraform/OpenTofu in GitLab.
85

9-
The implementation follows the Cluster Autoscaler external gRPC provider service:
10-
`clusterautoscaler.cloudprovider.v1.externalgrpc.CloudProvider`.
6+
It does not implement a Kubernetes Cluster Autoscaler cloud provider. Instead,
7+
Terrascaler runs its own reduced autoscaling loop:
8+
9+
1. Read the current Terraform worker count from GitLab.
10+
2. List Kubernetes nodes and pods.
11+
3. Find pending unscheduled pods.
12+
4. Estimate whether those pods fit on current worker capacity.
13+
5. If not, compute how many worker nodes are needed.
14+
6. Commit the updated Terraform worker count to GitLab.
15+
16+
The GitLab commit is expected to trigger the repository's CI pipeline, which then
17+
runs Terraform/OpenTofu and adds workers.
1118

1219
## Scope
1320

14-
Terrascaler currently supports scale-up only. `NodeGroupIncreaseSize` reads the
15-
configured Terraform attribute, adds the autoscaler delta, and commits the updated
16-
file to GitLab. Node deletion and target-size decreases intentionally return
17-
`Unimplemented`.
21+
Terrascaler intentionally supports a narrow feature set:
22+
23+
- scale up only
24+
- one Terraform-managed worker group
25+
- CPU, memory, and pod-count bin packing
26+
- optional node label selector for Terraform-managed workers
27+
- no cloud provider integrations
28+
- no scale down
29+
- no expander strategies, priorities, balancing, or pricing logic
1830

1931
## Configuration
2032

2133
All settings can be passed as flags or environment variables.
2234

23-
| Flag | Environment | Required | Default | Description |
24-
| ---------------------- | -------------------------------- | -------- | ---------- | ------------------------------------------------------------- |
25-
| `--listen` | `TERRASCALER_LISTEN` | no | `:8080` | gRPC listen address |
26-
| `--tls-cert-file` | `TERRASCALER_TLS_CERT_FILE` | no | | Server TLS certificate file |
27-
| `--tls-key-file` | `TERRASCALER_TLS_KEY_FILE` | no | | Server TLS key file |
28-
| `--tls-client-ca-file` | `TERRASCALER_TLS_CLIENT_CA_FILE` | no | | Client CA file; enables mTLS when set |
29-
| `--gitlab-base-url` | `GITLAB_BASE_URL` | no | GitLab.com | GitLab URL, for example `https://gitlab.example.com` |
30-
| `--gitlab-token` | `GITLAB_TOKEN` | yes | | GitLab API token with repository write access |
31-
| `--gitlab-project` | `GITLAB_PROJECT` | yes | | GitLab project ID or path, for example `group/platform/infra` |
32-
| `--gitlab-branch` | `GITLAB_BRANCH` | no | `main` | Branch to update |
33-
| `--file` | `TERRASCALER_FILE` | yes | | Terraform file path in the repository |
34-
| `--block-type` | `TERRASCALER_BLOCK_TYPE` | no | `module` | Terraform block type containing the field |
35-
| `--block-labels` | `TERRASCALER_BLOCK_LABELS` | no | | Comma-separated block labels |
36-
| `--attribute` | `TERRASCALER_ATTRIBUTE` | yes | | Integer Terraform attribute to update |
37-
| `--node-group-id` | `TERRASCALER_NODE_GROUP_ID` | no | `default` | Node group ID exposed to Cluster Autoscaler |
38-
| `--min-size` | `TERRASCALER_MIN_SIZE` | no | `0` | Node group minimum |
39-
| `--max-size` | `TERRASCALER_MAX_SIZE` | no | `100` | Node group maximum |
40-
| `--template-cpu` | `TERRASCALER_TEMPLATE_CPU` | no | `2` | Template node CPU capacity for scale-up simulation |
41-
| `--template-memory` | `TERRASCALER_TEMPLATE_MEMORY` | no | `8Gi` | Template node memory capacity |
42-
| `--template-pods` | `TERRASCALER_TEMPLATE_PODS` | no | `110` | Template node pod capacity |
43-
| `--template-labels` | `TERRASCALER_TEMPLATE_LABELS` | no | | Comma-separated labels for the template node, `key=value` |
35+
| Flag | Environment | Required | Default | Description |
36+
| ----------------------- | --------------------------------- | -------- | ----------------- | ------------------------------------------------------------------------------- |
37+
| `--kubeconfig` | `KUBECONFIG` | no | in-cluster config | Path to kubeconfig |
38+
| `--check-interval` | `TERRASCALER_CHECK_INTERVAL` | no | `1m` | Autoscaling check interval |
39+
| `--scale-up-cooldown` | `TERRASCALER_SCALE_UP_COOLDOWN` | no | `5m` | Minimum time between scale-up commits |
40+
| `--pending-pod-min-age` | `TERRASCALER_PENDING_POD_MIN_AGE` | no | `30s` | Minimum age for pending pods without an Unschedulable condition |
41+
| `--metrics-address` | `TERRASCALER_METRICS_ADDRESS` | no | `:8080` | Prometheus metrics listen address; empty disables metrics |
42+
| `--once` | `TERRASCALER_ONCE` | no | `false` | Run one autoscaling check and exit |
43+
| `--dry-run` | `TERRASCALER_DRY_RUN` | no | `false` | Log intended scaling actions without updating GitLab |
44+
| `--gitlab-base-url` | `GITLAB_BASE_URL` | no | GitLab.com | GitLab URL, for example `https://gitlab.example.com` |
45+
| `--gitlab-token` | `GITLAB_TOKEN` | yes | | GitLab API token with repository write access |
46+
| `--gitlab-project` | `GITLAB_PROJECT` | yes | | GitLab project ID or path, for example `group/platform/infra` |
47+
| `--gitlab-branch` | `GITLAB_BRANCH` | no | `main` | Branch to update |
48+
| `--gitlab-merge-request` | `TERRASCALER_GITLAB_MERGE_REQUEST` | no | `false` | Create or update a GitLab merge request instead of committing directly |
49+
| `--gitlab-mr-branch-prefix` | `TERRASCALER_GITLAB_MR_BRANCH_PREFIX` | no | `terrascaler/scale` | Branch prefix for merge request mode |
50+
| `--gitlab-mr-title` | `TERRASCALER_GITLAB_MR_TITLE` | no | `terrascaler: scale worker count` | Merge request title |
51+
| `--gitlab-mr-description` | `TERRASCALER_GITLAB_MR_DESCRIPTION` | no | automated proposal text | Merge request description prefix |
52+
| `--gitlab-mr-labels` | `TERRASCALER_GITLAB_MR_LABELS` | no | `terrascaler` | Comma-separated merge request labels |
53+
| `--gitlab-mr-assignee-ids` | `TERRASCALER_GITLAB_MR_ASSIGNEE_IDS` | no | | Comma-separated GitLab user IDs to assign |
54+
| `--gitlab-mr-reviewer-ids` | `TERRASCALER_GITLAB_MR_REVIEWER_IDS` | no | | Comma-separated GitLab user IDs to request review from |
55+
| `--gitlab-mr-remove-source-branch` | `TERRASCALER_GITLAB_MR_REMOVE_SOURCE_BRANCH` | no | `true` | Remove MR source branch after merge |
56+
| `--file` | `TERRASCALER_FILE` | yes | | Terraform file path in the repository |
57+
| `--block-type` | `TERRASCALER_BLOCK_TYPE` | no | `module` | Terraform block type containing the field |
58+
| `--block-labels` | `TERRASCALER_BLOCK_LABELS` | no | | Comma-separated Terraform block labels |
59+
| `--attribute` | `TERRASCALER_ATTRIBUTE` | yes | | Integer Terraform attribute to update |
60+
| `--min-size` | `TERRASCALER_MIN_SIZE` | no | `0` | Minimum target size |
61+
| `--max-size` | `TERRASCALER_MAX_SIZE` | no | `100` | Maximum target size |
62+
| `--node-selector` | `TERRASCALER_NODE_SELECTOR` | no | | Comma-separated node labels, `key=value`, identifying Terraform-managed workers |
63+
| `--template-cpu` | `TERRASCALER_TEMPLATE_CPU` | no | `2` | New worker CPU capacity |
64+
| `--template-memory` | `TERRASCALER_TEMPLATE_MEMORY` | no | `8Gi` | New worker memory capacity |
65+
| `--template-pods` | `TERRASCALER_TEMPLATE_PODS` | no | `110` | New worker pod capacity |
66+
| `--template-labels` | `TERRASCALER_TEMPLATE_LABELS` | no | | Reserved metadata for new worker labels |
4467

4568
Example for:
4669

@@ -59,30 +82,53 @@ terrascaler \
5982
--block-type module \
6083
--block-labels hostedcluster \
6184
--attribute worker_count \
62-
--node-group-id hostedcluster-workers \
6385
--min-size 3 \
64-
--max-size 20
86+
--max-size 20 \
87+
--node-selector node-role.kubernetes.io/worker= \
88+
--template-cpu 4 \
89+
--template-memory 16Gi \
90+
--template-pods 110
6591
```
6692

67-
Cluster Autoscaler should use `--cloud-provider=externalgrpc` and a cloud config
68-
that points at this service address, as documented by upstream Kubernetes
69-
Autoscaler.
93+
## Autoscaling Behavior
7094

71-
```yaml
72-
address: terrascaler.default.svc.cluster.local:8080
73-
grpc_timeout: 30s
74-
```
95+
Terrascaler considers a pod eligible for scale-up when it is pending, not bound to
96+
a node, and either:
97+
98+
- has `PodScheduled=False` with reason `Unschedulable`, or
99+
- is older than `--pending-pod-min-age`.
75100

76-
For TLS or mTLS, provide `--tls-cert-file` and `--tls-key-file` to Terrascaler.
77-
Providing `--tls-client-ca-file` enables client certificate verification, matching
78-
the upstream external gRPC provider recommendation.
101+
Terrascaler subtracts the requests of already-scheduled pods from the capacity of matching ready, schedulable nodes, then
102+
tries to pack eligible pending pods into the remaining capacity. Any pods that do
103+
not fit are packed onto synthetic nodes using `--template-cpu`,
104+
`--template-memory`, and `--template-pods`. The number of synthetic nodes needed
105+
is added to the current Terraform target, capped at `--max-size`.
79106

80-
## Notes
107+
## Monitoring
81108

82-
The target Terraform value must currently be a literal integer. Values such as
109+
Terrascaler exposes Prometheus metrics on `/metrics` at `--metrics-address`.
110+
111+
Important metrics:
112+
113+
- `terrascaler_scale_down_potential_nodes`: approximate number of nodes that may
114+
be removable. Terrascaler only reports this and does not scale down.
115+
- `terrascaler_current_target_nodes`: current Terraform target node count.
116+
- `terrascaler_desired_target_nodes`: desired target node count from the latest
117+
plan.
118+
- `terrascaler_new_nodes_required`: new nodes needed by the latest plan.
119+
- `terrascaler_last_check_success`: `1` when the last check succeeded.
120+
121+
The target Terraform value must be a literal integer. Values such as
83122
`worker_count = var.worker_count` are rejected because Terrascaler cannot safely
84123
evaluate them from a single file.
85124

125+
## Merge Request Mode
126+
127+
By default, Terrascaler commits directly to `--gitlab-branch`. With
128+
`--gitlab-merge-request`, Terrascaler creates a branch from the target branch,
129+
commits the Terraform change there, and opens a GitLab merge request. Optional
130+
assignee and reviewer settings use GitLab numeric user IDs.
131+
86132
## Development
87133

88134
Common targets follow the other Containeroo Go tools:

cmd/terrascaler/main.go

Lines changed: 87 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,23 @@
11
package main
22

33
import (
4-
"crypto/tls"
5-
"crypto/x509"
4+
"context"
65
"errors"
76
"log"
87
"log/slog"
9-
"net"
8+
"net/http"
109
"os"
10+
"os/signal"
11+
"syscall"
1112

12-
"google.golang.org/grpc"
13-
"google.golang.org/grpc/credentials"
13+
"github.com/prometheus/client_golang/prometheus/promhttp"
14+
"k8s.io/client-go/kubernetes"
15+
"k8s.io/client-go/rest"
16+
"k8s.io/client-go/tools/clientcmd"
1417

18+
"github.com/containeroo/terrascaler/internal/autoscaler"
1519
"github.com/containeroo/terrascaler/internal/config"
16-
"github.com/containeroo/terrascaler/internal/externalgrpc"
1720
"github.com/containeroo/terrascaler/internal/gitlab"
18-
"github.com/containeroo/terrascaler/internal/provider"
1921
"github.com/containeroo/terrascaler/internal/terraform"
2022
)
2123

@@ -32,12 +34,31 @@ func main() {
3234
log.Fatalf("load config: %v", err)
3335
}
3436

37+
kubeConfig, err := loadKubeConfig(cfg.Kubeconfig)
38+
if err != nil {
39+
log.Fatalf("load Kubernetes config: %v", err)
40+
}
41+
kubeClient, err := kubernetes.NewForConfig(kubeConfig)
42+
if err != nil {
43+
log.Fatalf("create Kubernetes client: %v", err)
44+
}
45+
3546
gitlabClient, err := gitlab.New(gitlab.Config{
3647
BaseURL: cfg.GitLabBaseURL,
3748
Token: cfg.GitLabToken,
3849
Project: cfg.GitLabProject,
3950
Branch: cfg.GitLabBranch,
40-
File: cfg.FilePath,
51+
MR: gitlab.MergeRequestConfig{
52+
Enabled: cfg.GitLabMR.Enabled,
53+
BranchPrefix: cfg.GitLabMR.BranchPrefix,
54+
Title: cfg.GitLabMR.Title,
55+
Description: cfg.GitLabMR.Description,
56+
Labels: cfg.GitLabMR.Labels,
57+
AssigneeIDs: cfg.GitLabMR.AssigneeIDs,
58+
ReviewerIDs: cfg.GitLabMR.ReviewerIDs,
59+
RemoveSourceBranch: cfg.GitLabMR.RemoveSourceBranch,
60+
},
61+
File: cfg.FilePath,
4162
Target: terraform.Target{
4263
BlockType: cfg.BlockType,
4364
Labels: cfg.Labels,
@@ -48,68 +69,89 @@ func main() {
4869
log.Fatalf("create GitLab client: %v", err)
4970
}
5071

51-
listener, err := net.Listen("tcp", cfg.ListenAddress)
52-
if err != nil {
53-
log.Fatalf("listen on %s: %v", cfg.ListenAddress, err)
54-
}
55-
56-
serverOptions, err := grpcServerOptions(cfg)
72+
runner, err := autoscaler.NewRunner(cfg, kubeClient, gitlabClient, logger.With("component", "autoscaler"))
5773
if err != nil {
58-
log.Fatalf("configure gRPC server: %v", err)
74+
log.Fatalf("create autoscaler: %v", err)
5975
}
76+
runner.SetMetrics(autoscaler.NewMetrics(nil))
6077

61-
grpcServer := grpc.NewServer(serverOptions...)
62-
providerServer := provider.New(cfg, gitlabClient, logger.With("component", "provider"))
63-
externalgrpc.RegisterCloudProviderServer(grpcServer, providerServer)
78+
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
79+
defer stop()
6480

6581
logger.Info("starting terrascaler",
6682
"version", Version,
67-
"listen_address", cfg.ListenAddress,
68-
"node_group", cfg.NodeGroupID,
83+
"check_interval", cfg.CheckInterval.String(),
84+
"scale_up_cooldown", cfg.ScaleUpCooldown.String(),
85+
"pending_pod_min_age", cfg.PendingPodMinAge.String(),
86+
"metrics_address", cfg.MetricsAddress,
6987
"min_size", cfg.MinSize,
7088
"max_size", cfg.MaxSize,
89+
"dry_run", cfg.DryRun,
90+
"once", cfg.Once,
7191
"gitlab_base_url", cfg.GitLabBaseURL,
7292
"gitlab_project", cfg.GitLabProject,
7393
"gitlab_branch", cfg.GitLabBranch,
94+
"gitlab_merge_request", cfg.GitLabMR.Enabled,
95+
"gitlab_mr_branch_prefix", cfg.GitLabMR.BranchPrefix,
7496
"terraform_file", cfg.FilePath,
7597
"terraform_block_type", cfg.BlockType,
7698
"terraform_block_labels", cfg.Labels,
7799
"terraform_attribute", cfg.Attribute,
78-
"tls_enabled", cfg.TLSCertFile != "",
79-
"mtls_enabled", cfg.TLSClientCA != "",
100+
"node_selector", cfg.NodeSelector,
101+
"template_cpu", cfg.TemplateCPU,
102+
"template_memory", cfg.TemplateMemory,
103+
"template_pods", cfg.TemplatePods,
80104
)
81-
if err := grpcServer.Serve(listener); err != nil {
82-
log.Fatalf("serve gRPC: %v", err)
105+
106+
metricsServer := startMetricsServer(cfg.MetricsAddress, logger)
107+
if metricsServer != nil {
108+
defer func() {
109+
if err := metricsServer.Shutdown(context.Background()); err != nil {
110+
logger.Error("shutdown metrics server", "error", err)
111+
}
112+
}()
113+
}
114+
115+
if err := runner.Run(ctx); err != nil && !errors.Is(err, context.Canceled) {
116+
log.Fatalf("run autoscaler: %v", err)
83117
}
84118
}
85119

86-
func grpcServerOptions(cfg config.Config) ([]grpc.ServerOption, error) {
87-
if cfg.TLSCertFile == "" {
88-
return nil, nil
120+
func startMetricsServer(address string, logger *slog.Logger) *http.Server {
121+
if address == "" {
122+
logger.Info("metrics server disabled")
123+
return nil
89124
}
90125

91-
cert, err := tls.LoadX509KeyPair(cfg.TLSCertFile, cfg.TLSKeyFile)
92-
if err != nil {
93-
return nil, err
126+
mux := http.NewServeMux()
127+
mux.Handle("/metrics", promhttp.Handler())
128+
server := &http.Server{
129+
Addr: address,
130+
Handler: mux,
94131
}
132+
go func() {
133+
logger.Info("starting metrics server", "address", address)
134+
if err := server.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
135+
logger.Error("metrics server failed", "error", err)
136+
}
137+
}()
138+
return server
139+
}
95140

96-
tlsConfig := &tls.Config{
97-
MinVersion: tls.VersionTLS12,
98-
Certificates: []tls.Certificate{cert},
141+
func loadKubeConfig(path string) (*rest.Config, error) {
142+
if path != "" {
143+
return clientcmd.BuildConfigFromFlags("", path)
99144
}
100145

101-
if cfg.TLSClientCA != "" {
102-
caPEM, err := os.ReadFile(cfg.TLSClientCA)
103-
if err != nil {
104-
return nil, err
105-
}
106-
pool := x509.NewCertPool()
107-
if !pool.AppendCertsFromPEM(caPEM) {
108-
return nil, errors.New("failed to parse client CA")
109-
}
110-
tlsConfig.ClientAuth = tls.RequireAndVerifyClientCert
111-
tlsConfig.ClientCAs = pool
146+
config, err := rest.InClusterConfig()
147+
if err == nil {
148+
return config, nil
112149
}
113150

114-
return []grpc.ServerOption{grpc.Creds(credentials.NewTLS(tlsConfig))}, nil
151+
loadingRules := clientcmd.NewDefaultClientConfigLoadingRules()
152+
clientConfig := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
153+
loadingRules,
154+
&clientcmd.ConfigOverrides{},
155+
)
156+
return clientConfig.ClientConfig()
115157
}

0 commit comments

Comments
 (0)