Skip to content

Commit b278a04

Browse files
klakin-pivotalDavid AlvaradoMerricdeLauneymoleske
committed
Creates metrics endpoint for use by prom-scraper
Notable changes in this commit: * Adds an `/internal/v4/metrics` endpoint which serves Prometheus-style metrics. * Adds function to Prometheus::Client::Registry to re-create the library's metrics registry. This is used by the PrometheusClient class's tests to wipe out the metrics registry, which is global state. Global state is the #1 cause of test pollution and this change seemed to be the most straightforward way to eliminate some order-dependent test failures. NOTE: This function definition is only in the `prometheus_updater_spec.rb` file, which we believe is not shipped in capi-release, so the odds of actually writing production code that depends on this test-only functionality are pretty slim. Co-authored-by: David Alvarado <alvaradoda@vmware.com> Co-authored-by: Kenneth Lakin <klakin@vmware.com> Co-authored-by: Merric de Launey <mdelauney@pivotal.io> Co-authored-by: Michael Oleske <moleske@pivotal.io>
1 parent 81b5b65 commit b278a04

File tree

20 files changed

+820
-55
lines changed

20 files changed

+820
-55
lines changed

Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ gem 'sequel_pg', require: 'sequel'
4040
gem 'sinatra', '~> 2.2'
4141
gem 'sinatra-contrib'
4242
gem 'statsd-ruby', '~> 1.4.0'
43+
gem 'prometheus-client'
4344
gem 'steno'
4445
gem 'talentbox-delayed_job_sequel', '~> 4.3.0'
4546
gem 'thin'

Gemfile.lock

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,7 @@ GEM
352352
ast (~> 2.4.1)
353353
pg (1.3.5)
354354
posix-spawn (0.3.15)
355+
prometheus-client (3.0.0)
355356
protobuf (3.6.12)
356357
activesupport (>= 3.2)
357358
middleware
@@ -600,6 +601,7 @@ DEPENDENCIES
600601
parallel_tests
601602
pg
602603
posix-spawn (~> 0.3.15)
604+
prometheus-client
603605
protobuf (= 3.6.12)
604606
pry-byebug
605607
psych (>= 4.0.4)
@@ -640,4 +642,4 @@ DEPENDENCIES
640642
yajl-ruby
641643

642644
BUNDLED WITH
643-
2.1.4
645+
2.2.26
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
require 'prometheus/client'
2+
require 'prometheus/client/formats/text'
3+
require 'cloud_controller/metrics/prometheus_updater'
4+
5+
module VCAP::CloudController
  module Internal
    # Serves Cloud Controller metrics in the Prometheus text exposition format.
    class MetricsController < RestController::BaseController
      allow_unauthenticated_access
      get '/internal/v4/metrics', :index

      # Refreshes every periodically-collected metric, then renders the
      # contents of the global Prometheus registry.
      #
      # NOTE(review): a new PeriodicUpdater is built on every request with a
      # fresh start time and a fresh Steno log counter, and a StatsdUpdater is
      # refreshed alongside the Prometheus one. Values PeriodicUpdater derives
      # from the start time / log counter presumably only cover this request;
      # confirm that this is intended.
      #
      # @return [Array] a two-element response: HTTP status 200 and the
      #   marshalled registry text
      def index
        started_at = Time.now.utc
        log_counter = Steno::Sink::Counter.new
        steno_logger = Steno.logger('cc.api')
        updaters = [
          VCAP::CloudController::Metrics::StatsdUpdater.new,
          VCAP::CloudController::Metrics::PrometheusUpdater.new
        ]

        VCAP::CloudController::Metrics::PeriodicUpdater.new(started_at, log_counter, steno_logger, updaters).update!

        body = Prometheus::Client::Formats::Text.marshal(Prometheus::Client.registry)
        [200, body]
      end
    end
  end
end

app/controllers/internal/staging_completion_controller.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,15 +117,21 @@ def report_metrics(bbs_staging_response)
117117
duration = Time.now.utc.to_i * 1e9 - bbs_staging_response[:created_at]
118118
if bbs_staging_response[:failed]
119119
statsd_updater.report_staging_failure_metrics(duration)
120+
prometheus_updater.report_staging_failure_metrics(duration)
120121
else
121122
statsd_updater.report_staging_success_metrics(duration)
123+
prometheus_updater.report_staging_success_metrics(duration)
122124
end
123125
end
124126

125127
def statsd_updater
126128
@statsd_updater ||= VCAP::CloudController::Metrics::StatsdUpdater.new
127129
end
128130

131+
# Returns the PrometheusUpdater used to report staging metrics.
#
# The updater is obtained from the DependencyLocator so that this controller
# shares the same instance as the rest of the process instead of building
# its own; the result is memoized on the controller instance.
def prometheus_updater
  @prometheus_updater ||= CloudController::DependencyLocator.instance.prometheus_updater
end
134+
129135
attr_reader :stagers
130136

131137
def read_body

app/jobs/diego/sync.rb

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,28 @@ module VCAP::CloudController
77
module Jobs
88
module Diego
99
class Sync < VCAP::CloudController::Jobs::CCJob
10-
def initialize(statsd=Statsd.new)
10+
def initialize(statsd=Statsd.new, prometheus_updater=VCAP::CloudController::Metrics::PrometheusUpdater.new)
1111
@statsd = statsd
12+
@prometheus_updater = prometheus_updater
1213
end
1314

1415
def perform
1516
config = CloudController::DependencyLocator.instance.config
16-
@statsd.time('cc.diego_sync.duration') do
17+
begin
18+
## TODO: At some point in the future, start using a monotonic time source, rather than wall-clock time!
19+
start = Time.now
1720
VCAP::CloudController::Diego::ProcessesSync.new(config: config).sync
1821
VCAP::CloudController::Diego::TasksSync.new(config: config).sync
22+
ensure
23+
finish = Time.now
24+
## NOTE: We're taking time in seconds and multiplying by 1000 because we don't have
25+
## access to time in milliseconds. If you ever get access to reliable time in
26+
## milliseconds, then do know that the lack of precision here is not desired
27+
## so feed in the entire value!
28+
elapsed_ms = ((finish - start) * 1000).round
29+
30+
@statsd.timing('cc.diego_sync.duration', elapsed_ms)
31+
@prometheus_updater.report_diego_cell_sync_duration(elapsed_ms)
1932
end
2033
end
2134

lib/cloud_controller/dependency_locator.rb

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
require 'cloud_controller/opi/instances_client'
3535
require 'cloud_controller/opi/stager_client'
3636
require 'cloud_controller/opi/task_client'
37+
require 'cloud_controller/metrics/prometheus_updater'
3738

3839
require 'bits_service_client'
3940

@@ -70,6 +71,13 @@ def runners
7071
@dependencies[:runners] || register(:runners, VCAP::CloudController::Runners.new(config))
7172
end
7273

74+
# Returns the shared PrometheusUpdater, creating and registering it the
# first time it is requested. Written in the same
# `@dependencies[...] || register(...)` form as the neighbouring accessors,
# which rely on `register` returning the object it registers.
def prometheus_updater
  @dependencies[:prometheus_updater] || register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new)
end
80+
7381
def stagers
7482
@dependencies[:stagers] || register(:stagers, VCAP::CloudController::Stagers.new(config))
7583
end

lib/cloud_controller/deployment_updater/scheduler.rb

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@ def start
1010
with_error_logging('cc.deployment_updater') do
1111
config = CloudController::DependencyLocator.instance.config
1212
statsd_client = CloudController::DependencyLocator.instance.statsd_client
13+
prometheus_updater = CloudController::DependencyLocator.instance.prometheus_updater
1314

1415
update_step = proc { update(
1516
update_frequency: config.get(:deployment_updater, :update_frequency_in_seconds),
16-
statsd_client: statsd_client
17+
statsd_client: statsd_client,
18+
prometheus_updater: prometheus_updater
1719
)
1820
}
1921

@@ -39,14 +41,20 @@ def start
3941

4042
private
4143

42-
def update(update_frequency:, statsd_client:)
44+
def update(update_frequency:, statsd_client:, prometheus_updater:)
4345
logger = Steno.logger('cc.deployment_updater.scheduler')
4446

4547
update_start_time = Time.now
46-
statsd_client.time('cc.deployments.update.duration') do
47-
Dispatcher.dispatch
48-
end
48+
Dispatcher.dispatch
4949
update_duration = Time.now - update_start_time
50+
## NOTE: We're taking time in seconds and multiplying by 1000 because we don't have
51+
## access to time in milliseconds. If you ever get access to reliable time in
52+
## milliseconds, then do know that the lack of precision here is not desired
53+
## so feed in the entire value!
54+
update_duration_ms = update_duration * 1000
55+
statsd_client.timing('cc.deployments.update.duration', update_duration_ms)
56+
prometheus_updater.report_deployment_duration(update_duration_ms)
57+
5058
logger.info("Update loop took #{update_duration}s")
5159

5260
sleep_duration = update_frequency - update_duration

lib/cloud_controller/metrics/periodic_updater.rb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
module VCAP::CloudController::Metrics
55
class PeriodicUpdater
6-
def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpdater.new])
6+
def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpdater.new, PrometheusUpdater.new])
77
@start_time = start_time
88
@updaters = updaters
99
@log_counter = log_counter
@@ -15,7 +15,7 @@ def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpd
1515

1616
def setup_updates
1717
update!
18-
EM.add_periodic_timer(600) { catch_error { record_user_count } }
18+
EM.add_periodic_timer(600) { catch_error { update_user_count } }
1919
EM.add_periodic_timer(30) { catch_error { update_job_queue_length } }
2020
EM.add_periodic_timer(30) { catch_error { update_thread_info } }
2121
EM.add_periodic_timer(30) { catch_error { update_failed_job_count } }
@@ -26,7 +26,7 @@ def setup_updates
2626
end
2727

2828
def update!
29-
record_user_count
29+
update_user_count
3030
update_job_queue_length
3131
update_thread_info
3232
update_failed_job_count
@@ -67,10 +67,10 @@ def update_deploying_count
6767
@updaters.each { |u| u.update_deploying_count(deploying_count) }
6868
end
6969

70-
def record_user_count
70+
def update_user_count
7171
user_count = VCAP::CloudController::User.count
7272

73-
@updaters.each { |u| u.record_user_count(user_count) }
73+
@updaters.each { |u| u.update_user_count(user_count) }
7474
end
7575

7676
def update_job_queue_length
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
require 'prometheus/client'
2+
3+
module VCAP::CloudController::Metrics
4+
# Publishes Cloud Controller metrics into a Prometheus client registry.
#
# Metrics are registered lazily: the first time a metric name is used it is
# created in the registry with the supplied docstring; later calls look the
# metric up by name and update it.
class PrometheusUpdater
  # @param registry the registry to publish into; defaults to the
  #   prometheus-client library's global registry.
  def initialize(registry=Prometheus::Client.registry)
    @registry = registry
  end

  # Sets the gauge +metric+ to +value+, registering it with docstring
  # +message+ if it does not exist yet.
  def update_gauge_metric(metric, value, message)
    find_or_register(metric) { @registry.gauge(metric, docstring: message) }.set(value)
  end

  # Increments the gauge +metric+, registering it with docstring +message+
  # if it does not exist yet.
  def increment_gauge_metric(metric, message)
    find_or_register(metric) { @registry.gauge(metric, docstring: message) }.increment
  end

  # Decrements the gauge +metric+, registering it with docstring +message+
  # if it does not exist yet.
  def decrement_gauge_metric(metric, message)
    find_or_register(metric) { @registry.gauge(metric, docstring: message) }.decrement
  end

  # Increments the counter +metric+, registering it with docstring +message+
  # if it does not exist yet.
  def increment_counter_metric(metric, message)
    find_or_register(metric) { @registry.counter(metric, docstring: message) }.increment
  end

  # Records +value+ in the histogram +metric+, registering it with the given
  # +buckets+ and docstring +message+ if it does not exist yet.
  def update_histogram_metric(metric, value, message, buckets)
    find_or_register(metric) { @registry.histogram(metric, buckets: buckets, docstring: message) }.observe(value)
  end

  # Records +value+ in the summary +metric+, registering it with docstring
  # +message+ if it does not exist yet.
  def update_summary_metric(metric, value, message)
    find_or_register(metric) { @registry.summary(metric, docstring: message) }.observe(value)
  end

  def update_deploying_count(deploying_count)
    update_gauge_metric(:cc_deployments_deploying, deploying_count, 'Number of in progress deployments')
  end

  def update_user_count(user_count)
    update_gauge_metric(:cc_total_users, user_count, 'Number of users')
  end

  # Publishes one gauge per queue (named after the underscored queue key)
  # plus a gauge for the overall total.
  #
  # NOTE(review): relies on String#underscore, which is not defined in this
  # file (presumably ActiveSupport) -- confirm it is always loaded.
  def update_job_queue_length(pending_job_count_by_queue, total)
    pending_job_count_by_queue.each do |key, value|
      metric_key = :"cc_job_queue_length_#{key.to_s.underscore}"
      # The docstring is passed as a plain String, like every other caller of
      # update_gauge_metric; wrapping it in a `docstring:` hash here would
      # register the metric with a Hash as its docstring.
      update_gauge_metric(metric_key, value, "Job queue length for worker #{key}")
    end

    update_gauge_metric(:cc_job_queue_length_total, total, 'Total job queue length')
  end

  # Publishes the thread count and the EventMachine connection / queue
  # figures read from the nested +thread_info+ hash.
  def update_thread_info(thread_info)
    event_machine = thread_info[:event_machine]
    threadqueue = event_machine[:threadqueue]
    resultqueue = event_machine[:resultqueue]

    update_gauge_metric(:cc_thread_info_thread_count, thread_info[:thread_count], 'Thread count')
    update_gauge_metric(:cc_thread_info_event_machine_connection_count, event_machine[:connection_count], 'Event Machine connection count')
    update_gauge_metric(:cc_thread_info_event_machine_threadqueue_size, threadqueue[:size], 'EventMachine thread queue size')
    update_gauge_metric(:cc_thread_info_event_machine_threadqueue_num_waiting, threadqueue[:num_waiting], 'EventMachine num waiting in thread')
    update_gauge_metric(:cc_thread_info_event_machine_resultqueue_size, resultqueue[:size], 'EventMachine queue size')
    update_gauge_metric(:cc_thread_info_event_machine_resultqueue_num_waiting, resultqueue[:num_waiting], 'EventMachine requests waiting in queue')
  end

  # Publishes one gauge per queue (named after the underscored queue key)
  # plus a gauge for the overall total.
  def update_failed_job_count(failed_jobs_by_queue, total)
    failed_jobs_by_queue.each do |key, value|
      metric_key = :"cc_failed_job_count_#{key.to_s.underscore}"
      update_gauge_metric(metric_key, value, "Failed jobs for worker #{key}")
    end

    update_gauge_metric(:cc_failed_job_count_total, total, 'Total failed jobs')
  end

  # Publishes each entry of +vitals+ as a `cc_vitals_<key>` gauge.
  def update_vitals(vitals)
    vitals.each do |key, value|
      metric_key = :"cc_vitals_#{key.to_s.underscore}"
      update_gauge_metric(metric_key, value, "CloudController Vitals: #{key}")
    end
  end

  # Publishes each entry of +counts+ as a `cc_log_count_<key>` gauge.
  def update_log_counts(counts)
    counts.each do |key, value|
      metric_key = :"cc_log_count_#{key.to_s.underscore}"
      update_gauge_metric(metric_key, value, "Log count for log level '#{key}'")
    end
  end

  def update_task_stats(total_running_tasks, total_memory_in_mb)
    update_gauge_metric(:cc_tasks_running_count, total_running_tasks, 'Total running tasks')
    update_gauge_metric(:cc_tasks_running_memory_in_mb, total_memory_in_mb, 'Total memory consumed by running tasks')
  end

  def update_synced_invalid_lrps(lrp_count)
    update_gauge_metric(:cc_diego_sync_invalid_desired_lrps, lrp_count, 'Invalid Desired LRPs')
  end

  def start_staging_request_received
    increment_counter_metric(:cc_staging_requested, 'Number of staging requests')
  end

  # Counts a successful staging event and records its duration, converted
  # from nanoseconds to whole milliseconds, in a histogram.
  def report_staging_success_metrics(duration_ns)
    increment_counter_metric(:cc_staging_succeeded, 'Number of successful staging events')
    update_histogram_metric(:cc_staging_succeeded_duration, nanoseconds_to_milliseconds(duration_ns), 'Durations of successful staging events', duration_buckets)
  end

  # Counts a failed staging event and records its duration, converted from
  # nanoseconds to whole milliseconds, in a histogram.
  def report_staging_failure_metrics(duration_ns)
    increment_counter_metric(:cc_staging_failed, 'Number of failed staging events')
    update_histogram_metric(:cc_staging_failed_duration, nanoseconds_to_milliseconds(duration_ns), 'Durations of failed staging events', duration_buckets)
  end

  # Records +duration_ms+ both as a summary observation and as a gauge.
  def report_diego_cell_sync_duration(duration_ms)
    update_summary_metric(:cc_diego_sync_duration, duration_ms, 'Diego cell sync duration')
    update_gauge_metric(:cc_diego_sync_duration_gauge, duration_ms, 'Diego cell sync duration (gauge metric)')
  end

  # Records +duration_ms+ both as a summary observation and as a gauge.
  def report_deployment_duration(duration_ms)
    update_summary_metric(:cc_deployments_update_duration, duration_ms, 'Deployment duration')
    update_gauge_metric(:cc_deployments_update_duration_gauge, duration_ms, 'Deployment duration (gauge metric)')
  end

  private

  # Runs the given registration block when +metric+ is not yet in the
  # registry, then returns the registered metric object.
  def find_or_register(metric)
    yield unless @registry.exist?(metric)
    @registry.get(metric)
  end

  # Histogram buckets for staging durations: five linear buckets starting at
  # 10000 with a width of 5000 (the observed values are milliseconds).
  def duration_buckets
    Prometheus::Client::Histogram.linear_buckets(start: 10000, width: 5000, count: 5)
  end

  # Converts nanoseconds to milliseconds, truncating to an integer.
  def nanoseconds_to_milliseconds(time_ns)
    (time_ns / 1e6).to_i
  end
end
143+
end

lib/cloud_controller/metrics/request_metrics.rb

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,36 @@
33
module VCAP::CloudController
44
module Metrics
55
class RequestMetrics
6-
def initialize(statsd=Statsd.new)
6+
def initialize(statsd=Statsd.new, prometheus_updater=PrometheusUpdater.new)
77
@counter = 0
88
@statsd = statsd
9+
@prometheus_updater = prometheus_updater
910
end
1011

1112
def start_request
1213
@counter += 1
1314
@statsd.gauge('cc.requests.outstanding.gauge', @counter)
1415
@statsd.increment 'cc.requests.outstanding'
16+
17+
@prometheus_updater.update_gauge_metric(:cc_requests_outstanding_gauge, @counter, 'Requests Outstanding Gauge')
18+
@prometheus_updater.increment_gauge_metric(:cc_requests_outstanding, 'Requests Outstanding')
1519
end
1620

1721
def complete_request(status)
22+
http_status_code = "#{status.to_s[0]}XX"
23+
http_status_metric = "cc.http_status.#{http_status_code}"
1824
@counter -= 1
1925
@statsd.gauge('cc.requests.outstanding.gauge', @counter)
2026
@statsd.batch do |batch|
2127
batch.decrement 'cc.requests.outstanding'
2228
batch.increment 'cc.requests.completed'
23-
batch.increment "cc.http_status.#{status.to_s[0]}XX"
29+
batch.increment http_status_metric
2430
end
31+
32+
@prometheus_updater.update_gauge_metric(:cc_requests_outstanding_gauge, @counter, 'Requests Outstanding Gauge')
33+
@prometheus_updater.decrement_gauge_metric(:cc_requests_outstanding, 'Requests Outstanding')
34+
@prometheus_updater.increment_gauge_metric(:cc_requests_completed, 'Requests Completed')
35+
@prometheus_updater.increment_gauge_metric(http_status_metric.gsub('.', '_').to_sym, "Times HTTP status #{http_status_code} have been received")
2536
end
2637
end
2738
end

0 commit comments

Comments
 (0)