cloudfoundry
diff --git a/‎Gemfile‎
Lines changed: 1 addition & 0 deletions b/‎Gemfile‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Gemfile.lock‎
Lines changed: 3 additions & 1 deletion b/‎Gemfile.lock‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎app/controllers/internal/metrics_controller.rb‎
Lines changed: 25 additions & 0 deletions b/‎app/controllers/internal/metrics_controller.rb‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎app/controllers/internal/staging_completion_controller.rb‎
Lines changed: 6 additions & 0 deletions b/‎app/controllers/internal/staging_completion_controller.rb‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎app/jobs/diego/sync.rb‎
Lines changed: 15 additions & 2 deletions b/‎app/jobs/diego/sync.rb‎
Lines changed: 15 additions & 2 deletions
diff --git a/‎lib/cloud_controller/dependency_locator.rb‎
Lines changed: 8 additions & 0 deletions b/‎lib/cloud_controller/dependency_locator.rb‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎lib/cloud_controller/deployment_updater/scheduler.rb‎
Lines changed: 13 additions & 5 deletions b/‎lib/cloud_controller/deployment_updater/scheduler.rb‎
Lines changed: 13 additions & 5 deletions
diff --git a/‎lib/cloud_controller/metrics/periodic_updater.rb‎
Lines changed: 5 additions & 5 deletions b/‎lib/cloud_controller/metrics/periodic_updater.rb‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎lib/cloud_controller/metrics/prometheus_updater.rb‎
Lines changed: 143 additions & 0 deletions b/‎lib/cloud_controller/metrics/prometheus_updater.rb‎
Lines changed: 143 additions & 0 deletions
diff --git a/‎lib/cloud_controller/metrics/request_metrics.rb‎
Lines changed: 13 additions & 2 deletions b/‎lib/cloud_controller/metrics/request_metrics.rb‎
Lines changed: 13 additions & 2 deletions
@@ -40,6 +40,7 @@ gem 'sequel_pg', require: 'sequel'
 gem 'sinatra', '~> 2.2'
 gem 'sinatra-contrib'
 gem 'statsd-ruby', '~> 1.4.0'
+gem 'prometheus-client'
 gem 'steno'
 gem 'talentbox-delayed_job_sequel', '~> 4.3.0'
 gem 'thin'
 
@@ -352,6 +352,7 @@ GEM
       ast (~> 2.4.1)
     pg (1.3.5)
     posix-spawn (0.3.15)
+    prometheus-client (3.0.0)
     protobuf (3.6.12)
       activesupport (>= 3.2)
       middleware
@@ -600,6 +601,7 @@ DEPENDENCIES
   parallel_tests
   pg
   posix-spawn (~> 0.3.15)
+  prometheus-client
   protobuf (= 3.6.12)
   pry-byebug
   psych (>= 4.0.4)
@@ -640,4 +642,4 @@ DEPENDENCIES
   yajl-ruby
 
 BUNDLED WITH
-   2.1.4
+   2.2.26
@@ -0,0 +1,25 @@
+require 'prometheus/client'
+require 'prometheus/client/formats/text'
+require 'cloud_controller/metrics/prometheus_updater'
+
+module VCAP::CloudController
+  module Internal
+    # Serves the metrics held in Prometheus::Client.registry, rendered by
+    # Prometheus::Client::Formats::Text, at GET /internal/v4/metrics.
+    class MetricsController < RestController::BaseController
+      # NOTE(review): no authentication is required for this endpoint — confirm
+      # the /internal routes are not reachable from outside the deployment.
+      allow_unauthenticated_access
+      get '/internal/v4/metrics', :index
+
+      # Runs PeriodicUpdater#update! and returns the registry contents as text.
+      #
+      # A new PeriodicUpdater is built per request, with the current UTC time as
+      # its start time and a fresh Steno::Sink::Counter as its log counter.
+      # NOTE(review): metrics derived from those two arguments presumably start
+      # from scratch on every scrape, and the StatsdUpdater in the list means a
+      # scrape also emits to statsd — confirm both are intended.
+      #
+      # @return [Array] HTTP status 200 followed by the marshalled registry
+      def index
+        periodic_updater = VCAP::CloudController::Metrics::PeriodicUpdater.new(
+          Time.now.utc,
+          Steno::Sink::Counter.new,
+          Steno.logger('cc.api'),
+          [
+            VCAP::CloudController::Metrics::StatsdUpdater.new,
+            VCAP::CloudController::Metrics::PrometheusUpdater.new
+          ])
+        periodic_updater.update!
+        [200, Prometheus::Client::Formats::Text.marshal(Prometheus::Client.registry)]
+      end
+    end
+  end
+end
@@ -117,15 +117,21 @@ def report_metrics(bbs_staging_response)
       duration = Time.now.utc.to_i * 1e9 - bbs_staging_response[:created_at]
       if bbs_staging_response[:failed]
         statsd_updater.report_staging_failure_metrics(duration)
+        prometheus_updater.report_staging_failure_metrics(duration)
       else
         statsd_updater.report_staging_success_metrics(duration)
+        prometheus_updater.report_staging_success_metrics(duration)
       end
     end
 
     def statsd_updater
       @statsd_updater ||= VCAP::CloudController::Metrics::StatsdUpdater.new
     end
 
+    # Memoized PrometheusUpdater used to report staging metrics.
+    # TODO: obtain the shared instance (e.g. via
+    # CloudController::DependencyLocator#prometheus_updater) instead of
+    # constructing a separate updater here.
+    def prometheus_updater
+      @prometheus_updater ||= VCAP::CloudController::Metrics::PrometheusUpdater.new
+    end
+
     attr_reader :stagers
 
     def read_body
 
@@ -7,15 +7,28 @@ module VCAP::CloudController
   module Jobs
     module Diego
       class Sync < VCAP::CloudController::Jobs::CCJob
-        def initialize(statsd=Statsd.new)
+        def initialize(statsd=Statsd.new, prometheus_updater=VCAP::CloudController::Metrics::PrometheusUpdater.new)
           @statsd = statsd
+          @prometheus_updater = prometheus_updater
         end
 
         def perform
+          # Runs Diego::ProcessesSync and Diego::TasksSync, then reports the elapsed
+          # time in milliseconds to statsd and Prometheus. The report sits in an
+          # ensure clause, so it is also sent when a sync raises.
           config = CloudController::DependencyLocator.instance.config
-          @statsd.time('cc.diego_sync.duration') do
+          # Durations are taken from the monotonic clock, which is unaffected by
+          # wall-clock adjustments, and read directly in (fractional) milliseconds.
+          start_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
+          begin
             VCAP::CloudController::Diego::ProcessesSync.new(config: config).sync
             VCAP::CloudController::Diego::TasksSync.new(config: config).sync
+          ensure
+            finish_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
+            elapsed_ms = (finish_ms - start_ms).round
+
+            @statsd.timing('cc.diego_sync.duration', elapsed_ms)
+            @prometheus_updater.report_diego_cell_sync_duration(elapsed_ms)
           end
         end
 
 
@@ -34,6 +34,7 @@
 require 'cloud_controller/opi/instances_client'
 require 'cloud_controller/opi/stager_client'
 require 'cloud_controller/opi/task_client'
+require 'cloud_controller/metrics/prometheus_updater'
 
 require 'bits_service_client'
 
@@ -70,6 +71,13 @@ def runners
       @dependencies[:runners] || register(:runners, VCAP::CloudController::Runners.new(config))
     end
 
+    # Returns the registered PrometheusUpdater, registering a new one on first
+    # use. Written in the same lookup-or-register form as the neighbouring
+    # accessors (runners, stagers), which rely on register returning the value.
+    def prometheus_updater
+      @dependencies[:prometheus_updater] || register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new)
+    end
+
     def stagers
       @dependencies[:stagers] || register(:stagers, VCAP::CloudController::Stagers.new(config))
     end
 
@@ -10,10 +10,12 @@ def start
           with_error_logging('cc.deployment_updater') do
             config = CloudController::DependencyLocator.instance.config
             statsd_client = CloudController::DependencyLocator.instance.statsd_client
+            prometheus_updater = CloudController::DependencyLocator.instance.prometheus_updater
 
             update_step = proc { update(
               update_frequency: config.get(:deployment_updater, :update_frequency_in_seconds),
-              statsd_client: statsd_client
+              statsd_client: statsd_client,
+              prometheus_updater: prometheus_updater
             )
             }
 
@@ -39,14 +41,20 @@ def start
 
         private
 
-        def update(update_frequency:, statsd_client:)
+        def update(update_frequency:, statsd_client:, prometheus_updater:)
           logger = Steno.logger('cc.deployment_updater.scheduler')
 
           update_start_time = Time.now
-          statsd_client.time('cc.deployments.update.duration') do
-            Dispatcher.dispatch
-          end
+          Dispatcher.dispatch
           update_duration = Time.now - update_start_time
+          ## NOTE: We're taking time in seconds and multiplying by 1000 because we don't have
+          ##       access to time in milliseconds. If you ever get access to reliable time in
+          ##       milliseconds, then do know that the lack of precision here is not desired
+          ##       so feed in the entire value!
+          update_duration_ms = update_duration * 1000
+          statsd_client.timing('cc.deployments.update.duration', update_duration_ms)
+          prometheus_updater.report_deployment_duration(update_duration_ms)
+
           logger.info("Update loop took #{update_duration}s")
 
           sleep_duration = update_frequency - update_duration
 
@@ -3,7 +3,7 @@
 
 module VCAP::CloudController::Metrics
   class PeriodicUpdater
-    def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpdater.new])
+    def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpdater.new, PrometheusUpdater.new])
       @start_time = start_time
       @updaters = updaters
       @log_counter = log_counter
@@ -15,7 +15,7 @@ def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpd
 
     def setup_updates
       update!
-      EM.add_periodic_timer(600) { catch_error { record_user_count } }
+      EM.add_periodic_timer(600) { catch_error { update_user_count } }
       EM.add_periodic_timer(30)  { catch_error { update_job_queue_length } }
       EM.add_periodic_timer(30)  { catch_error { update_thread_info } }
       EM.add_periodic_timer(30)  { catch_error { update_failed_job_count } }
@@ -26,7 +26,7 @@ def setup_updates
     end
 
     def update!
-      record_user_count
+      update_user_count
       update_job_queue_length
       update_thread_info
       update_failed_job_count
@@ -67,10 +67,10 @@ def update_deploying_count
       @updaters.each { |u| u.update_deploying_count(deploying_count) }
     end
 
-    def record_user_count
+    # Reads VCAP::CloudController::User.count and passes it to every updater in @updaters.
+    def update_user_count
       user_count = VCAP::CloudController::User.count
 
-      @updaters.each { |u| u.record_user_count(user_count) }
+      @updaters.each { |u| u.update_user_count(user_count) }
     end
 
     def update_job_queue_length
 
@@ -0,0 +1,143 @@
+require 'prometheus/client'
+
+module VCAP::CloudController::Metrics
+  # Publishes Cloud Controller metrics to a Prometheus registry.
+  #
+  # Metrics are registered lazily: each generic helper registers the metric the
+  # first time its name is seen and afterwards looks it up in the registry.
+  class PrometheusUpdater
+    # @param registry the registry metrics are registered in and fetched from;
+    #   defaults to Prometheus::Client.registry
+    def initialize(registry=Prometheus::Client.registry)
+      @registry = registry
+    end
+
+    # Sets the gauge +metric+ to +value+, registering it first (with +message+
+    # as its docstring) when the registry does not know it yet.
+    def update_gauge_metric(metric, value, message)
+      unless @registry.exist?(metric)
+        @registry.gauge(metric, docstring: message)
+      end
+      @registry.get(metric).set(value)
+    end
+
+    # Increments the gauge +metric+, registering it first when needed.
+    def increment_gauge_metric(metric, message)
+      unless @registry.exist?(metric)
+        @registry.gauge(metric, docstring: message)
+      end
+      @registry.get(metric).increment
+    end
+
+    # Decrements the gauge +metric+, registering it first when needed.
+    def decrement_gauge_metric(metric, message)
+      unless @registry.exist?(metric)
+        @registry.gauge(metric, docstring: message)
+      end
+      @registry.get(metric).decrement
+    end
+
+    # Increments the counter +metric+, registering it first when needed.
+    def increment_counter_metric(metric, message)
+      unless @registry.exist?(metric)
+        @registry.counter(metric, docstring: message)
+      end
+      @registry.get(metric).increment
+    end
+
+    # Observes +value+ on the histogram +metric+, registering it first (with the
+    # given +buckets+) when needed.
+    def update_histogram_metric(metric, value, message, buckets)
+      unless @registry.exist?(metric)
+        @registry.histogram(metric, buckets: buckets, docstring: message)
+      end
+      @registry.get(metric).observe(value)
+    end
+
+    # Observes +value+ on the summary +metric+, registering it first when needed.
+    def update_summary_metric(metric, value, message)
+      unless @registry.exist?(metric)
+        @registry.summary(metric, docstring: message)
+      end
+      @registry.get(metric).observe(value)
+    end
+
+    # Sets the gauge for the number of in-progress deployments.
+    def update_deploying_count(deploying_count)
+      update_gauge_metric(:cc_deployments_deploying, deploying_count, 'Number of in progress deployments')
+    end
+
+    # Sets the gauge for the number of users.
+    def update_user_count(user_count)
+      update_gauge_metric(:cc_total_users, user_count, 'Number of users')
+    end
+
+    # Sets one gauge per queue (named after the underscored queue key) and one
+    # gauge for the overall total.
+    #
+    # @param pending_job_count_by_queue [#each] pairs of queue key and pending job count
+    # @param total the total number of pending jobs
+    def update_job_queue_length(pending_job_count_by_queue, total)
+      pending_job_count_by_queue.each do |key, value|
+        metric_key = :"cc_job_queue_length_#{key.to_s.underscore}"
+        # The docstring must be a positional String: update_gauge_metric takes no
+        # keyword arguments, so `docstring: ...` would arrive as a Hash.
+        update_gauge_metric(metric_key, value, "Job queue length for worker #{key}")
+      end
+
+      update_gauge_metric(:cc_job_queue_length_total, total, 'Total job queue length')
+    end
+
+    # Sets the thread and EventMachine gauges.
+    #
+    # @param thread_info [Hash] read as thread_info[:thread_count] and
+    #   thread_info[:event_machine] with :connection_count plus :threadqueue and
+    #   :resultqueue, each holding :size and :num_waiting
+    def update_thread_info(thread_info)
+      update_gauge_metric(:cc_thread_info_thread_count, thread_info[:thread_count], 'Thread count')
+      update_gauge_metric(:cc_thread_info_event_machine_connection_count, thread_info[:event_machine][:connection_count], 'Event Machine connection count')
+      update_gauge_metric(:cc_thread_info_event_machine_threadqueue_size, thread_info[:event_machine][:threadqueue][:size], 'EventMachine thread queue size')
+      update_gauge_metric(:cc_thread_info_event_machine_threadqueue_num_waiting, thread_info[:event_machine][:threadqueue][:num_waiting], 'EventMachine num waiting in thread')
+      update_gauge_metric(:cc_thread_info_event_machine_resultqueue_size, thread_info[:event_machine][:resultqueue][:size], 'EventMachine queue size')
+      update_gauge_metric(:cc_thread_info_event_machine_resultqueue_num_waiting, thread_info[:event_machine][:resultqueue][:num_waiting], 'EventMachine requests waiting in queue')
+    end
+
+    # Sets one gauge per queue (named after the underscored queue key) and one
+    # gauge for the overall total of failed jobs.
+    def update_failed_job_count(failed_jobs_by_queue, total)
+      failed_jobs_by_queue.each do |key, value|
+        metric_key = :"cc_failed_job_count_#{key.to_s.underscore}"
+        update_gauge_metric(metric_key, value, "Failed jobs for worker #{key}")
+      end
+
+      update_gauge_metric(:cc_failed_job_count_total, total, 'Total failed jobs')
+    end
+
+    # Sets one gauge per entry of +vitals+, named cc_vitals_<underscored key>.
+    def update_vitals(vitals)
+      vitals.each do |key, value|
+        metric_key = :"cc_vitals_#{key.to_s.underscore}"
+        update_gauge_metric(metric_key, value, "CloudController Vitals: #{key}")
+      end
+    end
+
+    # Sets one gauge per entry of +counts+, named cc_log_count_<underscored key>.
+    def update_log_counts(counts)
+      counts.each do |key, value|
+        metric_key = :"cc_log_count_#{key.to_s.underscore}"
+        update_gauge_metric(metric_key, value, "Log count for log level '#{key}'")
+      end
+    end
+
+    # Sets the gauges for the number of running tasks and their total memory.
+    def update_task_stats(total_running_tasks, total_memory_in_mb)
+      update_gauge_metric(:cc_tasks_running_count, total_running_tasks, 'Total running tasks')
+      update_gauge_metric(:cc_tasks_running_memory_in_mb, total_memory_in_mb, 'Total memory consumed by running tasks')
+    end
+
+    # Sets the gauge for the number of invalid desired LRPs.
+    def update_synced_invalid_lrps(lrp_count)
+      update_gauge_metric(:cc_diego_sync_invalid_desired_lrps, lrp_count, 'Invalid Desired LRPs')
+    end
+
+    # Increments the counter of staging requests.
+    def start_staging_request_received
+      increment_counter_metric(:cc_staging_requested, 'Number of staging requests')
+    end
+
+    # Counts a successful staging and observes its duration, converted from
+    # nanoseconds to whole milliseconds, on the success histogram.
+    def report_staging_success_metrics(duration_ns)
+      increment_counter_metric(:cc_staging_succeeded, 'Number of successful staging events')
+      update_histogram_metric(:cc_staging_succeeded_duration, nanoseconds_to_milliseconds(duration_ns), 'Durations of successful staging events', duration_buckets)
+    end
+
+    # Counts a failed staging and observes its duration, converted from
+    # nanoseconds to whole milliseconds, on the failure histogram.
+    def report_staging_failure_metrics(duration_ns)
+      increment_counter_metric(:cc_staging_failed, 'Number of failed staging events')
+      update_histogram_metric(:cc_staging_failed_duration, nanoseconds_to_milliseconds(duration_ns), 'Durations of failed staging events', duration_buckets)
+    end
+
+    # Records +duration_ms+ both as a summary observation and as a gauge value.
+    def report_diego_cell_sync_duration(duration_ms)
+      update_summary_metric(:cc_diego_sync_duration, duration_ms, 'Diego cell sync duration')
+      update_gauge_metric(:cc_diego_sync_duration_gauge, duration_ms, 'Diego cell sync duration (gauge metric)')
+    end
+
+    # Records +duration_ms+ both as a summary observation and as a gauge value.
+    def report_deployment_duration(duration_ms)
+      update_summary_metric(:cc_deployments_update_duration, duration_ms, 'Deployment duration')
+      update_gauge_metric(:cc_deployments_update_duration_gauge, duration_ms, 'Deployment duration (gauge metric)')
+    end
+
+    private
+
+    # Buckets for the staging duration histograms, which observe milliseconds.
+    # NOTE(review): presumably five bounds from 10000 to 30000 in steps of 5000
+    # (10s to 30s) — confirm against Histogram.linear_buckets.
+    def duration_buckets
+      Prometheus::Client::Histogram.linear_buckets(start: 10000, width: 5000, count: 5)
+    end
+
+    # Converts nanoseconds to milliseconds, truncated to an Integer.
+    def nanoseconds_to_milliseconds(time_ns)
+      (time_ns / 1e6).to_i
+    end
+  end
+end
@@ -3,25 +3,36 @@
 module VCAP::CloudController
   module Metrics
     class RequestMetrics
-      def initialize(statsd=Statsd.new)
+      def initialize(statsd=Statsd.new, prometheus_updater=PrometheusUpdater.new)
         @counter = 0
         @statsd = statsd
+        @prometheus_updater = prometheus_updater
       end
 
       def start_request
         @counter += 1
         @statsd.gauge('cc.requests.outstanding.gauge', @counter)
         @statsd.increment 'cc.requests.outstanding'
+
+        @prometheus_updater.update_gauge_metric(:cc_requests_outstanding_gauge, @counter, 'Requests Outstanding Gauge')
+        @prometheus_updater.increment_gauge_metric(:cc_requests_outstanding, 'Requests Outstanding')
       end
 
       def complete_request(status)
+        # Records a finished request in statsd and Prometheus: lowers the
+        # outstanding-request values and bumps the completed and per-status-class
+        # totals. Only the first character of +status+ is used, so e.g. 204 is
+        # reported under "2XX".
+        http_status_code = "#{status.to_s[0]}XX"
+        http_status_metric = "cc.http_status.#{http_status_code}"
         @counter -= 1
         @statsd.gauge('cc.requests.outstanding.gauge', @counter)
         @statsd.batch do |batch|
           batch.decrement 'cc.requests.outstanding'
           batch.increment 'cc.requests.completed'
-          batch.increment "cc.http_status.#{status.to_s[0]}XX"
+          batch.increment http_status_metric
         end
+
+        @prometheus_updater.update_gauge_metric(:cc_requests_outstanding_gauge, @counter, 'Requests Outstanding Gauge')
+        @prometheus_updater.decrement_gauge_metric(:cc_requests_outstanding, 'Requests Outstanding')
+        # Completed and per-status totals only ever grow, so they are registered
+        # as Prometheus counters rather than gauges.
+        @prometheus_updater.increment_counter_metric(:cc_requests_completed, 'Requests Completed')
+        @prometheus_updater.increment_counter_metric(http_status_metric.gsub('.', '_').to_sym, "Times HTTP status #{http_status_code} have been received")
       end
     end
   end