Skip to content
This repository was archived by the owner on Aug 7, 2025. It is now read-only.

Commit 866a20b

Browse files
committed
metrics: report pod communication latency percentiles in pdf
- Add R routine to load and plot percentiles from JSON. - Support any number of percentiles 1..n. - Store percentile configuration in JSON. Signed-off-by: Antti Kervinen <antti.kervinen@intel.com>
1 parent b6c7cf1 commit 866a20b

5 files changed

Lines changed: 506 additions & 470 deletions

File tree

metrics/lib/common.bash

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ framework_init() {
8484
k8s_api_init
8585

8686
# Launch our stats gathering pod
87-
if [ -n "$SMF_USE_COLLECTD" ]; then
87+
if [ "$SMF_USE_COLLECTD" == "true" ]; then
8888
info "Setting up collectd"
8989
init_stats $wait_time
9090
fi
@@ -104,7 +104,7 @@ framework_shutdown() {
104104
k8s_api_shutdown
105105
cpu_load_shutdown
106106

107-
if [ -n "$SMF_USE_COLLECTD" ]; then
107+
if [ "$SMF_USE_COLLECTD" == "true" ]; then
108108
cleanup_stats
109109
fi
110110

metrics/report/report_dockerfile/metrics_report.Rmd

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,18 @@ source('collectd_scaling.R')
5151

5252
\pagebreak
5353

54+
# Pod communication latency
55+
This [test](https://github.com/clearlinux/cloud-native-setup/blob/master/metrics/scaling/k8s_scale_rapid_nc.sh)
56+
measures pod query--response latency when scaling up. The
57+
time is measured from sending a message directly to a socket, that `nc`
58+
listens to inside each pod, to reading the response from the pod.
59+
60+
```{r scaling_nc, echo=FALSE, fig.cap="K8S pod communication latency", results='asis'}
61+
source('tidy_scaling_nc.R')
62+
```
63+
64+
\pagebreak
65+
5466
# Test setup details
5567

5668
This table describes the test system details, as derived from the information contained
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#!/usr/bin/env Rscript
2+
# Copyright (c) 2019 Intel Corporation
3+
#
4+
# SPDX-License-Identifier: Apache-2.0
5+
6+
# Show pod communication latency
7+
8+
suppressMessages(suppressWarnings(library(ggplot2))) # ability to plot nicely.
9+
# Silence both start-up messages and warnings when attaching ggpubr, so they
# do not leak into the generated report.
suppressMessages(suppressWarnings(library(ggpubr))) # ggtexttable
10+
suppressMessages(library(jsonlite)) # to load the data.
11+
suppressMessages(library(scales)) # For de-science notation of axis
12+
library(tibble) # tibbles for tidy data
13+
14+
# Base names (without the ".json" suffix) of the result files this script
# evaluates in each results directory.
testnames <- c("k8s-rapid-nc")
17+
18+
### For developers: uncomment following variables to run this as is in R
19+
# resultdirs=c("PATH/TO/RES1/", ...) # keep the ending slash on result paths
20+
# inputdir=""
21+
22+
# Load the latency percentiles of every matching result file into one tidy
# table.
#
# Inputs (expected to be defined by the caller): resultdirs, inputdir and
# testnames.
# Outputs (used by the plotting and table code that follows):
#   latencydata - all per-file tibbles bound together row-wise
#   ltpt        - tibble built from the last successfully loaded file
#   ltp_perc    - percentile thresholds of the last successfully loaded file
#   perc_count  - number of percentile thresholds in ltp_perc
latencydata <- c()

# iterate over every set of results (test run)
for (currentdir in resultdirs) {
  matchdir <- paste0(inputdir, currentdir)
  # Derive the dataset name from the test result dirname
  datasetname <- basename(currentdir)

  # For every results file we are interested in evaluating
  for (testname in testnames) {
    matchfile <- paste0(testname, "\\.json")
    files <- list.files(matchdir, pattern = matchfile)

    # For every matching results file
    for (ffound in files) {
      fname <- paste0(matchdir, ffound)
      if (!file.exists(fname)) {
        warning(paste("Skipping non-existent file: ", fname))
        next
      }

      # Import the data
      fdata <- fromJSON(fname)
      # De-nest the test name specific data
      shortname <- substr(ffound, 1, nchar(ffound) - nchar(".json"))
      fdata <- fdata[[shortname]]

      # All the data we are looking for comes in BootResults,
      # so pick it out to make referencing easier
      br <- fdata$BootResults

      ########################################################
      #### Now extract latency time percentiles (ltp) ########
      ########################################################
      file_ltp <- br$latency_time$Percentiles
      # Percentile thresholds, for example [5, 25, 50, 75, 95]
      file_perc <- fdata$Config$nc_percentiles[[1]]

      # A result file without percentile configuration or measurements cannot
      # be turned into a table row; skip it instead of aborting the report.
      # The shared ltp_perc/perc_count values are only updated for usable files.
      if (length(file_perc) == 0 || length(file_ltp) == 0) {
        warning(paste("Skipping file without latency percentiles: ", fname))
        next
      }
      ltp <- file_ltp
      ltp_perc <- file_perc
      perc_count <- length(ltp_perc)

      # Measured times: one row per percentile threshold
      ltp_meas <- matrix(unlist(ltp), nrow = perc_count)

      # Build latency percentiles tibble with nice headings
      ltpt <- tibble(n_pods = br$n_pods$Result)
      for (n in seq_len(perc_count)) {
        p_title <- paste0("p", ltp_perc[n])
        ltpt[[p_title]] <- ltp_meas[n, ]
      }
      # ltpt example: with percentiles [5, 50, 95]:
      # n_pods p5 p50 p95
      # 100    4  8   10
      # 200    5  11  15
      # 300    6  14  19

      # Label the rows with the dataset (result directory) name. The loop
      # variable `testname` is intentionally left untouched.
      ltpt$testname <- datasetname
      latencydata <- rbind(latencydata, ltpt)
    }
  }
}
78+
79+
# Visualize data.
80+
# Build the latency plot `latp` from latencydata.
#
# Column layout relied upon below (as produced by the loading loop above):
# column 1 of ltpt is n_pods, columns 2..(perc_count + 1) hold the percentiles
# in their configured order and the last column is testname. Hence the i-th
# percentile is found in names(ltpt)[i + 1].
col_names <- names(ltpt)
perc_mid <- floor(perc_count / 2)
latp <- ggplot(data = latencydata, aes(x = n_pods)) +
  ylab("Latency (us)") +
  xlab("pods") +
  scale_y_continuous(labels = comma)

if (length(latencydata[[1]]) <= 5 || length(unique(latencydata$testname)) > 1) {
  # If there are many tests to compare or only few data points, use boxplot
  # with extra percentile points.
  box_group <- "interaction(testname,n_pods)"

  # Create boxplot around the middle percentile. With an even number of
  # percentiles the box is centered on the upper of the two middle ones.
  if (perc_count >= 3) {
    box_bottom <- col_names[perc_mid + 1]
    box_mid <- col_names[perc_mid + 2]
    box_top <- col_names[perc_mid + 3]
    if (perc_count >= 5) {
      whis_low <- col_names[perc_mid]
      whis_high <- col_names[perc_mid + 4]
    } else {
      # geom_boxplot(stat = "identity") requires ymin and ymax. With fewer
      # than 5 percentiles there is no data for whiskers, so collapse them
      # onto the box edges (zero-length whiskers).
      whis_low <- box_bottom
      whis_high <- box_top
    }
    latp <- latp + geom_boxplot(
      aes_string(
        group = box_group,
        ymin = whis_low,
        lower = box_bottom,
        middle = box_mid,
        upper = box_top,
        ymax = whis_high,
        fill = "testname"
      ),
      stat = "identity"
    )
  }

  # Boxplot (above) covers at most 5 percentiles around the center (median).
  # Visualize the rest using a point for each percentile. ceiling() makes sure
  # that with an even percentile count the extra low-side percentile is drawn
  # as well.
  if (perc_count > 5) {
    for (n in seq_len(ceiling((perc_count - 5) / 2))) {
      lower_name <- col_names[n + 1]
      upper_name <- col_names[perc_count - n + 2]
      latp <- latp + geom_point(
        aes_string(group = box_group, y = lower_name, color = "testname")
      )
      latp <- latp + geom_point(
        aes_string(group = box_group, y = upper_name, color = "testname")
      )
    }
  }
} else {
  # Use colored areas and median lines when there are many ticks on X axis
  for (plot_test in unique(latencydata$testname)) {
    test_rows <- latencydata[latencydata$testname == plot_test, ]

    # seq_len() yields an empty loop when there is a single percentile, in
    # which case there is no area to fill.
    for (n in seq_len(perc_mid)) {
      # First fill outmost areas, like p5..p25 and p75..p95,
      # then areas closer to the middle, like p25..p50 and p50..p75
      lower_name <- col_names[n + 1]
      lower_next_name <- col_names[n + 2]
      upper_name <- col_names[perc_count - n + 2]
      upper_prev_name <- col_names[perc_count - n + 1]
      # Areas closer to the middle are drawn more opaque
      fill_alpha <- 0.7 * ((n + 1) / (perc_mid + 1))^2
      latp <- latp + geom_ribbon(
        data = test_rows,
        aes_string(
          x = "n_pods",
          ymin = lower_name,
          ymax = lower_next_name,
          fill = "testname"
        ),
        alpha = fill_alpha
      )
      latp <- latp + geom_ribbon(
        data = test_rows,
        aes_string(
          x = "n_pods",
          ymin = upper_prev_name,
          ymax = upper_name,
          fill = "testname"
        ),
        alpha = fill_alpha
      )
    }

    median_index <- match("p50", col_names)
    if (!is.na(median_index)) {
      # Draw median line
      latp <- latp + geom_line(
        data = test_rows,
        aes_string(
          x = "n_pods",
          y = col_names[median_index],
          color = "testname"
        )
      )
    }
  }
}
133+
134+
# Table presentation.
135+
# Summarize each test with up to three rows of latencydata: the first, the
# middle and the last row of that test.
lat_table <- c()
for (testname in unique(latencydata$testname)) {
  testlines <- latencydata[latencydata$testname == testname, ]
  # Count rows with nrow(); length() of a data frame/tibble is its number of
  # columns and must not be used to index rows.
  n_rows <- nrow(testlines)
  # unique() collapses the picks when a test has fewer than three rows, so no
  # row is listed twice and no out-of-range row is requested.
  picked_rows <- unique(c(1L, (n_rows + 1L) %/% 2L, n_rows))
  lat_table <- rbind(lat_table, testlines[picked_rows, ])
}
latt <- ggtexttable(lat_table, rows = NULL)
149+
150+
# Print the percentile thresholds shown in the figure, each followed by an
# escaped percent sign.
perc_labels <- paste0(ltp_perc, "\\%")
cat("\n\nLatency percentiles illustrated in the Figure below: ", perc_labels, "\n\n")

# Stack the plot above the table in a single column.
# NOTE(review): grid.arrange() is provided by gridExtra, which this script
# does not load itself; presumably it is attached elsewhere in the report
# before this script is sourced - confirm.
page1 <- grid.arrange(latp, latt, ncol = 1)

# pagebreak, as the graphs overflow the page otherwise
cat("\n\n\\pagebreak\n")

0 commit comments

Comments
 (0)