Skip to content
This repository was archived by the owner on Aug 7, 2025. It is now read-only.

Commit b3c1aee

Browse files
committed
metrics: report pod communication latency percentiles in pdf
- Add R routine to load and plot percentiles from JSON.
- Support any number of percentiles 1..n.
- Store percentile configuration in JSON.
- Fix handling percentile value 100.

Signed-off-by: Antti Kervinen <antti.kervinen@intel.com>
1 parent a7f92a9 commit b3c1aee

3 files changed

Lines changed: 128 additions & 9 deletions

File tree

metrics/report/report_dockerfile/metrics_report.Rmd

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,18 @@ source('parallel.R')
3838

3939
\pagebreak
4040

41+
# Pod communication latency
42+
This [test](https://github.com/clearlinux/cloud-native-setup/blob/master/metrics/scaling/k8s_scale_nc.sh)
43+
measures pod query--response latency when scaling up. The
44+
time is measured from sending a message directly to a socket that `nc`
45+
listens to inside each pod, to reading the response from the pod.
46+
47+
```{r scaling_nc, echo=FALSE, fig.cap="K8S pod communication latency", results='asis'}
48+
source('tidy_scaling_nc.R')
49+
```
50+
51+
\pagebreak
52+
4153
# Test setup details
4254

4355
This table describes the test system details, as derived from the information contained
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#!/usr/bin/env Rscript
2+
# Copyright (c) 2019 Intel Corporation
3+
#
4+
# SPDX-License-Identifier: Apache-2.0
5+
6+
# Show pod communication latency
7+
8+
suppressMessages(suppressWarnings(library(ggplot2))) # ability to plot nicely.
9+
suppressMessages(library(jsonlite)) # to load the data.
10+
library(tibble) # tibbles for tidy data
11+
12+
testnames=c(
13+
"k8s-scaling-nc.*"
14+
)
15+
16+
### For developers: uncomment the following variables to run this as-is in R
17+
# resultdirs=c("")
18+
# inputdir="PATH/TO/DIR/CONTAINING/testnames/WITH/ENDING/SLASH/"
19+
20+
# iterate over every set of results (test run)
21+
for (currentdir in resultdirs) {
22+
# For every results file we are interested in evaluating
23+
for (testname in testnames) {
24+
matchdir=paste(inputdir, currentdir, sep="")
25+
matchfile=paste(testname, '\\.json', sep="")
26+
files=list.files(matchdir, pattern=matchfile)
27+
28+
# For every matching results file
29+
for (ffound in files) {
30+
fname=paste(inputdir, currentdir, ffound, sep="")
31+
if (!file.exists(fname)) {
32+
warning(paste("Skipping non-existent file: ", fname))
33+
next
34+
}
35+
36+
# Derive the name from the test result dirname
37+
datasetname=basename(currentdir)
38+
39+
# Import the data
40+
fdata=fromJSON(fname)
41+
# De-nest the test name specific data
42+
shortname=substr(ffound, 1, nchar(ffound)-nchar(".json"))
43+
fdata=fdata[[shortname]]
44+
testname=datasetname
45+
46+
# All the data we are looking for comes in BootResults,
47+
# so pick it out to make referencing easier
48+
br=fdata$BootResults
49+
50+
########################################################
51+
#### Now extract latency time percentiles (ltp) ########
52+
########################################################
53+
ltp=br$latency_time$Percentiles
54+
# Percentile thresholds, for example [5, 25, 50, 75, 95]
55+
ltp_perc=ltp[[1]]
56+
perc_count = length(ltp_perc)
57+
# Measured times
58+
ltp_meas=matrix(unlist(ltp[c(2:length(ltp))]), nrow=perc_count)
59+
# Build latency percentiles tibble with nice headings
60+
ltpt=tibble(n_pods=br$n_pods$Result[c(2:length(br$n_pods$Result))])
61+
for (n in seq(perc_count)) {
62+
p_title = paste0("p", ltp_perc[n])
63+
ltpt[p_title] = ltp_meas[n,]
64+
}
65+
# ltpt example: with percentiles [5, 50, 95]:
66+
# n_pods p5 p50 p95
67+
# 100 4 8 10
68+
# 200 5 11 15
69+
# 300 6 14 19
70+
}
71+
}
72+
}
73+
74+
########## Output pod communication latency page ##############
75+
ltpp = ggplot(data=ltpt, aes(x=n_pods)) + ylab("Latency (ms)") + xlab("pods")
76+
# Highlight the middle percentile (usually median)
77+
# and symmetrically de-emphasize the other percentile lines
78+
perc_mid = floor((perc_count+1)/2)
79+
perc_maxdist = perc_mid - 1
80+
for (n in seq(perc_count)) {
81+
# The sparser the dots the farther away the line is from the middle
82+
perc_dist = abs(n-perc_mid)
83+
if (perc_dist != 0) {
84+
perc_linetype = paste0(2*(1+perc_maxdist-perc_dist), perc_dist+1)
85+
} else {
86+
perc_linetype = "solid"
87+
}
88+
ltpp = ltpp + geom_line(
89+
aes_string(y=names(ltpt)[n+1]),
90+
alpha=1.0 - 0.4 * (perc_dist/perc_maxdist),
91+
linetype=perc_linetype,
92+
color="blue")
93+
}
94+
95+
cat("\n\nLatency percentiles illustrated in the Figure below: ", paste0(ltp_perc, "\\%"), "\n\n")
96+
97+
page1 = grid.arrange(ltpp, ncol=1)
98+
99+
# pagebreak, as the graphs overflow the page otherwise
100+
cat("\n\n\\pagebreak\n")

metrics/scaling/k8s_scale_nc.sh

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ nc_req_msg_len=${nc_req_msg_len:-1000}
2222
nc_port=33101
2323
# request message
2424
nc_req_msg=$(head -c $nc_req_msg_len /dev/zero | tr '\0' 'x')
25+
nc_percentiles=(5 25 50 75 95)
2526

2627
pod_command="[\"nc\", \"-lk\", \"-p\", \"${nc_port}\", \"-e\", \"/bin/cat\"]"
2728

@@ -276,7 +277,7 @@ run() {
276277
metrics_json_start_array
277278

278279
# grab starting stats before launching workload pods
279-
grab_stats 0 0 0
280+
grab_stats 0 0 ${nc_percentiles[@]}
280281

281282
for reqs in $(seq ${STEP} ${STEP} ${NUM_PODS}); do
282283
info "Testing replicas ${reqs} of ${NUM_PODS}"
@@ -360,16 +361,22 @@ run() {
360361
unset IFS
361362
local latency_pod_array_len=${#latency_pod_array[@]}
362363
local latency_percentiles=()
363-
latency_percentiles+=(${latency_pod_array_sorted[$(bc <<<"$latency_pod_array_len / 20")]})
364-
latency_percentiles+=(${latency_pod_array_sorted[$(bc <<<"$latency_pod_array_len / 4")]})
365-
latency_percentiles+=(${latency_pod_array_sorted[$(bc <<<"$latency_pod_array_len / 2")]})
366-
latency_percentiles+=(${latency_pod_array_sorted[$(bc <<<"$latency_pod_array_len / 1.25")]})
367-
latency_percentiles+=(${latency_pod_array_sorted[$(bc <<<"$latency_pod_array_len / 1.05")]})
368-
info "Latency percentiles [ms] 5-25-50-75-95 %: ${latency_percentiles[*]}"
364+
for p in ${nc_percentiles[@]}; do
365+
if [[ $p -lt 100 ]]; then
366+
latency_percentiles+=(${latency_pod_array_sorted[$(bc <<<"$latency_pod_array_len * $p / 100")]})
367+
else
368+
# Asking for a value that is greater than or equal to 100 % of the measured values.
369+
# This is the way to save the maximum value.
370+
latency_percentiles+=(${latency_pod_array_sorted[$(bc <<<"$latency_pod_array_len - 1")]})
371+
fi
372+
done
373+
info "Latency percentiles [ms] ${nc_percentiles[@]} %: ${latency_percentiles[@]}"
369374
else
370375
local latency_avg_ms=0
371-
local latency_percentiles=(0 0 0 0 0)
372-
376+
local latency_percentiles=()
377+
for p in ${nc_percentiles[@]}; do
378+
latency_percentiles+=(0)
379+
done
373380
fi
374381

375382
grab_stats $total_milliseconds $reqs ${latency_percentiles[@]}

0 commit comments

Comments
 (0)