Skip to content

Commit 6ab9727

Browse files
committed
refactor: Improve Zipf fitting calculation in popularity analysis
* Updated the calculation of Zipf alpha to use a more robust linear regression method.
* Enhanced logging for regression failures with specific error messages.
* Cleaned up the code for calculating log frequencies and ranks for better readability.
1 parent 8c0f681 commit 6ab9727

4 files changed

Lines changed: 28 additions & 24 deletions

File tree

doc/quickstart_traceAnalyzer.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ The trace analyzer will generate statistics of the trace and save them to `stat`
6262
write: 0(0), overwrite: 0(0), del:0(0)
6363
request rate min 1753.7533 req/s, max 1986.3433 req/s, window 300s
6464
object rate min 300.3567 obj/s, max 319.8633 obj/s, window 300s
65-
popularity: Zipf linear fitting slope=0.9472
65+
popularity: Zipf alpha=0.9472, R2=0.97
6666
X-hit (number of obj accessed X times): 323699(0.3606), 218436(0.2433), 51516(0.0574), 128181(0.1428), 48785(0.0543), 25172(0.0280), 14606(0.0163), 14769(0.0165),
6767
freq (fraction) of the most popular obj: 546563(0.0547), 365140(0.0365), 221311(0.0221), 190811(0.0191), 154037(0.0154), 151832(0.0152), 127070(0.0127), 98851(0.0099),
6868
</details>

libCacheSim/traceAnalyzer/popularity.cpp

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -66,21 +66,28 @@ void Popularity::run(obj_info_map_type &obj_map) {
6666
WARN("%s\n", fit_fail_reason_.c_str());
6767
}
6868

69-
/* calculate Zipf alpha using linear regression */
70-
vector<double> log_freq(obj_map.size());
71-
vector<double> log_rank(obj_map.size());
69+
/* calculate Zipf alpha using linear regression: log(freq) = -alpha*log(rank) + c */
70+
const size_t n = freq_vec_.size();
71+
vector<double> log_freq(n);
72+
vector<double> log_rank(n);
7273

73-
int i = 0;
74-
for_each(log_freq.begin(), log_freq.end(),
75-
[&](double &item) { item = log(freq_vec_[i++]); });
76-
i = 0;
77-
for_each(log_rank.begin(), log_rank.end(), [&](double &item) {
78-
++i;
79-
item = log(i);
80-
});
74+
for (size_t i = 0; i < n; i++) {
75+
log_freq[i] = log(static_cast<double>(freq_vec_[i]));
76+
log_rank[i] = log(static_cast<double>(i + 1));
77+
}
8178

82-
/* TODO: a better linear regression with intercept and R2 */
83-
slope_ = -PopularityUtils::slope(log_rank, log_freq);
79+
double reg_slope, reg_intercept, r;
80+
int err = linreg(static_cast<int>(n), log_rank.data(), log_freq.data(),
81+
&reg_slope, &reg_intercept, &r);
82+
if (err != 0) {
83+
fit_fail_reason_ = "popularity: singular regression matrix (e.g. uniform)";
84+
WARN("%s\n", fit_fail_reason_.c_str());
85+
return;
86+
}
87+
/* Zipf: log(freq) = -alpha*log(rank) + c, so reg_slope = -alpha */
88+
slope_ = -reg_slope;
89+
intercept_ = reg_intercept;
90+
r2_ = (std::isfinite(r) ? r * r : 0.0);
8491

8592
has_run = true;
8693
}

libCacheSim/traceAnalyzer/popularity.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ class Popularity {
4747
os << popularity.fit_fail_reason_ << "\n";
4848
else
4949
os << std::setprecision(4)
50-
<< "popularity: Zipf linear fitting slope=" << popularity.slope_
51-
<< ", intercept=" << popularity.intercept_ << ", R2=" << popularity.r2_
50+
<< "popularity: Zipf alpha=" << popularity.slope_
51+
<< ", R2=" << popularity.r2_
5252
<< "\n";
5353

5454
return os;

scripts/traceAnalysis/popularity.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -80,17 +80,14 @@ def plot_popularity_Zipf(datapath, figname_prefix=""):
8080

8181
x = np.log(np.arange(1, 1 + len(sorted_freq)))
8282
y = np.log(np.array(sorted_freq))
83-
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
83+
reg_slope, _, r_value, p_value, std_err = stats.linregress(x, y)
84+
alpha = -reg_slope
85+
r2 = r_value * r_value
8486

8587
if sorted_freq[0] < 100:
86-
s = "{:48} {:12} obj alpha 0, r^2 0 (the most popular object has less than 100 requests)".format(
87-
figname_prefix,
88-
len(sorted_freq),
89-
)
88+
s = "popularity: Zipf alpha=0, R2=0 (the most popular object has less than 100 requests)"
9089
else:
91-
s = "{:48} {:12} obj alpha {:.4f}, r^2 {:.4f}".format(
92-
figname_prefix, len(sorted_freq), -slope, r_value * r_value
93-
)
90+
s = "popularity: Zipf alpha={:.4f}, R2={:.4f}".format(alpha, r2)
9491

9592
logger.info(s)
9693

0 commit comments

Comments
 (0)