Skip to content

Commit d282be7

Browse files
committed
feat: use check statuses 1h/1d summary to get the best window for
check_statuses [skip ci]
1 parent 73363e0 commit d282be7

4 files changed

Lines changed: 124 additions & 11 deletions

File tree

query/check_details.go

Lines changed: 107 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,32 @@ import (
1414
"github.com/samber/lo"
1515
)
1616

17-
var (
17+
const (
	// DefaultCacheCount is the maximum number of past checks kept in the
	// in-memory cache.
	DefaultCacheCount = 5

	// DefaultCheckQueryWindow is the search window used by default.
	DefaultCheckQueryWindow = "1h"

	// desiredNumOfCheckStatuses is the number of data points to aim for
	// when aggregating check statuses.
	desiredNumOfCheckStatuses = 100
)
28+
29+
// allowedWindows lists, in ascending order, the only window durations that
// may be used when aggregating check statuses.
var allowedWindows = []time.Duration{
	1 * time.Minute,    // 1m
	5 * time.Minute,    // 5m
	15 * time.Minute,   // 15m
	30 * time.Minute,   // 30m
	1 * time.Hour,      // 1h
	3 * time.Hour,      // 3h
	6 * time.Hour,      // 6h
	12 * time.Hour,     // 12h
	24 * time.Hour,     // 24h
	7 * 24 * time.Hour, // 1w
}
2444

2545
type Timeseries struct {
@@ -138,10 +158,84 @@ func (q CheckQueryParams) GetWhereClause() (string, map[string]interface{}, erro
138158
return strings.TrimSpace(clause), args, nil
139159
}
140160

141-
func (q CheckQueryParams) ExecuteDetails(ctx context.Context) ([]Timeseries, types.Uptime, types.Latency, error) {
161+
func getBestPartitioner(totalChecks int, rangeDuration time.Duration) time.Duration {
162+
if totalChecks <= desiredNumOfCheckStatuses {
163+
return 0 // No need to perform window aggregation
164+
}
165+
166+
bestDelta := 100000000 // sufficiently large delta to begin with
167+
bestWindow := allowedWindows[0]
168+
169+
for _, wp := range allowedWindows {
170+
numWindows := int(rangeDuration / wp)
171+
delta := abs(desiredNumOfCheckStatuses - numWindows)
172+
173+
if delta < bestDelta {
174+
bestDelta = delta
175+
bestWindow = wp
176+
} else {
177+
// as soon as we notice that the delta gets worse, we break the loop
178+
break
179+
}
180+
}
181+
182+
numWindows := int(rangeDuration / bestWindow)
183+
if abs(desiredNumOfCheckStatuses-totalChecks) <= abs(desiredNumOfCheckStatuses-numWindows) {
184+
// If this best partition creates windows such that the resulting number of data points deviate more
185+
// from the desired data points than the actual data points, then we do not aggregate.
186+
// Example: if there are 144 checks for the duration of 6 days,
187+
// then the best partition, 1 hour, would generate 144 data points.
188+
// But the original data points (120) are closer to 100, so we do not aggregate.
189+
return 0
190+
}
191+
192+
return bestWindow
193+
}
194+
195+
func optimalWindow(ctx context.Context, from, to time.Time) (time.Duration, error) {
196+
var view string
197+
timeRange := to.Sub(from)
198+
if timeRange > time.Hour*24*21 {
199+
view = "check_statuses_1d"
200+
} else if timeRange > time.Hour*48 {
201+
view = "check_statuses_1h"
202+
} else {
203+
return -1, nil //
204+
}
205+
206+
q := fmt.Sprintf(`
207+
SELECT
208+
SUM(total) AS total,
209+
MAX(created_at) AS latest,
210+
MIN(created_at) AS earliest
211+
FROM
212+
%s
213+
WHERE
214+
created_at >= ? AND created_at <= ?;`, view)
215+
var total *int
216+
var latest, earliest *time.Time
217+
if err := ctx.DB().Raw(q, from, to).Row().Scan(&total, &latest, &earliest); err != nil {
218+
return 0, err
219+
}
220+
if total == nil {
221+
return -1, nil //
222+
}
223+
224+
return getBestPartitioner(*total, earliest.Sub(*latest)), nil
225+
}
226+
227+
func CheckStatuses(ctx context.Context, q CheckQueryParams) ([]Timeseries, types.Uptime, types.Latency, error) {
142228
start := q.GetStartTime().Format(time.RFC3339)
143229
end := q.GetEndTime().Format(time.RFC3339)
144230

231+
// For the given ranges try to find the best window using check statuses summary
232+
window, err := optimalWindow(ctx, *q.GetStartTime(), *q.GetEndTime())
233+
if err != nil {
234+
return nil, types.Uptime{}, types.Latency{}, err
235+
} else if window >= 0 {
236+
q.WindowDuration = window
237+
}
238+
145239
query := `
146240
With grouped_by_window AS (
147241
SELECT
@@ -309,3 +403,14 @@ func parseDuration(d string, name string) (clause string, arg interface{}, err e
309403
}
310404
return "", nil, fmt.Errorf("start time must be a duration or RFC3339 timestamp")
311405
}
406+
407+
// abs returns the absolute value of n.
// math.Abs only works on float64; this helper spares callers the type
// conversions and the clumsy expression they would otherwise need.
func abs(n int) int {
	if n < 0 {
		return -n
	}

	return n
}

tests/fixtures/dummy/check_statuses.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import (
66
"github.com/flanksource/duty/models"
77
)
88

9-
func generateStatus(check models.Check, t time.Time, count int, passingMod int) []models.CheckStatus {
9+
func generateStatus(check models.Check, t time.Time, schedule time.Duration, count int, passingMod int) []models.CheckStatus {
1010
var statuses = []models.CheckStatus{}
1111

1212
for i := 0; i < count; i++ {
@@ -27,9 +27,9 @@ func generateStatus(check models.Check, t time.Time, count int, passingMod int)
2727

2828
func AllDummyCheckStatuses() []models.CheckStatus {
2929
statuses := append(
30-
generateStatus(LogisticsAPIHealthHTTPCheck, CurrentTime, 70, 5),
31-
generateStatus(DeletedCheck, CurrentTime, 1, 1)[0],
32-
generateStatus(DeletedCheckOld, *DeletedCheckOld.CreatedAt, 1, 1)[0],
30+
generateStatus(LogisticsAPIHealthHTTPCheck, CurrentTime, time.Minute, 70, 5),
31+
generateStatus(DeletedCheck, CurrentTime, time.Minute, 1, 1)[0],
32+
generateStatus(DeletedCheckOld, *DeletedCheckOld.CreatedAt, time.Minute, 1, 1)[0],
3333
models.CheckStatus{
3434
CheckID: LogisticsAPIHomeHTTPCheck.ID,
3535
Duration: 100,
@@ -46,12 +46,12 @@ func AllDummyCheckStatuses() []models.CheckStatus {
4646
},
4747
)
4848

49-
statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-15*time.Minute), 1, 1)[0])
50-
statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-2*time.Hour), 10, 2)...)
49+
statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-15*time.Minute), time.Minute, 1, 1)[0])
50+
statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-2*time.Hour), time.Minute, 10, 2)...)
5151

5252
// Check statuses from 2022-01-01
5353
// not derived from current time for consistency
54-
statuses = append(statuses, generateStatus(CartAPIHeathCheckAgent, time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC), 70, 5)...)
54+
statuses = append(statuses, generateStatus(CartAPIHeathCheckAgent, time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC), time.Minute*5, 1440, 5)...) // 1440 check statuses spanning 5 days
5555

5656
return statuses
5757
}

tests/query_check_details_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import (
1313
. "github.com/onsi/gomega"
1414
)
1515

16-
var _ = ginkgo.Describe("CheckDetails", ginkgo.Ordered, func() {
16+
var _ = ginkgo.Describe("CheckDetails", ginkgo.Ordered, ginkgo.Focus, func() {
1717
type testRecord struct {
1818
since string
1919
statuses int
@@ -48,7 +48,7 @@ var _ = ginkgo.Describe("CheckDetails", ginkgo.Ordered, func() {
4848
err = q.Init(urlParam)
4949
Expect(err).To(BeNil())
5050

51-
ts, uptime, latency, err := q.ExecuteDetails(DefaultContext)
51+
ts, uptime, latency, err := query.CheckStatuses(DefaultContext, q)
5252
Expect(err).To(BeNil())
5353

5454
Expect(len(ts)).To(Equal(td.statuses), "unexpected number of results")

tests/setup/common.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,14 @@ func BeforeSuiteFn(args ...interface{}) context.Context {
122122
logger.Infof("Created dummy data %v", len(dummyData.Checks))
123123
}
124124

125+
if err, _ := job.AggregateCheckStatus1d(DefaultContext); err != nil {
126+
panic(err.Error())
127+
}
128+
129+
if err, _ := job.AggregateCheckStatus1h(DefaultContext); err != nil {
130+
panic(err.Error())
131+
}
132+
125133
DefaultContext := DefaultContext.WithKubernetes(fake.NewSimpleClientset(&v1.ConfigMap{
126134
ObjectMeta: metav1.ObjectMeta{
127135
Name: "test-cm",

0 commit comments

Comments
 (0)