Skip to content
This repository was archived by the owner on May 12, 2026. It is now read-only.

Commit 5ed484d

Browse files
authored
Merge pull request #185 from quickwit-oss/traces
feat: add trace support
2 parents c825fd4 + ee88eb6 commit 5ed484d

41 files changed

Lines changed: 3910 additions & 184 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ To configure the Quickwit datasource, you need to provide the following informat
121121
- The index name.
122122
- The log message field name (optional). This is the field displayed in the explorer view.
123123
- The log level field name (optional). It must be a fast field.
124+
- The related logs or traces datasource (optional). This enables trace-to-logs and log-to-trace links when logs and traces are stored in separate Quickwit indexes.
124125

125126
### With Grafana UI
126127

@@ -141,6 +142,112 @@ datasources:
141142
logLevelField: severity_text
142143
```
143144
145+
### Logs and traces in separate indexes
146+
147+
When logs and traces are stored in different Quickwit indexes, configure one datasource per index and link them with `logsDatasourceUid` and `tracesDatasourceUid`.
148+
149+
```yaml
150+
apiVersion: 1
151+
152+
datasources:
153+
- name: Quickwit Logs
154+
uid: quickwit-logs
155+
type: quickwit-quickwit-datasource
156+
url: http://localhost:7280/api/v1
157+
jsonData:
158+
index: 'otel-logs-v0_9'
159+
logMessageField: body.message
160+
logLevelField: severity_text
161+
tracesDatasourceUid: quickwit-traces
162+
tracesDatasourceName: Quickwit Traces
163+
164+
- name: Quickwit Traces
165+
uid: quickwit-traces
166+
type: quickwit-quickwit-datasource
167+
url: http://localhost:7280/api/v1
168+
jsonData:
169+
index: 'otel-traces-v0_9'
170+
logsDatasourceUid: quickwit-logs
171+
logsDatasourceName: Quickwit Logs
172+
```
173+
174+
## Traces
175+
176+
The query editor has two trace query types:
177+
178+
- **Trace search** scans matching spans and returns one row per trace. Use this to find trace IDs by Lucene query, service, operation, status, or attributes.
179+
- **Traces** returns a full trace frame for Grafana's trace viewer. Use this with a trace ID query such as `trace_id:abc123`.
180+
181+
The trace parser expects Quickwit OpenTelemetry trace fields such as:
182+
183+
- `trace_id`
184+
- `span_id`
185+
- `parent_span_id`
186+
- `service_name`
187+
- `span_name`
188+
- `span_start_timestamp_nanos`
189+
- `span_duration_millis` or `span_end_timestamp_nanos`
190+
191+
It also reads optional fields for richer trace rendering:
192+
193+
- `resource_attributes` for service tags
194+
- `span_attributes` for span tags, including `service.peer.name` and `peer.service`
195+
- `span_status` for error status and warnings
196+
- `events` for span events and exception stack traces
197+
- `links` for span references
198+
- `scope_name` and `scope_version` for instrumentation library details
199+
200+
Trace responses include:
201+
202+
- Grafana trace frames for the trace viewer.
203+
- Node graph frames that summarize service-to-service calls.
204+
- Span warnings for error status and dropped attributes/events/links.
205+
- Span event details and exception stack traces.
206+
- Stable per-service node colors in the node graph.
207+
208+
### Trace/log correlations
209+
210+
Trace-to-logs links are attached to each trace span. They query the configured logs datasource with:
211+
212+
```text
213+
trace_id:${__span.traceId} AND span_id:${__span.spanId}
214+
```
215+
216+
Log-to-trace links are attached to log fields named:
217+
218+
- `trace_id`
219+
- `traceID`
220+
- `traceId`
221+
- `attributes.trace_id`
222+
223+
They open the configured traces datasource with:
224+
225+
```text
226+
trace_id:${__value.raw}
227+
```
228+
229+
### Local trace fixtures
230+
231+
For local testing, use the fixture script:
232+
233+
```bash
234+
QUICKWIT_URL=http://127.0.0.1:7280/api/v1 ./scripts/ingest-multi-service-traces.sh
235+
```
236+
237+
It writes two multi-service traces into `otel-traces-v0_9` and matching correlated logs into `otel-logs-v0_9`.
238+
239+
In **Trace search**, filter the fixture data with:
240+
241+
```text
242+
span_attributes.fixture:multi-service-trace
243+
```
244+
245+
In **Traces**, open a returned trace ID with:
246+
247+
```text
248+
trace_id:<trace id>
249+
```
250+
144251
## Features
145252

146253
- Explore view.
@@ -149,6 +256,9 @@ datasources:
149256
- Adhoc filters.
150257
- Annotations
151258
- Explore Log Context.
259+
- Trace search and trace view.
260+
- Trace-to-logs and log-to-trace links.
261+
- Service node graph for trace results.
152262
- [Alerting](https://grafana.com/docs/grafana/latest/alerting/).
153263

154264
## FAQ and Limitations

pkg/quickwit/client/client.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ type ReadyStatus struct {
2222

2323
type DatasourceInfo struct {
2424
ID int64
25+
UID string
26+
Name string
27+
LogsDatasourceUID string
28+
LogsDatasourceName string
29+
TracesDatasourceUID string
30+
TracesDatasourceName string
2531
HTTPClient *http.Client
2632
URL string
2733
Database string

pkg/quickwit/data_query.go

Lines changed: 195 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"fmt"
55
"regexp"
66
"strconv"
7+
"strings"
78

89
es "github.com/quickwit-oss/quickwit-datasource/pkg/quickwit/client"
910
"github.com/quickwit-oss/quickwit-datasource/pkg/quickwit/simplejson"
@@ -25,11 +26,22 @@ func buildMSR(queries []*Query, defaultTimeField string) ([]*es.SearchRequest, e
2526
b := ms.Search(q.Interval)
2627
b.Size(0)
2728
filters := b.Query().Bool().Filter()
29+
// Always pass Grafana's picker range through. Quickwit's metastore
30+
// prunes splits whose timestamps fall outside this window, so even
31+
// trace_id lookups go from "scan every split" to "scan a few" — the
32+
// same speedup the native Jaeger endpoint gets via auto-derived bounds.
2833
filters.AddDateRangeFilter(defaultTimeField, q.RangeTo, q.RangeFrom)
2934
filters.AddQueryStringFilter(q.RawQuery, true, "AND")
35+
if isTraceSearchQuery(q) {
36+
filters.AddQueryStringFilter(traceSearchSettingsQuery(q), true, "AND")
37+
}
3038

3139
if isLogsQuery(q) {
3240
processLogsQuery(q, b, q.RangeFrom, q.RangeTo, defaultTimeField)
41+
} else if isTraceSearchQuery(q) {
42+
processTraceSearchQuery(q, b, defaultTimeField)
43+
} else if isTracesQuery(q) {
44+
processTracesQuery(q, b, defaultTimeField)
3345
} else if isDocumentQuery(q) {
3446
processDocumentQuery(q, b, q.RangeFrom, q.RangeTo, defaultTimeField)
3547
} else {
@@ -269,8 +281,8 @@ func getPipelineAggField(m *MetricAgg) string {
269281

270282
func isQueryWithError(query *Query) error {
271283
if len(query.BucketAggs) == 0 {
272-
// If no aggregations, only document and logs queries are valid
273-
if len(query.Metrics) == 0 || !(isLogsQuery(query) || isDocumentQuery(query)) {
284+
// If no aggregations, only document, logs, and trace queries are valid
285+
if len(query.Metrics) == 0 || !(isLogsQuery(query) || isTraceSearchQuery(query) || isTracesQuery(query) || isDocumentQuery(query)) {
274286
return fmt.Errorf("invalid query, missing metrics and aggregations")
275287
}
276288
} else {
@@ -302,19 +314,185 @@ func isQueryWithError(query *Query) error {
302314
}
303315

304316
func isLogsQuery(query *Query) bool {
305-
return query.Metrics[0].Type == logsType
317+
return queryMetricType(query) == logsType
318+
}
319+
320+
func isTracesQuery(query *Query) bool {
321+
return queryMetricType(query) == tracesType
322+
}
323+
324+
func isTraceSearchQuery(query *Query) bool {
325+
return queryMetricType(query) == traceSearchType
306326
}
307327

308328
func isDocumentQuery(query *Query) bool {
309329
return isRawDataQuery(query) || isRawDocumentQuery(query)
310330
}
311331

312332
func isRawDataQuery(query *Query) bool {
313-
return query.Metrics[0].Type == rawDataType
333+
return queryMetricType(query) == rawDataType
314334
}
315335

316336
func isRawDocumentQuery(query *Query) bool {
317-
return query.Metrics[0].Type == rawDocumentType
337+
return queryMetricType(query) == rawDocumentType
338+
}
339+
340+
var (
341+
bareTraceIDPattern = regexp.MustCompile(`^[0-9a-fA-F]{32}$`)
342+
durationPattern = regexp.MustCompile(`(?i)^\s*(\d+(?:\.\d+)?)\s*(ns|us|ms|s|m|h)?\s*$`)
343+
)
344+
345+
func firstMetricType(query *Query) string {
346+
if query == nil || len(query.Metrics) == 0 {
347+
return ""
348+
}
349+
return query.Metrics[0].Type
350+
}
351+
352+
func queryMetricType(query *Query) string {
353+
return firstMetricType(query)
354+
}
355+
356+
func isBareTraceIDQuery(rawQuery string) bool {
357+
return bareTraceIDPattern.MatchString(strings.TrimSpace(rawQuery))
358+
}
359+
360+
func traceSearchSettingsQuery(query *Query) string {
361+
if !isTraceSearchQuery(query) || len(query.Metrics) == 0 || query.Metrics[0].Settings == nil {
362+
return ""
363+
}
364+
365+
settings := query.Metrics[0].Settings
366+
clauses := []string{}
367+
368+
if serviceName := strings.TrimSpace(settings.Get("serviceName").MustString()); serviceName != "" {
369+
clauses = append(clauses, traceSearchPhraseClause("service_name", serviceName))
370+
}
371+
if spanName := strings.TrimSpace(settings.Get("spanName").MustString()); spanName != "" {
372+
clauses = append(clauses, traceSearchPhraseClause("span_name", spanName))
373+
}
374+
if statusClause := traceSearchStatusClause(settings.Get("status").MustString()); statusClause != "" {
375+
clauses = append(clauses, statusClause)
376+
}
377+
if minDuration, ok := traceSearchDurationMillis(settings.Get("minDuration").MustString()); ok {
378+
clauses = append(clauses, "span_duration_millis:>="+minDuration)
379+
}
380+
if maxDuration, ok := traceSearchDurationMillis(settings.Get("maxDuration").MustString()); ok {
381+
clauses = append(clauses, "span_duration_millis:<="+maxDuration)
382+
}
383+
384+
return strings.Join(clauses, " AND ")
385+
}
386+
387+
func traceSearchPhraseClause(fieldName, value string) string {
388+
escaped := strings.ReplaceAll(value, `\`, `\\`)
389+
escaped = strings.ReplaceAll(escaped, `"`, `\"`)
390+
return fieldName + `:"` + escaped + `"`
391+
}
392+
393+
func traceSearchStatusClause(status string) string {
394+
// span_status is mapped as `tokenizer: raw`, so matches are exact tokens.
395+
// OTel canonical strings are TitleCase ("Error"/"Ok"/"Unset") and the OTLP
396+
// wire form is the integer enum (0/1/2) or "STATUS_CODE_*". Some pipelines
397+
// lowercase, so cover all variants we've seen.
398+
switch strings.ToLower(strings.TrimSpace(status)) {
399+
case "error":
400+
return `(span_status.code:Error OR span_status.code:ERROR OR span_status.code:error OR span_status.code:STATUS_CODE_ERROR OR span_status.code:2 OR span_attributes.error:true OR span_attributes.otel.status_code:ERROR)`
401+
case "ok":
402+
return `(span_status.code:Ok OR span_status.code:OK OR span_status.code:ok OR span_status.code:STATUS_CODE_OK OR span_status.code:1)`
403+
case "unset":
404+
return `(span_status.code:Unset OR span_status.code:UNSET OR span_status.code:unset OR span_status.code:STATUS_CODE_UNSET OR span_status.code:0)`
405+
default:
406+
return ""
407+
}
408+
}
409+
410+
func traceSearchDurationMillis(duration string) (string, bool) {
411+
matches := durationPattern.FindStringSubmatch(duration)
412+
if matches == nil {
413+
return "", false
414+
}
415+
416+
value, err := strconv.ParseFloat(matches[1], 64)
417+
if err != nil {
418+
return "", false
419+
}
420+
421+
switch strings.ToLower(matches[2]) {
422+
case "ns":
423+
value = value / 1000000
424+
case "us":
425+
value = value / 1000
426+
case "s":
427+
value = value * 1000
428+
case "m":
429+
value = value * 60 * 1000
430+
case "h":
431+
value = value * 60 * 60 * 1000
432+
case "ms", "":
433+
default:
434+
return "", false
435+
}
436+
437+
return strconv.FormatFloat(value, 'f', -1, 64), true
438+
}
439+
440+
func canInferTraceLink(query *Query) bool {
441+
firstMetricType := firstMetricType(query)
442+
return firstMetricType == "" || firstMetricType == logsType || firstMetricType == tracesType
443+
}
444+
445+
func canTraceQueryTypeNormalize(query *Query) bool {
446+
// queryType is a transient hint from internal trace links. Once the request
447+
// has an explicit non-log/non-trace metric, the metric type is authoritative.
448+
return query.QueryType == tracesType && canInferTraceLink(query)
449+
}
450+
451+
func normalizeInternalLinkTraceQuery(query *Query) {
452+
if query == nil {
453+
return
454+
}
455+
456+
if query.QueryType == tracesType && !canInferTraceLink(query) {
457+
query.QueryType = ""
458+
}
459+
460+
if !canTraceQueryTypeNormalize(query) &&
461+
firstMetricType(query) != tracesType &&
462+
!(firstMetricType(query) == "" && isBareTraceIDQuery(query.RawQuery)) {
463+
return
464+
}
465+
466+
rawQuery := strings.TrimSpace(query.RawQuery)
467+
if isBareTraceIDQuery(rawQuery) {
468+
query.RawQuery = "trace_id:" + rawQuery
469+
}
470+
query.QueryType = tracesType
471+
query.BucketAggs = []*BucketAgg{}
472+
473+
if len(query.Metrics) == 0 {
474+
query.Metrics = []*MetricAgg{
475+
{
476+
ID: "1",
477+
Type: tracesType,
478+
Settings: simplejson.NewFromAny(map[string]interface{}{"limit": "1000"}),
479+
Meta: simplejson.New(),
480+
},
481+
}
482+
return
483+
}
484+
485+
query.Metrics = query.Metrics[:1]
486+
query.Metrics[0].Type = tracesType
487+
if query.Metrics[0].ID == "" {
488+
query.Metrics[0].ID = "1"
489+
}
490+
if query.Metrics[0].Settings == nil {
491+
query.Metrics[0].Settings = simplejson.New()
492+
}
493+
if query.Metrics[0].Settings.Get("limit").MustString() == "" {
494+
query.Metrics[0].Settings.Set("limit", "1000")
495+
}
318496
}
319497

320498
func processLogsQuery(q *Query, b *es.SearchRequestBuilder, from, to int64, defaultTimeField string) {
@@ -337,6 +515,18 @@ func processLogsQuery(q *Query, b *es.SearchRequestBuilder, from, to int64, defa
337515
}
338516
}
339517

518+
func processTracesQuery(q *Query, b *es.SearchRequestBuilder, defaultTimeField string) {
519+
metric := q.Metrics[0]
520+
b.Sort(es.SortOrderAsc, defaultTimeField, "epoch_nanos_int")
521+
b.Size(stringToIntWithDefaultValue(metric.Settings.Get("limit").MustString(), defaultSize))
522+
}
523+
524+
func processTraceSearchQuery(q *Query, b *es.SearchRequestBuilder, defaultTimeField string) {
525+
metric := q.Metrics[0]
526+
b.Sort(es.SortOrderDesc, defaultTimeField, "epoch_nanos_int")
527+
b.Size(stringToIntWithDefaultValue(metric.Settings.Get("spanLimit").MustString(), 5000))
528+
}
529+
340530
func processDocumentQuery(q *Query, b *es.SearchRequestBuilder, from, to int64, defaultTimeField string) {
341531
metric := q.Metrics[0]
342532
b.Sort(es.SortOrderDesc, defaultTimeField, "epoch_nanos_int")

0 commit comments

Comments
 (0)