From 971fb616420c1766465361c01bbb87ad2ba52a71 Mon Sep 17 00:00:00 2001 From: Matthew McPherrin Date: Tue, 23 Dec 2025 14:26:21 -0500 Subject: [PATCH 1/4] Add observability stack to boulder-dev This is adding a more full observability stack to the boulder devenv. The purpose of this is to make it faster to iterate on Boulder's logging, tracing, and metrics without needing to leave the Boulder repo. Currently, this is configuring the opentelemetry collector to read the existing logs laid down by rsyslog and exporting them to Clickhouse. There's a Grafana (barely) configured to view the Clickhouse Data. TODO: Traces, metrics, grafana dashboard provisioning. --- docker-compose.yml | 62 +++++++++++- test/clickhouse/users.xml | 29 ++++++ .../provisioning/datasources/clickhouse.yaml | 24 +++++ test/otelcol/config.yaml | 95 +++++++++++++++++++ 4 files changed, 207 insertions(+), 3 deletions(-) create mode 100644 test/clickhouse/users.xml create mode 100644 test/grafana/provisioning/datasources/clickhouse.yaml create mode 100644 test/otelcol/config.yaml diff --git a/docker-compose.yml b/docker-compose.yml index d683f856837..78d51199177 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,6 +20,7 @@ services: - .:/boulder:cached - ./.gocache:/root/.cache/go-build:cached - ./test/certs/.softhsm-tokens/:/var/lib/softhsm/tokens/:cached + - logs:/var/log/ networks: bouldernet: ipv4_address: 10.77.77.77 @@ -56,7 +57,7 @@ services: - bredis_1 - bredis_2 - bconsul - - bjaeger + - otel-collector - bpkimetal entrypoint: test/entrypoint.sh working_dir: &boulder_working_dir /boulder @@ -135,8 +136,58 @@ services: ipv4_address: 10.77.77.10 command: "consul agent -dev -config-format=hcl -config-file=/test/consul/config.hcl" - bjaeger: - image: jaegertracing/all-in-one:1.50 + clickhouse: + image: clickhouse/clickhouse-server:latest + container_name: clickhouse + ports: + - "8123:8123" # HTTP interface + - "9000:9000" # Native interface + environment: + CLICKHOUSE_DB: otel + 
volumes: + - clickhouse_data:/var/lib/clickhouse + - ./test/clickhouse/users.xml:/etc/clickhouse-server/users.d/users.xml:ro + ulimits: + nofile: + soft: 262144 + hard: 262144 + restart: unless-stopped + networks: + - bouldernet + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + container_name: otel-collector + command: ["--config=/etc/otel-collector-config.yaml", "--feature-gates=clickhouse.json"] + volumes: + - logs:/var/log/boulder/ + - ./test/otelcol/config.yaml:/etc/otel-collector-config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver + - "8888:8888" # Prometheus metrics + - "8889:8889" # Prometheus exporter metrics + - "5514:5514/udp" # Syslog receiver + depends_on: + - clickhouse + restart: unless-stopped + networks: + - bouldernet + + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin_very_bad_password + - GF_INSTALL_PLUGINS=grafana-clickhouse-datasource + volumes: + - grafana_data:/var/lib/grafana + - ./test/grafana/provisioning:/etc/grafana/provisioning:ro + depends_on: + - clickhouse + restart: unless-stopped networks: - bouldernet @@ -162,6 +213,11 @@ services: aliases: - boulder-vitess +volumes: + clickhouse_data: + grafana_data: + logs: + networks: # This network represents the data-center internal network. 
It is used for # boulder services and their infrastructure, such as consul, mariadb, and diff --git a/test/clickhouse/users.xml b/test/clickhouse/users.xml new file mode 100644 index 00000000000..9f23cd1a0d1 --- /dev/null +++ b/test/clickhouse/users.xml @@ -0,0 +1,29 @@ +<?xml version="1.0"?> +<clickhouse> + <users> + <default> + <password>default_user_very_bad_password</password> + <networks> + <ip>::/0</ip> + </networks> + <profile>default</profile> + <quota>default</quota> + </default> + <otel_writer> + <password>otel_writer_very_bad_password</password> + <networks> + <ip>::/0</ip> + </networks> + <profile>default</profile> + <quota>default</quota> + </otel_writer> + <grafana_reader> + <password>grafana_reader_very_bad_password</password> + <networks> + <ip>::/0</ip> + </networks> + <profile>default</profile> + <quota>default</quota> + </grafana_reader> + </users> +</clickhouse> diff --git a/test/grafana/provisioning/datasources/clickhouse.yaml b/test/grafana/provisioning/datasources/clickhouse.yaml new file mode 100644 index 00000000000..7afe4c4175e --- /dev/null +++ b/test/grafana/provisioning/datasources/clickhouse.yaml @@ -0,0 +1,24 @@ +apiVersion: 1 + +datasources: + - name: ClickHouse + type: grafana-clickhouse-datasource + jsonData: + defaultDatabase: otel + port: 9000 + server: clickhouse + username: grafana_reader + logs: + defaultDatabase: otel + defaultTable: otel_logs + otelEnabled: true + otelVersion: latest + traces: + defaultDatabase: otel + defaultTable: otel_traces + otelEnabled: true + otelVersion: latest + secureJsonData: + password: grafana_reader_very_bad_password + isDefault: true + editable: true diff --git a/test/otelcol/config.yaml b/test/otelcol/config.yaml new file mode 100644 index 00000000000..75b89440214 --- /dev/null +++ b/test/otelcol/config.yaml @@ -0,0 +1,95 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + http: + endpoint: "0.0.0.0:4318" + syslog: + udp: + listen_address: "0.0.0.0:5514" + protocol: rfc3164 + filelog: + include: + - /var/log/boulder/boulder-*.log + - /var/log/boulder/crl-storer.log + - /var/log/boulder/log-validator.log + - /var/log/boulder/nonce-service.log + operators: + - type: regex_parser + regex: '^(?P<timestamp>\S+) (?P<hostname>\S+) (?P<datacenter>\S+) (?P<syslogseverity>\d+) (?P<service>[^\[]+)\[(?P<pid>\d+)\]: (?P<checksum>\S+) (?P<msg>.*?)$' + timestamp: + parse_from: attributes.timestamp + layout: '%Y-%m-%dT%H:%M:%S.%f%j' + severity: + parse_from: 
attributes.syslogseverity + preset: none + overwrite_text: true + mapping: # Map syslog levels to otel ones + fatal: 0 # syslog: emerg + error3: 1 # syslog: alert + error2: 2 # syslog: crit + error: 3 # syslog: err + warn: 4 # syslog: warning + info2: 5 # syslog: notice + info: 6 # syslog: info + debug: 7 # syslog: debug + - type: remove + field: attributes.timestamp + - type: remove + field: attributes.syslogseverity + - type: move + from: attributes.msg + to: body + - type: move + from: attributes.service + to: resource["service.name"] + - type: move # TODO: Parse integer, per semantic conventions + from: attributes.pid + to: resource["process.pid"] + +processors: + batch: + +exporters: + clickhouse: + endpoint: http://clickhouse:8123 + database: otel + username: otel_writer + password: otel_writer_very_bad_password + logs_table_name: otel_logs + traces_table_name: otel_traces + metrics_table_name: otel_metrics + timeout: 5s + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + prometheus: + namespace: "something" + endpoint: "0.0.0.0:8889" + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [clickhouse] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [clickhouse, prometheus] + logs: + receivers: [otlp, syslog, filelog] + processors: [batch] + exporters: [clickhouse] + telemetry: + metrics: + readers: + - pull: + exporter: + prometheus: + host: '0.0.0.0' + port: 8888 From 04faf1265b7bfd890f60fd4154f4247ca10caa53 Mon Sep 17 00:00:00 2001 From: Matthew McPherrin Date: Tue, 23 Dec 2025 15:55:09 -0500 Subject: [PATCH 2/4] set contextColumns at least ServiceName, but we want hostname in here too But need to get hostname in the right spot first --- test/grafana/provisioning/datasources/clickhouse.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/grafana/provisioning/datasources/clickhouse.yaml b/test/grafana/provisioning/datasources/clickhouse.yaml 
index 7afe4c4175e..76b6db108b5 100644 --- a/test/grafana/provisioning/datasources/clickhouse.yaml +++ b/test/grafana/provisioning/datasources/clickhouse.yaml @@ -13,6 +13,9 @@ datasources: defaultTable: otel_logs otelEnabled: true otelVersion: latest + selectContextColumns: true + contextColumns: + - ServiceName traces: defaultDatabase: otel defaultTable: otel_traces From 48c67029761b7cf7f2bba313226031a2234e17b7 Mon Sep 17 00:00:00 2001 From: Matthew McPherrin Date: Tue, 23 Dec 2025 16:18:10 -0500 Subject: [PATCH 3/4] switch bjaeger to otelcol --- test/config-next/admin.json | 2 +- test/config-next/bad-key-revoker.json | 2 +- test/config-next/ca.json | 2 +- test/config-next/crl-storer.json | 2 +- test/config-next/crl-updater.json | 2 +- test/config-next/log-validator.json | 2 +- test/config-next/nonce-a.json | 2 +- test/config-next/nonce-b.json | 2 +- test/config-next/publisher.json | 2 +- test/config-next/ra.json | 2 +- test/config-next/remoteva-a.json | 2 +- test/config-next/remoteva-b.json | 2 +- test/config-next/remoteva-c.json | 2 +- test/config-next/sa.json | 2 +- test/config-next/sfe.json | 2 +- test/config-next/va.json | 2 +- test/config-next/wfe2.json | 2 +- test/config/log-validator.json | 2 +- test/config/sfe.json | 2 +- test/integration/otel_test.go | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/test/config-next/admin.json b/test/config-next/admin.json index c0775344223..76dc03d383d 100644 --- a/test/config-next/admin.json +++ b/test/config-next/admin.json @@ -36,7 +36,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/bad-key-revoker.json b/test/config-next/bad-key-revoker.json index b031643865b..9e3e83ef335 100644 --- a/test/config-next/bad-key-revoker.json +++ b/test/config-next/bad-key-revoker.json @@ -30,7 +30,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": 
"otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/ca.json b/test/config-next/ca.json index ada60732617..445b4bf663d 100644 --- a/test/config-next/ca.json +++ b/test/config-next/ca.json @@ -202,7 +202,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/crl-storer.json b/test/config-next/crl-storer.json index 0934bcef071..991cadf54c6 100644 --- a/test/config-next/crl-storer.json +++ b/test/config-next/crl-storer.json @@ -38,7 +38,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/crl-updater.json b/test/config-next/crl-updater.json index 6bbb8a88b9f..4efb4fed692 100644 --- a/test/config-next/crl-updater.json +++ b/test/config-next/crl-updater.json @@ -59,7 +59,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/log-validator.json b/test/config-next/log-validator.json index 687058f6829..c46b8bb5e5f 100644 --- a/test/config-next/log-validator.json +++ b/test/config-next/log-validator.json @@ -3,7 +3,7 @@ "stdoutLevel": 7 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "files": [ diff --git a/test/config-next/nonce-a.json b/test/config-next/nonce-a.json index d14b44063f2..bf04fe52ad2 100644 --- a/test/config-next/nonce-a.json +++ b/test/config-next/nonce-a.json @@ -9,7 +9,7 @@ "syslogLevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "grpc": { diff --git a/test/config-next/nonce-b.json b/test/config-next/nonce-b.json index d14b44063f2..bf04fe52ad2 100644 --- a/test/config-next/nonce-b.json +++ b/test/config-next/nonce-b.json @@ -9,7 +9,7 @@ "syslogLevel": -1 }, "openTelemetry": { - 
"endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "grpc": { diff --git a/test/config-next/publisher.json b/test/config-next/publisher.json index 3d0a0fb7e4e..16dc1d905d8 100644 --- a/test/config-next/publisher.json +++ b/test/config-next/publisher.json @@ -47,7 +47,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/ra.json b/test/config-next/ra.json index f9a8131f7d5..0e7a018c07f 100644 --- a/test/config-next/ra.json +++ b/test/config-next/ra.json @@ -183,7 +183,7 @@ "sysloglevel": 6 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/remoteva-a.json b/test/config-next/remoteva-a.json index f931ba183f3..d6b899bdd81 100644 --- a/test/config-next/remoteva-a.json +++ b/test/config-next/remoteva-a.json @@ -49,7 +49,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/remoteva-b.json b/test/config-next/remoteva-b.json index 937d6635d1c..8756037cda7 100644 --- a/test/config-next/remoteva-b.json +++ b/test/config-next/remoteva-b.json @@ -49,7 +49,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/remoteva-c.json b/test/config-next/remoteva-c.json index 6fc58b2b2dc..c650dcf05d7 100644 --- a/test/config-next/remoteva-c.json +++ b/test/config-next/remoteva-c.json @@ -49,7 +49,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/sa.json b/test/config-next/sa.json index f8c02f58fff..00c8ee3d6b1 100644 --- a/test/config-next/sa.json +++ b/test/config-next/sa.json @@ -57,7 +57,7 @@ "sysloglevel": -1 }, "openTelemetry": { - 
"endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/sfe.json b/test/config-next/sfe.json index 7d9fb08fec3..9cb6e2908e7 100644 --- a/test/config-next/sfe.json +++ b/test/config-next/sfe.json @@ -92,7 +92,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "openTelemetryHttpConfig": { diff --git a/test/config-next/va.json b/test/config-next/va.json index 21cd801760e..a906beb6627 100644 --- a/test/config-next/va.json +++ b/test/config-next/va.json @@ -73,7 +73,7 @@ "sysloglevel": 6 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/wfe2.json b/test/config-next/wfe2.json index 32b12cfc8c6..c943798ddf1 100644 --- a/test/config-next/wfe2.json +++ b/test/config-next/wfe2.json @@ -156,7 +156,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "openTelemetryHttpConfig": { diff --git a/test/config/log-validator.json b/test/config/log-validator.json index 687058f6829..c46b8bb5e5f 100644 --- a/test/config/log-validator.json +++ b/test/config/log-validator.json @@ -3,7 +3,7 @@ "stdoutLevel": 7 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "files": [ diff --git a/test/config/sfe.json b/test/config/sfe.json index 73aa1f58efc..514a003d539 100644 --- a/test/config/sfe.json +++ b/test/config/sfe.json @@ -39,7 +39,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "openTelemetryHttpConfig": { diff --git a/test/integration/otel_test.go b/test/integration/otel_test.go index b3d3ce48635..df971af4f22 100644 --- a/test/integration/otel_test.go +++ b/test/integration/otel_test.go @@ -261,7 +261,7 @@ func TestTraces(t *testing.T) { func 
traceIssuingTestCert(t *testing.T) trace.TraceID { // Configure this integration test to trace to jaeger:4317 like Boulder will shutdown := cmd.NewOpenTelemetry(cmd.OpenTelemetryConfig{ - Endpoint: "bjaeger:4317", + Endpoint: "otel-collector:4317", SampleRatio: 1, }, blog.Get()) defer shutdown(context.Background()) From bd7d35f88ec72b0dd55ad1b1ec0b027f82dfa739 Mon Sep 17 00:00:00 2001 From: Matthew McPherrin Date: Tue, 23 Dec 2025 16:54:55 -0500 Subject: [PATCH 4/4] Convert otel integration test to query clickhouse The returned data is much simpler, so this has a bit of nice simplification. --- test/integration/otel_test.go | 145 ++++++++++++---------------------- 1 file changed, 49 insertions(+), 96 deletions(-) diff --git a/test/integration/otel_test.go b/test/integration/otel_test.go index df971af4f22..54915077732 100644 --- a/test/integration/otel_test.go +++ b/test/integration/otel_test.go @@ -9,8 +9,8 @@ import ( "crypto/rand" "encoding/json" "fmt" - "io" "net/http" + "net/url" "os" "strings" "testing" @@ -27,82 +27,45 @@ import ( "github.com/letsencrypt/boulder/test" ) -// TraceResponse is the list of traces returned from Jaeger's trace search API -// We always search for a single trace by ID, so this should be length 1. -// This is a specialization of Jaeger's structuredResponse type which -// uses []interface{} upstream. 
-type TraceResponse struct { - Data []Trace -} - -// Trace represents a single trace in Jaeger's API -// See https://pkg.go.dev/github.com/jaegertracing/jaeger/model/json#Trace +// Trace is the list of spans returned from ClickHouse for a single trace. type Trace struct { - TraceID string - Spans []Span - Processes map[string]struct { - ServiceName string - } - Warnings []string + Data []Span } -// Span represents a single span in Jaeger's API -// See https://pkg.go.dev/github.com/jaegertracing/jaeger/model/json#Span +// Span is a single span row from the ClickHouse otel_traces query results. type Span struct { - SpanID string - OperationName string - Warnings []string - ProcessID string - References []struct { - RefType string - TraceID string - SpanID string - } + TraceId string + SpanId string + ParentSpanId string + SpanName string + ServiceName string } -func getTraceFromJaeger(t *testing.T, traceID trace.TraceID) Trace { +func getTraceFromClickHouse(t *testing.T, traceID trace.TraceID) Trace { t.Helper() - traceURL := "http://bjaeger:16686/api/traces/" + traceID.String() - resp, err := http.Get(traceURL) - test.AssertNotError(t, err, "failed to trace from jaeger: "+traceID.String()) - if resp.StatusCode == http.StatusNotFound { - t.Fatalf("jaeger returned 404 for trace %s", traceID) - } - test.AssertEquals(t, resp.StatusCode, http.StatusOK) - body, err := io.ReadAll(resp.Body) - test.AssertNotError(t, err, "failed to read trace body") + query := fmt.Sprintf("SELECT TraceId, SpanId, ParentSpanId, SpanName, ServiceName FROM otel.otel_traces WHERE TraceId = '%s'", traceID.String()) + clickhouseURL := fmt.Sprintf("http://clickhouse:8123/?default_format=JSON&query=%s", url.QueryEscape(query)) - var parsed TraceResponse - err = json.Unmarshal(body, &parsed) - test.AssertNotError(t, err, "failed to decode traces body") + req, err := http.NewRequest("GET", clickhouseURL, nil) + test.AssertNotError(t, err, "failed to create request") + req.SetBasicAuth("default", "default_user_very_bad_password") - if 
len(parsed.Data) != 1 { - t.Fatalf("expected to get exactly one trace from jaeger for %s: %v", traceID, parsed) - } + resp, err := http.DefaultClient.Do(req) + test.AssertNotError(t, err, "failed to query clickhouse") + defer resp.Body.Close() + test.AssertEquals(t, resp.StatusCode, http.StatusOK) - return parsed.Data[0] -} + var response Trace + test.AssertNotError(t, json.NewDecoder(resp.Body).Decode(&response), "failed to decode clickhouse response") -type expectedSpans struct { - Operation string - Service string - Children []expectedSpans + return response } -// isParent returns true if the given span has a parent of ParentID -// The empty string means no ParentID -func isParent(parentID string, span Span) bool { - if len(span.References) == 0 { - return parentID == "" - } - for _, ref := range span.References { - // In OpenTelemetry, CHILD_OF is the only reference, but Jaeger supports other systems. - if ref.RefType == "CHILD_OF" { - return ref.SpanID == parentID - } - } - return false +type expectedSpans struct { + SpanName string + Service string + Children []expectedSpans } func missingChildren(trace Trace, spanID string, children []expectedSpans) bool { @@ -117,24 +80,24 @@ func missingChildren(trace Trace, spanID string, children []expectedSpans) bool // findSpans checks if the expectedSpan and its expected children are found in trace func findSpans(trace Trace, parentSpan string, expectedSpan expectedSpans) bool { - for _, span := range trace.Spans { - if !isParent(parentSpan, span) { + for _, span := range trace.Data { + if span.ParentSpanId != parentSpan { continue } - if trace.Processes[span.ProcessID].ServiceName != expectedSpan.Service { + if span.ServiceName != expectedSpan.Service { continue } - if span.OperationName != expectedSpan.Operation { + if span.SpanName != expectedSpan.SpanName { continue } - if missingChildren(trace, span.SpanID, expectedSpan.Children) { + if missingChildren(trace, span.SpanId, expectedSpan.Children) { continue } // 
This span has the correct parent, service, operation, and children return true } - fmt.Printf("did not find span %s::%s with parent '%s'\n", expectedSpan.Service, expectedSpan.Operation, parentSpan) + fmt.Printf("did not find span %s::%s with parent '%s'\n", expectedSpan.Service, expectedSpan.SpanName, parentSpan) return false } @@ -165,13 +128,13 @@ func (c *ContextInjectingRoundTripper) RoundTrip(request *http.Request) (*http.R // rpcSpan is a helper for constructing an RPC span where we have both a client and server rpc operation func rpcSpan(op, client, server string, children ...expectedSpans) expectedSpans { return expectedSpans{ - Operation: op, - Service: client, + SpanName: op, + Service: client, Children: []expectedSpans{ { - Operation: op, - Service: server, - Children: children, + SpanName: op, + Service: server, + Children: children, }, }, } @@ -179,8 +142,8 @@ func rpcSpan(op, client, server string, children ...expectedSpans) expectedSpans func httpSpan(endpoint string, children ...expectedSpans) expectedSpans { return expectedSpans{ - Operation: endpoint, - Service: "boulder-wfe2", + SpanName: endpoint, + Service: "boulder-wfe2", Children: append(children, rpcSpan("nonce.NonceService/Nonce", "boulder-wfe2", "nonce-service"), rpcSpan("nonce.NonceService/Redeem", "boulder-wfe2", "nonce-service"), @@ -190,9 +153,9 @@ func httpSpan(endpoint string, children ...expectedSpans) expectedSpans { func redisPipelineSpan(op, service string, children ...expectedSpans) expectedSpans { return expectedSpans{ - Operation: "redis.pipeline " + op, - Service: service, - Children: children, + SpanName: "redis.pipeline " + op, + Service: service, + Children: children, } } @@ -213,11 +176,11 @@ func TestTraces(t *testing.T) { // flow: just enough to ensure that our otel tracing is working without // asserting too much about the exact set of RPCs we use under the hood. 
expectedSpans := expectedSpans{ - Operation: "TraceTest", - Service: "integration.test", + SpanName: "TraceTest", + Service: "integration.test", Children: []expectedSpans{ - {Operation: "/directory", Service: wfe}, - {Operation: "/acme/new-nonce", Service: wfe, Children: []expectedSpans{ + {SpanName: "/directory", Service: wfe}, + {SpanName: "/acme/new-nonce", Service: wfe, Children: []expectedSpans{ rpcSpan("nonce.NonceService/Nonce", wfe, "nonce-service")}}, httpSpan("/acme/new-acct", redisPipelineSpan("get", wfe)), @@ -238,28 +201,18 @@ func TestTraces(t *testing.T) { found := false const retries = 10 for range retries { - trace := getTraceFromJaeger(t, traceID) + trace = getTraceFromClickHouse(t, traceID) if findSpans(trace, "", expectedSpans) { found = true break } time.Sleep(sdktrace.DefaultScheduleDelay / 5 * time.Millisecond) } - test.Assert(t, found, fmt.Sprintf("Failed to find expected spans in Jaeger for trace %s", traceID)) - - test.AssertEquals(t, len(trace.Warnings), 0) - for _, span := range trace.Spans { - for _, warning := range span.Warnings { - if strings.Contains(warning, "clock skew adjustment disabled; not applying calculated delta") { - continue - } - t.Errorf("Span %s (%s) warning: %v", span.SpanID, span.OperationName, warning) - } - } + test.Assert(t, found, fmt.Sprintf("Failed to find expected spans in ClickHouse for trace %s", traceID)) } func traceIssuingTestCert(t *testing.T) trace.TraceID { - // Configure this integration test to trace to jaeger:4317 like Boulder will + // Configure this integration test to trace to otel-collector:4317 like Boulder will shutdown := cmd.NewOpenTelemetry(cmd.OpenTelemetryConfig{ Endpoint: "otel-collector:4317", SampleRatio: 1,