diff --git a/docker-compose.yml b/docker-compose.yml index d683f856837..78d51199177 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,6 +20,7 @@ services: - .:/boulder:cached - ./.gocache:/root/.cache/go-build:cached - ./test/certs/.softhsm-tokens/:/var/lib/softhsm/tokens/:cached + - logs:/var/log/ networks: bouldernet: ipv4_address: 10.77.77.77 @@ -56,7 +57,7 @@ services: - bredis_1 - bredis_2 - bconsul - - bjaeger + - otel-collector - bpkimetal entrypoint: test/entrypoint.sh working_dir: &boulder_working_dir /boulder @@ -135,8 +136,58 @@ services: ipv4_address: 10.77.77.10 command: "consul agent -dev -config-format=hcl -config-file=/test/consul/config.hcl" - bjaeger: - image: jaegertracing/all-in-one:1.50 + clickhouse: + image: clickhouse/clickhouse-server:latest + container_name: clickhouse + ports: + - "8123:8123" # HTTP interface + - "9000:9000" # Native interface + environment: + CLICKHOUSE_DB: otel + volumes: + - clickhouse_data:/var/lib/clickhouse + - ./test/clickhouse/users.xml:/etc/clickhouse-server/users.d/users.xml:ro + ulimits: + nofile: + soft: 262144 + hard: 262144 + restart: unless-stopped + networks: + - bouldernet + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + container_name: otel-collector + command: ["--config=/etc/otel-collector-config.yaml", "--feature-gates=clickhouse.json"] + volumes: + - logs:/var/log/boulder/ + - ./test/otelcol/config.yaml:/etc/otel-collector-config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver + - "8888:8888" # Prometheus metrics + - "8889:8889" # Prometheus exporter metrics + - "5514:5514/udp" # Syslog receiver + depends_on: + - clickhouse + restart: unless-stopped + networks: + - bouldernet + + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin_very_bad_password + - GF_INSTALL_PLUGINS=grafana-clickhouse-datasource + volumes: + - 
grafana_data:/var/lib/grafana + - ./test/grafana/provisioning:/etc/grafana/provisioning:ro + depends_on: + - clickhouse + restart: unless-stopped networks: - bouldernet @@ -162,6 +213,11 @@ services: aliases: - boulder-vitess +volumes: + clickhouse_data: + grafana_data: + logs: + networks: # This network represents the data-center internal network. It is used for # boulder services and their infrastructure, such as consul, mariadb, and diff --git a/test/clickhouse/users.xml b/test/clickhouse/users.xml new file mode 100644 index 00000000000..9f23cd1a0d1 --- /dev/null +++ b/test/clickhouse/users.xml @@ -0,0 +1,29 @@ +<?xml version="1.0"?> +<clickhouse> + <users> + <default> + <password>default_user_very_bad_password</password> + <networks> + <ip>::/0</ip> + </networks> + <profile>default</profile> + <quota>default</quota> + </default> + <otel_writer> + <password>otel_writer_very_bad_password</password> + <networks> + <ip>::/0</ip> + </networks> + <profile>default</profile> + <quota>default</quota> + </otel_writer> + <grafana_reader> + <password>grafana_reader_very_bad_password</password> + <networks> + <ip>::/0</ip> + </networks> + <profile>default</profile> + <quota>default</quota> + </grafana_reader> + </users> +</clickhouse> diff --git a/test/config-next/admin.json b/test/config-next/admin.json index c0775344223..76dc03d383d 100644 --- a/test/config-next/admin.json +++ b/test/config-next/admin.json @@ -36,7 +36,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/bad-key-revoker.json b/test/config-next/bad-key-revoker.json index b031643865b..9e3e83ef335 100644 --- a/test/config-next/bad-key-revoker.json +++ b/test/config-next/bad-key-revoker.json @@ -30,7 +30,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/ca.json b/test/config-next/ca.json index ada60732617..445b4bf663d 100644 --- a/test/config-next/ca.json +++ b/test/config-next/ca.json @@ -202,7 +202,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/crl-storer.json b/test/config-next/crl-storer.json index 0934bcef071..991cadf54c6 100644 --- a/test/config-next/crl-storer.json +++ 
b/test/config-next/crl-storer.json @@ -38,7 +38,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/crl-updater.json b/test/config-next/crl-updater.json index 6bbb8a88b9f..4efb4fed692 100644 --- a/test/config-next/crl-updater.json +++ b/test/config-next/crl-updater.json @@ -59,7 +59,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/log-validator.json b/test/config-next/log-validator.json index 687058f6829..c46b8bb5e5f 100644 --- a/test/config-next/log-validator.json +++ b/test/config-next/log-validator.json @@ -3,7 +3,7 @@ "stdoutLevel": 7 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "files": [ diff --git a/test/config-next/nonce-a.json b/test/config-next/nonce-a.json index d14b44063f2..bf04fe52ad2 100644 --- a/test/config-next/nonce-a.json +++ b/test/config-next/nonce-a.json @@ -9,7 +9,7 @@ "syslogLevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "grpc": { diff --git a/test/config-next/nonce-b.json b/test/config-next/nonce-b.json index d14b44063f2..bf04fe52ad2 100644 --- a/test/config-next/nonce-b.json +++ b/test/config-next/nonce-b.json @@ -9,7 +9,7 @@ "syslogLevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "grpc": { diff --git a/test/config-next/publisher.json b/test/config-next/publisher.json index 3d0a0fb7e4e..16dc1d905d8 100644 --- a/test/config-next/publisher.json +++ b/test/config-next/publisher.json @@ -47,7 +47,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/ra.json b/test/config-next/ra.json index 
f9a8131f7d5..0e7a018c07f 100644 --- a/test/config-next/ra.json +++ b/test/config-next/ra.json @@ -183,7 +183,7 @@ "sysloglevel": 6 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/remoteva-a.json b/test/config-next/remoteva-a.json index f931ba183f3..d6b899bdd81 100644 --- a/test/config-next/remoteva-a.json +++ b/test/config-next/remoteva-a.json @@ -49,7 +49,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/remoteva-b.json b/test/config-next/remoteva-b.json index 937d6635d1c..8756037cda7 100644 --- a/test/config-next/remoteva-b.json +++ b/test/config-next/remoteva-b.json @@ -49,7 +49,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/remoteva-c.json b/test/config-next/remoteva-c.json index 6fc58b2b2dc..c650dcf05d7 100644 --- a/test/config-next/remoteva-c.json +++ b/test/config-next/remoteva-c.json @@ -49,7 +49,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/sa.json b/test/config-next/sa.json index f8c02f58fff..00c8ee3d6b1 100644 --- a/test/config-next/sa.json +++ b/test/config-next/sa.json @@ -57,7 +57,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/sfe.json b/test/config-next/sfe.json index 7d9fb08fec3..9cb6e2908e7 100644 --- a/test/config-next/sfe.json +++ b/test/config-next/sfe.json @@ -92,7 +92,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "openTelemetryHttpConfig": { diff --git a/test/config-next/va.json b/test/config-next/va.json index 
21cd801760e..a906beb6627 100644 --- a/test/config-next/va.json +++ b/test/config-next/va.json @@ -73,7 +73,7 @@ "sysloglevel": 6 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 } } diff --git a/test/config-next/wfe2.json b/test/config-next/wfe2.json index 32b12cfc8c6..c943798ddf1 100644 --- a/test/config-next/wfe2.json +++ b/test/config-next/wfe2.json @@ -156,7 +156,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "openTelemetryHttpConfig": { diff --git a/test/config/log-validator.json b/test/config/log-validator.json index 687058f6829..c46b8bb5e5f 100644 --- a/test/config/log-validator.json +++ b/test/config/log-validator.json @@ -3,7 +3,7 @@ "stdoutLevel": 7 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "files": [ diff --git a/test/config/sfe.json b/test/config/sfe.json index 73aa1f58efc..514a003d539 100644 --- a/test/config/sfe.json +++ b/test/config/sfe.json @@ -39,7 +39,7 @@ "sysloglevel": -1 }, "openTelemetry": { - "endpoint": "bjaeger:4317", + "endpoint": "otel-collector:4317", "sampleratio": 1 }, "openTelemetryHttpConfig": { diff --git a/test/grafana/provisioning/datasources/clickhouse.yaml b/test/grafana/provisioning/datasources/clickhouse.yaml new file mode 100644 index 00000000000..76b6db108b5 --- /dev/null +++ b/test/grafana/provisioning/datasources/clickhouse.yaml @@ -0,0 +1,27 @@ +apiVersion: 1 + +datasources: + - name: ClickHouse + type: grafana-clickhouse-datasource + jsonData: + defaultDatabase: otel + port: 9000 + server: clickhouse + username: grafana_reader + logs: + defaultDatabase: otel + defaultTable: otel_logs + otelEnabled: true + otelVersion: latest + selectContextColumns: true + contextColumns: + - ServiceName + traces: + defaultDatabase: otel + defaultTable: otel_traces + otelEnabled: true + otelVersion: latest + secureJsonData: + 
password: grafana_reader_very_bad_password + isDefault: true + editable: true diff --git a/test/integration/otel_test.go b/test/integration/otel_test.go index b3d3ce48635..54915077732 100644 --- a/test/integration/otel_test.go +++ b/test/integration/otel_test.go @@ -9,8 +9,8 @@ import ( "crypto/rand" "encoding/json" "fmt" - "io" "net/http" + "net/url" "os" "strings" "testing" @@ -27,82 +27,45 @@ import ( "github.com/letsencrypt/boulder/test" ) -// TraceResponse is the list of traces returned from Jaeger's trace search API -// We always search for a single trace by ID, so this should be length 1. -// This is a specialization of Jaeger's structuredResponse type which -// uses []interface{} upstream. -type TraceResponse struct { - Data []Trace -} - -// Trace represents a single trace in Jaeger's API -// See https://pkg.go.dev/github.com/jaegertracing/jaeger/model/json#Trace +// Trace is the list of traces returned from ClickHouse for this trace type Trace struct { - TraceID string - Spans []Span - Processes map[string]struct { - ServiceName string - } - Warnings []string + Data []Span } -// Span represents a single span in Jaeger's API -// See https://pkg.go.dev/github.com/jaegertracing/jaeger/model/json#Span +// Span in clickhouse results type Span struct { - SpanID string - OperationName string - Warnings []string - ProcessID string - References []struct { - RefType string - TraceID string - SpanID string - } + TraceId string + SpanId string + ParentSpanId string + SpanName string + ServiceName string } -func getTraceFromJaeger(t *testing.T, traceID trace.TraceID) Trace { +func getTraceFromClickHouse(t *testing.T, traceID trace.TraceID) Trace { t.Helper() - traceURL := "http://bjaeger:16686/api/traces/" + traceID.String() - resp, err := http.Get(traceURL) - test.AssertNotError(t, err, "failed to trace from jaeger: "+traceID.String()) - if resp.StatusCode == http.StatusNotFound { - t.Fatalf("jaeger returned 404 for trace %s", traceID) - } - test.AssertEquals(t, 
resp.StatusCode, http.StatusOK) - body, err := io.ReadAll(resp.Body) - test.AssertNotError(t, err, "failed to read trace body") + query := fmt.Sprintf("SELECT TraceId, SpanId, ParentSpanId, SpanName, ServiceName FROM otel.otel_traces WHERE TraceId = '%s'", traceID.String()) + clickhouseURL := fmt.Sprintf("http://clickhouse:8123/?default_format=JSON&query=%s", url.QueryEscape(query)) - var parsed TraceResponse - err = json.Unmarshal(body, &parsed) - test.AssertNotError(t, err, "failed to decode traces body") + req, err := http.NewRequest("GET", clickhouseURL, nil) + test.AssertNotError(t, err, "failed to create request") + req.SetBasicAuth("default", "default_user_very_bad_password") - if len(parsed.Data) != 1 { - t.Fatalf("expected to get exactly one trace from jaeger for %s: %v", traceID, parsed) - } + resp, err := http.DefaultClient.Do(req) + test.AssertNotError(t, err, "failed to query clickhouse") + defer resp.Body.Close() + test.AssertEquals(t, resp.StatusCode, http.StatusOK) - return parsed.Data[0] -} + var response Trace + test.AssertNotError(t, json.NewDecoder(resp.Body).Decode(&response), "failed to decode clickhouse response") -type expectedSpans struct { - Operation string - Service string - Children []expectedSpans + return response } -// isParent returns true if the given span has a parent of ParentID -// The empty string means no ParentID -func isParent(parentID string, span Span) bool { - if len(span.References) == 0 { - return parentID == "" - } - for _, ref := range span.References { - // In OpenTelemetry, CHILD_OF is the only reference, but Jaeger supports other systems. 
- if ref.RefType == "CHILD_OF" { - return ref.SpanID == parentID - } - } - return false +type expectedSpans struct { + SpanName string + Service string + Children []expectedSpans } func missingChildren(trace Trace, spanID string, children []expectedSpans) bool { @@ -117,24 +80,24 @@ func missingChildren(trace Trace, spanID string, children []expectedSpans) bool // findSpans checks if the expectedSpan and its expected children are found in trace func findSpans(trace Trace, parentSpan string, expectedSpan expectedSpans) bool { - for _, span := range trace.Spans { - if !isParent(parentSpan, span) { + for _, span := range trace.Data { + if span.ParentSpanId != parentSpan { continue } - if trace.Processes[span.ProcessID].ServiceName != expectedSpan.Service { + if span.ServiceName != expectedSpan.Service { continue } - if span.OperationName != expectedSpan.Operation { + if span.SpanName != expectedSpan.SpanName { continue } - if missingChildren(trace, span.SpanID, expectedSpan.Children) { + if missingChildren(trace, span.SpanId, expectedSpan.Children) { continue } // This span has the correct parent, service, operation, and children return true } - fmt.Printf("did not find span %s::%s with parent '%s'\n", expectedSpan.Service, expectedSpan.Operation, parentSpan) + fmt.Printf("did not find span %s::%s with parent '%s'\n", expectedSpan.Service, expectedSpan.SpanName, parentSpan) return false } @@ -165,13 +128,13 @@ func (c *ContextInjectingRoundTripper) RoundTrip(request *http.Request) (*http.R // rpcSpan is a helper for constructing an RPC span where we have both a client and server rpc operation func rpcSpan(op, client, server string, children ...expectedSpans) expectedSpans { return expectedSpans{ - Operation: op, - Service: client, + SpanName: op, + Service: client, Children: []expectedSpans{ { - Operation: op, - Service: server, - Children: children, + SpanName: op, + Service: server, + Children: children, }, }, } @@ -179,8 +142,8 @@ func rpcSpan(op, client, server 
string, children ...expectedSpans) expectedSpans func httpSpan(endpoint string, children ...expectedSpans) expectedSpans { return expectedSpans{ - Operation: endpoint, - Service: "boulder-wfe2", + SpanName: endpoint, + Service: "boulder-wfe2", Children: append(children, rpcSpan("nonce.NonceService/Nonce", "boulder-wfe2", "nonce-service"), rpcSpan("nonce.NonceService/Redeem", "boulder-wfe2", "nonce-service"), @@ -190,9 +153,9 @@ func httpSpan(endpoint string, children ...expectedSpans) expectedSpans { func redisPipelineSpan(op, service string, children ...expectedSpans) expectedSpans { return expectedSpans{ - Operation: "redis.pipeline " + op, - Service: service, - Children: children, + SpanName: "redis.pipeline " + op, + Service: service, + Children: children, } } @@ -213,11 +176,11 @@ func TestTraces(t *testing.T) { // flow: just enough to ensure that our otel tracing is working without // asserting too much about the exact set of RPCs we use under the hood. expectedSpans := expectedSpans{ - Operation: "TraceTest", - Service: "integration.test", + SpanName: "TraceTest", + Service: "integration.test", Children: []expectedSpans{ - {Operation: "/directory", Service: wfe}, - {Operation: "/acme/new-nonce", Service: wfe, Children: []expectedSpans{ + {SpanName: "/directory", Service: wfe}, + {SpanName: "/acme/new-nonce", Service: wfe, Children: []expectedSpans{ rpcSpan("nonce.NonceService/Nonce", wfe, "nonce-service")}}, httpSpan("/acme/new-acct", redisPipelineSpan("get", wfe)), @@ -238,30 +201,20 @@ func TestTraces(t *testing.T) { found := false const retries = 10 for range retries { - trace := getTraceFromJaeger(t, traceID) + trace = getTraceFromClickHouse(t, traceID) if findSpans(trace, "", expectedSpans) { found = true break } time.Sleep(sdktrace.DefaultScheduleDelay / 5 * time.Millisecond) } - test.Assert(t, found, fmt.Sprintf("Failed to find expected spans in Jaeger for trace %s", traceID)) - - test.AssertEquals(t, len(trace.Warnings), 0) - for _, span := range 
trace.Spans { - for _, warning := range span.Warnings { - if strings.Contains(warning, "clock skew adjustment disabled; not applying calculated delta") { - continue - } - t.Errorf("Span %s (%s) warning: %v", span.SpanID, span.OperationName, warning) - } - } + test.Assert(t, found, fmt.Sprintf("Failed to find expected spans in ClickHouse for trace %s", traceID)) } func traceIssuingTestCert(t *testing.T) trace.TraceID { - // Configure this integration test to trace to jaeger:4317 like Boulder will + // Configure this integration test to trace to otel-collector:4317 like Boulder will shutdown := cmd.NewOpenTelemetry(cmd.OpenTelemetryConfig{ - Endpoint: "bjaeger:4317", + Endpoint: "otel-collector:4317", SampleRatio: 1, }, blog.Get()) defer shutdown(context.Background()) diff --git a/test/otelcol/config.yaml b/test/otelcol/config.yaml new file mode 100644 index 00000000000..75b89440214 --- /dev/null +++ b/test/otelcol/config.yaml @@ -0,0 +1,95 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + http: + endpoint: "0.0.0.0:4318" + syslog: + udp: + listen_address: "0.0.0.0:5514" + protocol: rfc3164 + filelog: + include: + - /var/log/boulder/boulder-*.log + - /var/log/boulder/crl-storer.log + - /var/log/boulder/log-validator.log + - /var/log/boulder/nonce-service.log + operators: + - type: regex_parser + regex: '^(?P<timestamp>\S+) (?P<hostname>\S+) (?P<datacenter>\S+) (?P<syslogseverity>\d+) (?P<service>[^\[]+)\[(?P<pid>\d+)\]: (?P<checksum>\S+) (?P<msg>.*?)$' + timestamp: + parse_from: attributes.timestamp + layout: '%Y-%m-%dT%H:%M:%S.%f%j' + severity: + parse_from: attributes.syslogseverity + preset: none + overwrite_text: true + mapping: # Map syslog levels to otel ones + fatal: 0 # syslog: emerg + error3: 1 # syslog: alert + error2: 2 # syslog: crit + error: 3 # syslog: err + warn: 4 # syslog: warning + info2: 5 # syslog: notice + info: 6 # syslog: info + debug: 7 # syslog: debug + - type: remove + field: attributes.timestamp + - type: remove + field: attributes.syslogseverity + - type: move + from: attributes.msg + to: body 
+ - type: move + from: attributes.service + to: resource["service.name"] + - type: move # TODO: Parse integer, per semantic conventions + from: attributes.pid + to: resource["process.pid"] + +processors: + batch: + +exporters: + clickhouse: + endpoint: http://clickhouse:8123 + database: otel + username: otel_writer + password: otel_writer_very_bad_password + logs_table_name: otel_logs + traces_table_name: otel_traces + metrics_table_name: otel_metrics + timeout: 5s + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + prometheus: + namespace: "something" + endpoint: "0.0.0.0:8889" + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [clickhouse] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [clickhouse, prometheus] + logs: + receivers: [otlp, syslog, filelog] + processors: [batch] + exporters: [clickhouse] + telemetry: + metrics: + readers: + - pull: + exporter: + prometheus: + host: '0.0.0.0' + port: 8888