DataDog · dougqh · May 18, 2026 · May 20, 2026 · Jun 3, 2026 · Jun 3, 2026
@@ -78,6 +78,7 @@ public final class GeneralConfig {
       "trace.tracer.metrics.ignored.resources";
   public static final String TRACE_STATS_CARDINALITY_LIMITS_ENABLED =
       "trace.stats.cardinality.limits.enabled";
+  public static final String TRACE_STATS_ADDITIONAL_TAGS = "trace.stats.additional.tags";
 
   public static final String AZURE_APP_SERVICES = "azure.app.services";
   public static final String INTERNAL_EXIT_ON_FAILURE = "trace.internal.exit.on.failure";

@@ -0,0 +1,154 @@
+package datadog.trace.common.metrics;
+
+import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND;
+import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT;
+import static java.util.concurrent.TimeUnit.SECONDS;
+
+import datadog.trace.api.WellKnownTags;
+import datadog.trace.core.CoreSpan;
+import de.thetaphi.forbiddenapis.SuppressForbidden;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.concurrent.ThreadLocalRandom;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * JMH benchmark exercising the span-derived primary tags pipeline added in CSS v1.3.0. Parallel to
+ * {@link AdversarialMetricsBenchmark} but configures two additional-tag keys (each with a per-key
+ * cardinality cap of {@link MetricCardinalityLimits#ADDITIONAL_TAG_VALUE}) and generates unique
+ * values per op so the cap saturates fast. The benchmark measures the cost of:
+ *
+ * <ul>
+ *   <li>producer-side capture: {@code ClientStatsAggregator.captureAdditionalTagValues} walks the
+ *       schema and pulls each key via {@code unsafeGetTag}.
+ *   <li>aggregator-side canonicalization: {@code AdditionalTagsSchema.register(i, value)} runs a
+ *       {@link TagCardinalityHandler} probe + insert, returning the per-key blocked sentinel once
+ *       the per-cycle value budget is exhausted.
+ *   <li>cycle-reset flush: at every reporting cycle, the schema fires one {@code
+ *       HealthMetrics.onTagCardinalityBlocked(name, count)} per affected key.
+ * </ul>
+ *
+ * <p>The aim is not absolute throughput numbers but a regression guard for the additional-tags hot
+ * path: any future refactor that adds a tag-map lookup, allocates per call, or pulls the
+ * sentinel-materialization onto the hot path should show up as a step change here.
+ *
+ * <p><b>Interpreting the {@code limitsEnabled} parameter.</b> The two arms are NOT a fair "cost of
+ * limiting" comparison and should not be read as one. With this benchmark's unbounded distinct
+ * values, the per-key budget saturates almost immediately and the two modes diverge into different
+ * downstream behavior, not just a different branch in {@code register}:
+ *
+ * <ul>
+ *   <li>{@code limitsEnabled=true}: every over-budget span collapses to the single {@code
+ *       "<key>:blocked_by_tracer"} sentinel entry, so {@code findOrInsert} always hits a live entry
+ *       and {@code recordOneDuration} (a DDSketch histogram insert) runs for every drained span.
+ *   <li>{@code limitsEnabled=false}: every over-budget value canonicalizes to a distinct entry, so
+ *       the table saturates at {@code maxAggregates} and most subsequent spans are dropped at
+ *       {@code findOrInsert} -- never reaching the histogram.
+ * </ul>
+ *
+ * <p>So {@code limitsEnabled=true} measures lower throughput here precisely because it does MORE
+ * useful work per span (it keeps the masked data and records it) where the disabled arm drops the
+ * overflow. A 2026-06-03 run (3 forks, -prof gc) measured {@code false} at ~19.8M ops/s / 820 B/op
+ * and {@code true} at ~12.4M ops/s / 888 B/op -- the higher per-op allocation under limits is the
+ * histogram recording, not the sentinel path. Throughput CIs were wide (>20%); the per-op
+ * allocation figures are the reliable signal. A production workload with a bounded value set never
+ * saturates the budget and sees neither arm's overflow behavior (cf. {@code
+ * HighCardinalityResourceMetricsBenchmark}, which is at parity with limits on/off).
+ */
+@State(Scope.Benchmark)
+@Warmup(iterations = 2, time = 15, timeUnit = SECONDS)
+@Measurement(iterations = 5, time = 15, timeUnit = SECONDS)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(SECONDS)
+@Threads(8)
+@Fork(value = 1)
+public class AdditionalTagsMetricsBenchmark {
+
+  // Shared by all benchmark threads (Scope.Benchmark); built in setup(), closed in tearDown().
+  private ClientStatsAggregator aggregator;
+  // Counting HealthMetrics handed to both the schema and the aggregator; read in tearDown().
+  private AdversarialMetricsBenchmark.CountingHealthMetrics health;
+
+  /**
+   * Whether the {@link TagCardinalityHandler}s inside {@link AdditionalTagsSchema} substitute the
+   * {@code blocked_by_tracer} sentinel once the per-key budget is exhausted. Passed to {@code
+   * AdditionalTagsSchema.from} in {@link #setup()}. JMH runs each parameter value as its own trial
+   * in a separately forked JVM (one fork per value under {@code @Fork(1)}), so the two modes are
+   * measured back to back in different JVM instances, not inside a single fork.
+   */
+  @Param({"false", "true"})
+  public boolean limitsEnabled;
+
+  /**
+   * Per-thread state: a counter that {@link #publish} post-increments to derive the tag values for
+   * each op. Every thread's counter starts at 0, so all threads generate the same value sequence.
+   */
+  @State(Scope.Thread)
+  public static class ThreadState {
+    int cursor;
+  }
+
+  /**
+   * Builds the counting health metrics, the two-key additional-tags schema, and the aggregator
+   * under test, then starts the aggregator.
+   */
+  @Setup
+  public void setup() {
+    this.health = new AdversarialMetricsBenchmark.CountingHealthMetrics();
+    // Two configured additional tags. Each key gets a TagCardinalityHandler capped at
+    // MetricCardinalityLimits.ADDITIONAL_TAG_VALUE (512) distinct values per cycle. The benchmark
+    // generates up to 65k distinct values per key so the cap saturates quickly; with
+    // limitsEnabled=true most ops then return the blocked sentinel -- that's the contention we
+    // want to measure. With limitsEnabled=false no sentinel is substituted (see class Javadoc).
+    AdditionalTagsSchema additionalTagsSchema =
+        AdditionalTagsSchema.from(
+            new LinkedHashSet<>(Arrays.asList("region", "tenant_id")), this.health, limitsEnabled);
+    // NOTE(review): the meaning of the trailing 2048, 2048, false arguments is not visible here --
+    // presumably the aggregate-table and inbox sizes plus a flag; confirm against the
+    // ClientStatsAggregator constructor.
+    this.aggregator =
+        new ClientStatsAggregator(
+            new WellKnownTags("", "", "", "", "", ""),
+            Collections.emptySet(),
+            additionalTagsSchema,
+            new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery(
+                Collections.singleton("peer.hostname"), Collections.emptySet()),
+            this.health,
+            new ClientStatsAggregatorBenchmark.NullSink(),
+            2048,
+            2048,
+            false);
+    this.aggregator.start();
+  }
+
+  /**
+   * Closes the aggregator and prints the inbox-full and aggregate-dropped counters collected by
+   * {@link #health} to stderr. The forbidden-API check is suppressed for the direct {@code
+   * System.err} use.
+   */
+  @TearDown
+  @SuppressForbidden
+  public void tearDown() {
+    aggregator.close();
+    System.err.println("[ADDITIONAL-TAGS] counters (across all threads, single fork):");
+    System.err.println("  onStatsInboxFull         = " + health.inboxFull.sum());
+    System.err.println("  onStatsAggregateDropped  = " + health.aggregateDropped.sum());
+  }
+
+  /**
+   * One benchmark op: builds a single-span trace carrying fresh {@code region} and {@code
+   * tenant_id} tag values plus a random duration, and publishes it to the aggregator.
+   *
+   * @param ts per-thread cursor used to derive the tag values
+   * @param blackhole consumes the result of {@code publish} so the call cannot be eliminated
+   */
+  @Benchmark
+  public void publish(ThreadState ts, Blackhole blackhole) {
+    int idx = ts.cursor++;
+    ThreadLocalRandom rng = ThreadLocalRandom.current();
+
+    // Distinct values per op -- up to 65,536 regions and 65,536 tenants, plus random durations.
+    // Both values are slices of the same scrambled index (bits 4..19 and bits 16..31, sharing
+    // bits 16..19), so the two keys are correlated rather than an independent cross product. With
+    // the per-key cap (ADDITIONAL_TAG_VALUE = 512), the first 512 distinct values per key admit;
+    // when limitsEnabled is true the rest collapse to the blocked sentinel and increment the
+    // per-tag block counter via the schema's flush path.
+    // Multiplying by an odd constant maps distinct int indices to distinct scrambled values.
+    int scrambled = idx * 0x9E3779B1;
+    String region = "region-" + ((scrambled >>> 4) & 0xFFFF);
+    String tenant = "tenant-" + ((scrambled >>> 16) & 0xFFFF);
+    // Duration is uniform over [1, 2^30] nanoseconds (about 1.07 s at the top end).
+    long durationNanos = 1L + (rng.nextLong() & 0x3FFFFFFFL);
+
+    SimpleSpan span =
+        new SimpleSpan("svc", "op", "res", "web", true, true, false, 0, durationNanos, 200);
+    span.setTag(SPAN_KIND, SPAN_KIND_CLIENT);
+    span.setTag("region", region);
+    span.setTag("tenant_id", tenant);
+
+    List<CoreSpan<?>> trace = Collections.singletonList(span);
+    blackhole.consume(aggregator.publish(trace));
+  }
+}
@@ -0,0 +1,165 @@
+package datadog.trace.common.metrics;
+
+import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString;
+import datadog.trace.core.monitor.HealthMetrics;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Schema describing the configured span-derived primary tag keys. Built once from {@code
+ * Config.getTraceStatsAdditionalTags()} at aggregator construction; not replaced at runtime. The
+ * key set and the sentinels are fixed at construction; the per-key handlers carry per-cycle state
+ * that {@link #resetHandlers()} clears.
+ *
+ * <p>Parallels {@link PeerTagSchema} for shape: a sorted, deduped, validated, capped {@code
+ * String[]} of names plus per-name {@link TagCardinalityHandler}s for UTF8 caching and value-level
+ * cardinality limiting. The handlers are reset each reporting cycle via {@link #resetHandlers()}.
+ *
+ * <p>What's pre-built:
+ *
+ * <ul>
+ *   <li>{@link #names} -- the sorted, deduped, validated, capped list of tag keys.
+ *   <li>{@link #blockedSentinels} -- one shared {@code UTF8BytesString("<key>:blocked_by_tracer")}
+ *       per configured key, returned when the per-tag cardinality budget is exhausted.
+ *   <li>{@link #handlers} -- one {@link TagCardinalityHandler} per key providing UTF8 reuse and
+ *       per-cycle cardinality limiting. Aggregator-thread-only; reset each cycle.
+ * </ul>
+ */
+final class AdditionalTagsSchema {
+
+  private static final Logger log = LoggerFactory.getLogger(AdditionalTagsSchema.class);
+
+  /**
+   * Backend stats pipeline supports a small number of primary tag dimensions (4 by default, up to
+   * ~10 for elevated quotas). Configuring more than this is misuse; we drop the overflow at
+   * startup.
+   */
+  static final int MAX_ADDITIONAL_TAG_KEYS = 10;
+
+  /** Value part of the {@code "<key>:blocked_by_tracer"} sentinel built for every key. */
+  static final String BLOCKED_VALUE = "blocked_by_tracer";
+
+  /** Singleton empty schema returned when no additional tags are configured. */
+  static final AdditionalTagsSchema EMPTY =
+      new AdditionalTagsSchema(
+          new String[0], new UTF8BytesString[0], new TagCardinalityHandler[0], HealthMetrics.NO_OP);
+
+  /** Accepted tag keys in natural {@code String} order; index-aligned with the other arrays. */
+  final String[] names;
+
+  /** Pre-built {@code "<key>:blocked_by_tracer"} sentinel per key, index-aligned with names. */
+  final UTF8BytesString[] blockedSentinels;
+
+  /** Per-key handlers providing UTF8 caching and per-cycle cardinality limiting. */
+  private final TagCardinalityHandler[] handlers;
+
+  /** Receives the per-key blocked counts flushed by {@link #resetHandlers()}. */
+  private final HealthMetrics healthMetrics;
+
+  private AdditionalTagsSchema(
+      String[] names,
+      UTF8BytesString[] blockedSentinels,
+      TagCardinalityHandler[] handlers,
+      HealthMetrics healthMetrics) {
+    this.names = names;
+    this.blockedSentinels = blockedSentinels;
+    this.handlers = handlers;
+    this.healthMetrics = healthMetrics;
+  }
+
+  /**
+   * Test convenience: builds a schema that reports to {@link HealthMetrics#NO_OP} and takes the
+   * sentinel setting from {@code AggregateEntry.LIMITS_ENABLED}.
+   *
+   * @param configured the configured tag keys; may be null or empty
+   * @return the schema produced by the three-argument factory
+   */
+  static AdditionalTagsSchema from(Set<String> configured) {
+    return from(configured, HealthMetrics.NO_OP, AggregateEntry.LIMITS_ENABLED);
+  }
+
+  /**
+   * Builds a schema that reports to {@code healthMetrics} and takes the sentinel setting from
+   * {@code AggregateEntry.LIMITS_ENABLED}.
+   *
+   * @param configured the configured tag keys; may be null or empty
+   * @param healthMetrics receiver of the blocked counts flushed by {@link #resetHandlers()}
+   * @return the schema produced by the three-argument factory
+   */
+  static AdditionalTagsSchema from(Set<String> configured, HealthMetrics healthMetrics) {
+    return from(configured, healthMetrics, AggregateEntry.LIMITS_ENABLED);
+  }
+
+  /**
+   * Builds a schema from the configured tag keys. Drops invalid keys (null, empty, or containing
+   * {@code :}) with a warning, sorts the rest in natural {@code String} order, dedupes, and caps
+   * at {@link #MAX_ADDITIONAL_TAG_KEYS}; when capping, the keys that sort last are dropped with a
+   * warning.
+   *
+   * @param configured the configured tag keys; may be null or empty
+   * @param healthMetrics receiver of the blocked counts flushed by {@link #resetHandlers()}
+   * @param useBlockedSentinel forwarded unchanged to every {@link TagCardinalityHandler}
+   * @return the shared {@link #EMPTY} schema when {@code configured} is null, empty, or contains
+   *     no valid key; otherwise a new schema over the accepted keys
+   */
+  static AdditionalTagsSchema from(
+      Set<String> configured, HealthMetrics healthMetrics, boolean useBlockedSentinel) {
+    if (configured == null || configured.isEmpty()) {
+      return EMPTY;
+    }
+    List<String> valid = filterValidKeys(configured);
+    if (valid.isEmpty()) {
+      return EMPTY;
+    }
+    List<String> keys = capToLimit(sortAndDedupe(valid));
+
+    String[] namesArr = keys.toArray(new String[0]);
+    UTF8BytesString[] sentinels = new UTF8BytesString[namesArr.length];
+    TagCardinalityHandler[] handlersArr = new TagCardinalityHandler[namesArr.length];
+    for (int i = 0; i < namesArr.length; i++) {
+      sentinels[i] = UTF8BytesString.create(namesArr[i] + ":" + BLOCKED_VALUE);
+      handlersArr[i] =
+          new TagCardinalityHandler(
+              namesArr[i], MetricCardinalityLimits.ADDITIONAL_TAG_VALUE, useBlockedSentinel);
+    }
+    return new AdditionalTagsSchema(namesArr, sentinels, handlersArr, healthMetrics);
+  }
+
+  /** Returns the keys that are non-null, non-empty and free of {@code :}, warning on the rest. */
+  private static List<String> filterValidKeys(Set<String> configured) {
+    List<String> accepted = new ArrayList<>();
+    for (String key : configured) {
+      if (key == null || key.isEmpty()) {
+        log.warn("Ignoring empty additional metric tag key");
+      } else if (key.contains(":")) {
+        log.warn("Ignoring additional metric tag key '{}': keys must not contain ':'", key);
+      } else {
+        accepted.add(key);
+      }
+    }
+    return accepted;
+  }
+
+  /** Sorts {@code keys} in place and returns a new list without adjacent equal entries. */
+  private static List<String> sortAndDedupe(List<String> keys) {
+    Collections.sort(keys);
+    // Sorting brings equal keys next to each other, so comparing with the previous one suffices.
+    List<String> distinct = new ArrayList<>(keys.size());
+    String previous = null;
+    for (String key : keys) {
+      if (!key.equals(previous)) {
+        distinct.add(key);
+        previous = key;
+      }
+    }
+    return distinct;
+  }
+
+  /** Truncates {@code keys} to the first {@link #MAX_ADDITIONAL_TAG_KEYS}, warning on overflow. */
+  private static List<String> capToLimit(List<String> keys) {
+    if (keys.size() <= MAX_ADDITIONAL_TAG_KEYS) {
+      return keys;
+    }
+    log.warn(
+        "Configured additional metric tag keys ({}) exceeds the supported limit of {}; "
+            + "dropping extra keys: {}",
+        keys.size(),
+        MAX_ADDITIONAL_TAG_KEYS,
+        keys.subList(MAX_ADDITIONAL_TAG_KEYS, keys.size()));
+    return keys.subList(0, MAX_ADDITIONAL_TAG_KEYS);
+  }
+
+  /** Returns the number of tag keys in this schema. */
+  int size() {
+    return names.length;
+  }
+
+  /** Returns the tag key at slot {@code i}. */
+  String name(int i) {
+    return names[i];
+  }
+
+  /** Returns the pre-built {@code "<key>:blocked_by_tracer"} sentinel for slot {@code i}. */
+  UTF8BytesString blockedSentinel(int i) {
+    return blockedSentinels[i];
+  }
+
+  /**
+   * Canonicalizes {@code value} for the additional tag at slot {@code i} by delegating to the
+   * per-key {@link TagCardinalityHandler}: provides UTF8 caching and returns the per-tag blocked
+   * sentinel when the per-cycle budget is exhausted. Null handling is the handler's: it returns
+   * {@link UTF8BytesString#EMPTY} for null inputs.
+   *
+   * @param i slot index, {@code 0 <= i < size()}
+   * @param value the raw tag value
+   * @return the handler's canonical form of {@code value}
+   */
+  UTF8BytesString register(int i, String value) {
+    return handlers[i].register(value);
+  }
+
+  /**
+   * Resets every handler's working set and flushes accumulated block counts to {@link
+   * HealthMetrics}. Must be called on the aggregator thread each cycle. Keys whose handler
+   * reports a zero count are not reported.
+   */
+  void resetHandlers() {
+    for (int i = 0; i < handlers.length; i++) {
+      long blocked = handlers[i].reset();
+      if (blocked > 0) {
+        healthMetrics.onTagCardinalityBlocked(names[i], blocked);
+      }
+    }
+  }
+}