diff --git a/.chloggen/tailsamplingprocessor-fixed-sampling-decision-metrics.yaml b/.chloggen/tailsamplingprocessor-fixed-sampling-decision-metrics.yaml
new file mode 100644
index 000000000000..ef9aecbb6490
--- /dev/null
+++ b/.chloggen/tailsamplingprocessor-fixed-sampling-decision-metrics.yaml
@@ -0,0 +1,32 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: bug_fix
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: tailsamplingprocessor
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Fixed the sampling decision metrics `otelcol_processor_tail_sampling_sampling_trace_dropped_too_early` and `otelcol_processor_tail_sampling_sampling_policy_evaluation_error_total`, which were sometimes overcounted.
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [37212]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: |
+  As a result of this change, non-zero values of the `otelcol_processor_tail_sampling_sampling_trace_dropped_too_early`
+  and `otelcol_processor_tail_sampling_sampling_policy_evaluation_error_total` metrics will be lower.
+  Before this fix, errors were counted several times, depending on the number of traces being processed
+  in that tick and where in the batch the error happened.
+  Zero values are unaffected.
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [user]
diff --git a/processor/tailsamplingprocessor/processor.go b/processor/tailsamplingprocessor/processor.go
index e96a55483ce2..400474450efb 100644
--- a/processor/tailsamplingprocessor/processor.go
+++ b/processor/tailsamplingprocessor/processor.go
@@ -344,7 +344,6 @@ func (tsp *tailSamplingSpanProcessor) samplingPolicyOnTick() {
 		decision := tsp.makeDecision(id, trace, &metrics)
 
 		tsp.telemetry.ProcessorTailSamplingSamplingDecisionTimerLatency.Record(tsp.ctx, int64(time.Since(startTime)/time.Microsecond))
-		tsp.telemetry.ProcessorTailSamplingSamplingTracesOnMemory.Record(tsp.ctx, int64(tsp.numTracesOnMap.Load()))
 		tsp.telemetry.ProcessorTailSamplingGlobalCountTracesSampled.Add(tsp.ctx, 1, decisionToAttribute[decision])
 
 		// Sampled or not, remove the batches
@@ -362,6 +361,7 @@ func (tsp *tailSamplingSpanProcessor) samplingPolicyOnTick() {
 		}
 	}
 
+	tsp.telemetry.ProcessorTailSamplingSamplingTracesOnMemory.Record(tsp.ctx, int64(tsp.numTracesOnMap.Load()))
 	tsp.telemetry.ProcessorTailSamplingSamplingTraceDroppedTooEarly.Add(tsp.ctx, metrics.idNotFoundOnMapCount)
 	tsp.telemetry.ProcessorTailSamplingSamplingPolicyEvaluationError.Add(tsp.ctx, metrics.evaluateErrorCount)
 
diff --git a/processor/tailsamplingprocessor/processor_telemetry_test.go b/processor/tailsamplingprocessor/processor_telemetry_test.go
index 924842251963..c07c5f95a84e 100644
--- a/processor/tailsamplingprocessor/processor_telemetry_test.go
+++ b/processor/tailsamplingprocessor/processor_telemetry_test.go
@@ -20,6 +20,8 @@ import (
 	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
 	"go.opentelemetry.io/otel/sdk/metric/metricdata"
 	"go.opentelemetry.io/otel/sdk/metric/metricdata/metricdatatest"
+
+	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl"
 )
 
 func TestMetricsAfterOneEvaluation(t *testing.T) {
@@ -522,6 +524,138 @@ func TestProcessorTailSamplingSamplingLateSpanAge(t *testing.T) {
 	metricdatatest.AssertEqual(t, m, got, metricdatatest.IgnoreTimestamp())
 }
 
+func TestProcessorTailSamplingSamplingTraceDroppedTooEarly(t *testing.T) {
+	// prepare
+	s := setupTestTelemetry()
+	b := newSyncIDBatcher()
+	syncBatcher := b.(*syncIDBatcher)
+
+	cfg := Config{
+		DecisionWait: 1,
+		NumTraces:    2,
+		PolicyCfgs: []PolicyCfg{
+			{
+				sharedPolicyCfg: sharedPolicyCfg{
+					Name: "always",
+					Type: AlwaysSample,
+				},
+			},
+		},
+	}
+	cs := &consumertest.TracesSink{}
+	ct := s.newSettings()
+	proc, err := newTracesProcessor(context.Background(), ct, cs, cfg, withDecisionBatcher(syncBatcher))
+	require.NoError(t, err)
+	defer func() {
+		err = proc.Shutdown(context.Background())
+		require.NoError(t, err)
+	}()
+
+	err = proc.Start(context.Background(), componenttest.NewNopHost())
+	require.NoError(t, err)
+
+	// test
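+	// NumTraces is 2, so consuming a third trace evicts the oldest one from the
+	// in-memory trace map before its decision tick; it should be counted as
+	// dropped too early exactly once.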
+	_, batches := generateIDsAndBatches(3)
+	for _, batch := range batches {
+		err = proc.ConsumeTraces(context.Background(), batch)
+		require.NoError(t, err)
+	}
+
+	tsp := proc.(*tailSamplingSpanProcessor)
+	tsp.policyTicker.OnTick() // the first tick always gets an empty batch
+	tsp.policyTicker.OnTick()
+
+	// verify
+	var md metricdata.ResourceMetrics
+	require.NoError(t, s.reader.Collect(context.Background(), &md))
+
+	m := metricdata.Metrics{
+		Name:        "otelcol_processor_tail_sampling_sampling_trace_dropped_too_early",
+		Description: "Count of traces that needed to be dropped before the configured wait time",
+		Unit:        "{traces}",
+		Data: metricdata.Sum[int64]{
+			IsMonotonic: true,
+			Temporality: metricdata.CumulativeTemporality,
+			DataPoints: []metricdata.DataPoint[int64]{
+				{
+					Value: 1,
+				},
+			},
+		},
+	}
+
+	got := s.getMetric(m.Name, md)
+	metricdatatest.AssertEqual(t, m, got, metricdatatest.IgnoreTimestamp())
+}
+
+func TestProcessorTailSamplingSamplingPolicyEvaluationError(t *testing.T) {
+	// prepare
+	s := setupTestTelemetry()
+	b := newSyncIDBatcher()
+	syncBatcher := b.(*syncIDBatcher)
+
+	cfg := Config{
+		DecisionWait: 1,
+		NumTraces:    100,
+		PolicyCfgs: []PolicyCfg{
+			{
+				sharedPolicyCfg: sharedPolicyCfg{
+					Name: "ottl",
+					Type: OTTLCondition,
+					OTTLConditionCfg: OTTLConditionCfg{
+						ErrorMode:      ottl.PropagateError,
+						SpanConditions: []string{"attributes[1] == \"test\""},
+					},
+				},
+			},
+		},
+	}
+	cs := &consumertest.TracesSink{}
+	ct := s.newSettings()
+	proc, err := newTracesProcessor(context.Background(), ct, cs, cfg, withDecisionBatcher(syncBatcher))
+	require.NoError(t, err)
+	defer func() {
+		err = proc.Shutdown(context.Background())
+		require.NoError(t, err)
+	}()
+
+	err = proc.Start(context.Background(), componenttest.NewNopHost())
+	require.NoError(t, err)
+
+	// test
+	_, batches := generateIDsAndBatches(2)
+	for _, batch := range batches {
+		err = proc.ConsumeTraces(context.Background(), batch)
+		require.NoError(t, err)
+	}
+
+	tsp := proc.(*tailSamplingSpanProcessor)
+	tsp.policyTicker.OnTick() // the first tick always gets an empty batch
+	tsp.policyTicker.OnTick()
+
+	// verify
+	var md metricdata.ResourceMetrics
+	require.NoError(t, s.reader.Collect(context.Background(), &md))
+
+	m := metricdata.Metrics{
+		Name:        "otelcol_processor_tail_sampling_sampling_policy_evaluation_error",
+		Description: "Count of sampling policy evaluation errors",
+		Unit:        "{errors}",
+		Data: metricdata.Sum[int64]{
+			IsMonotonic: true,
+			Temporality: metricdata.CumulativeTemporality,
+			DataPoints: []metricdata.DataPoint[int64]{
+				{
+					Value: 2,
+				},
+			},
+		},
+	}
+
+	got := s.getMetric(m.Name, md)
+	metricdatatest.AssertEqual(t, m, got, metricdatatest.IgnoreTimestamp())
+}
+
 type testTelemetry struct {
 	reader        *sdkmetric.ManualReader
 	meterProvider *sdkmetric.MeterProvider