buffering for telemetry + centralized metric constants - Metrics are buffered on the frontend for 5 seconds before being sent in batches to the `/_/fe_telemetry` endpoint. This reduces network traffic. - Telemetry constants in one place Change-Id: Ib7bd0b6aa089b1eb5e0966a500324dc96524502e Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/1077936 Commit-Queue: Sergei Rudenkov <sergeirudenkov@google.com> Reviewed-by: Marcin Mordecki <mordeckimarcin@google.com>
diff --git a/perf/modules/explore-multi-sk/explore-multi-sk.ts b/perf/modules/explore-multi-sk/explore-multi-sk.ts index c4d75b3..6746358 100644 --- a/perf/modules/explore-multi-sk/explore-multi-sk.ts +++ b/perf/modules/explore-multi-sk/explore-multi-sk.ts
@@ -49,7 +49,7 @@ Trace, TraceMetadata, } from '../json'; -import { recordSummary } from '../telemetry/telemetry'; +import { SummaryMetric, telemetry } from '../telemetry/telemetry'; import '../../../elements-sk/modules/spinner-sk'; import '../explore-simple-sk'; @@ -442,9 +442,13 @@ } catch (err: any) { errorMessage(err.message || "Something went wrong, can't plot the graphs."); } finally { - recordSummary('fe_multi_graph_data_load_time_s', (performance.now() - startTime) / 1000, { - url: window.location.href, - }); + telemetry.recordSummary( + SummaryMetric.MultiGraphDataLoadTime, + (performance.now() - startTime) / 1000, + { + url: window.location.href, + } + ); this.updateShortcutMultiview(); this.setProgress(''); this.checkDataLoaded();
diff --git a/perf/modules/plot-google-chart-sk/plot-google-chart-sk.ts b/perf/modules/plot-google-chart-sk/plot-google-chart-sk.ts index 21cdb8a..b3dd187 100644 --- a/perf/modules/plot-google-chart-sk/plot-google-chart-sk.ts +++ b/perf/modules/plot-google-chart-sk/plot-google-chart-sk.ts
@@ -19,7 +19,7 @@ import { define } from '../../../elements-sk/modules/define'; import { AnomalyMap } from '../json'; import { defaultColors, mainChartOptions } from '../common/plot-builder'; -import { recordSummary } from '../telemetry/telemetry'; +import { telemetry } from '../telemetry/telemetry'; import { dataframeAnomalyContext, dataframeUserIssueContext, @@ -33,6 +33,7 @@ import { VResizableBoxSk } from './v-resizable-box-sk'; import { SidePanelCheckboxClickDetails, SidePanelSk } from './side-panel-sk'; import { DragToZoomBox } from './drag-to-zoom-box-sk'; +import { SummaryMetric } from '../telemetry/telemetry'; export interface PlotSelectionEventDetails { value: range; @@ -51,8 +52,6 @@ export class PlotGoogleChartSk extends LitElement { private static readonly MOUSE_DOWN_HOLD_TIMEOUT = 3000; // 3 seconds - private static readonly GRAPH_PERF_METRIC_NAME = 'fe_google_graph_plot_time_s'; - private mouseDownTimeoutId: number | null = null; // TODO(b/362831653): Adjust height to 100% once plot-summary-sk is deprecated @@ -425,7 +424,7 @@ plot.view = view; this.updateOptions(); - recordSummary(PlotGoogleChartSk.GRAPH_PERF_METRIC_NAME, (performance.now() - start) / 1000, { + telemetry.recordSummary(SummaryMetric.GoogleGraphPlotTime, (performance.now() - start) / 1000, { type: 'update-data-view', }); } @@ -1059,7 +1058,7 @@ // them to the new locations, but the rendering internal may already do this for us. // We should only do this optimization if we see a performance issue. anomalyDiv.replaceChildren(...allDivs); - recordSummary(PlotGoogleChartSk.GRAPH_PERF_METRIC_NAME, (performance.now() - start) / 1000, { + telemetry.recordSummary(SummaryMetric.GoogleGraphPlotTime, (performance.now() - start) / 1000, { type: 'draw-anomaly', }); } @@ -1165,7 +1164,7 @@ }); userIssueDiv.replaceChildren(...allDivs); - recordSummary(PlotGoogleChartSk.GRAPH_PERF_METRIC_NAME, (performance.now() - start) / 1000, { + telemetry.recordSummary(SummaryMetric.GoogleGraphPlotTime, (performance.now() - start) / 1000, { type: 'draw-user-issues', }); } @@ -1228,7 +1227,7 @@ this.drawAnomaly(this.chart); this.drawUserIssues(this.chart); this.drawXbar(this.chart); - recordSummary(PlotGoogleChartSk.GRAPH_PERF_METRIC_NAME, (performance.now() - start) / 1000, { + telemetry.recordSummary(SummaryMetric.GoogleGraphPlotTime, (performance.now() - start) / 1000, { type: 'main-chart', }); } @@ -1272,7 +1271,7 @@ plot.options = options; } } - recordSummary(PlotGoogleChartSk.GRAPH_PERF_METRIC_NAME, (performance.now() - start) / 1000, { + telemetry.recordSummary(SummaryMetric.GoogleGraphPlotTime, (performance.now() - start) / 1000, { type: 'update-bounds', }); }
diff --git a/perf/modules/plot-summary-sk/plot-summary-sk.ts b/perf/modules/plot-summary-sk/plot-summary-sk.ts index 6815e0c..47e9de4 100644 --- a/perf/modules/plot-summary-sk/plot-summary-sk.ts +++ b/perf/modules/plot-summary-sk/plot-summary-sk.ts
@@ -32,7 +32,7 @@ } from '../dataframe/dataframe_context'; import { range } from '../dataframe/index'; import { define } from '../../../elements-sk/modules/define'; -import { recordSummary } from '../telemetry/telemetry'; +import { SummaryMetric, telemetry } from '../telemetry/telemetry'; import { style } from './plot-summary-sk.css'; import { HResizableBoxSk } from './h_resizable_box_sk'; @@ -133,7 +133,7 @@ view.setColumns(cols); plot.view = view; plot.options = options; - recordSummary('fe_google_graph_plot_time_s', (performance.now() - start) / 1000, { + telemetry.recordSummary(SummaryMetric.GoogleGraphPlotTime, (performance.now() - start) / 1000, { type: 'summary', }); }
diff --git a/perf/modules/regressions-page-sk/regressions-page-sk.ts b/perf/modules/regressions-page-sk/regressions-page-sk.ts index b99b62a..4111b3b 100644 --- a/perf/modules/regressions-page-sk/regressions-page-sk.ts +++ b/perf/modules/regressions-page-sk/regressions-page-sk.ts
@@ -19,7 +19,7 @@ import '@material/web/button/outlined-button.js'; import { HintableObject } from '../../../infra-sk/modules/hintable'; import { errorMessage } from '../errorMessage'; -import { increaseCounter } from '../telemetry/telemetry'; +import { CountMetric, telemetry } from '../telemetry/telemetry'; // State is the local UI state of regressions-page-sk interface State { @@ -184,7 +184,7 @@ await this.anomaliesTable!.populateTable(this.cpAnomalies); }) .catch((msg) => { - increaseCounter('fe_data_fetch_failure', { + telemetry.increaseCounter(CountMetric.DataFetchFailure, { page: 'regressions', endpoint: '/_/anomalies/anomaly_list', });
diff --git a/perf/modules/report-page-sk/report-page-sk.ts b/perf/modules/report-page-sk/report-page-sk.ts index 84a2d3c..326cc60 100644 --- a/perf/modules/report-page-sk/report-page-sk.ts +++ b/perf/modules/report-page-sk/report-page-sk.ts
@@ -18,7 +18,7 @@ import { upgradeProperty } from '../../../elements-sk/modules/upgradeProperty'; import '../../../elements-sk/modules/icons/camera-roll-icon-sk'; import { PlotSelectionEventDetails } from '../plot-google-chart-sk/plot-google-chart-sk'; -import { increaseCounter, recordSummary } from '../telemetry/telemetry'; +import { CountMetric, SummaryMetric, telemetry } from '../telemetry/telemetry'; const weekInSeconds = 7 * 24 * 60 * 60; @@ -230,7 +230,7 @@ this.setCurrentlyLoading(''); }) .catch((msg: any) => { - increaseCounter('fe_data_fetch_failure', { + telemetry.increaseCounter(CountMetric.DataFetchFailure, { page: 'report', endpoint: '/_/anomalies/group_report', }); @@ -264,10 +264,13 @@ this.anomalyTracker.setGraph(anomaly.id, graphElement); const listener = () => { - recordSummary( - 'fe_single_graph_load_time_s', + telemetry.recordSummary( + SummaryMetric.SingleGraphLoadTime, (performance.now() - startTime) / 1000, - { page: 'report', url: window.location.href } + { + page: 'report', + url: window.location.href, + } ); graphElement.removeEventListener('data-loaded', listener); loadedCount++;
diff --git a/perf/modules/telemetry/BUILD.bazel b/perf/modules/telemetry/BUILD.bazel index dae6480..15be920 100644 --- a/perf/modules/telemetry/BUILD.bazel +++ b/perf/modules/telemetry/BUILD.bazel
@@ -1,4 +1,16 @@ -load("//infra-sk:index.bzl", "ts_library") +load("//infra-sk:index.bzl", "karma_test", "ts_library") + +karma_test( + name = "telemetry_test", + src = "telemetry_test.ts", + deps = [ + ":telemetry_ts_lib", + "//:node_modules/@types/chai", + "//:node_modules/@types/sinon", + "//:node_modules/chai", + "//:node_modules/sinon", + ], +) ts_library( name = "telemetry_ts_lib",
diff --git a/perf/modules/telemetry/telemetry.ts b/perf/modules/telemetry/telemetry.ts index 6e8bc98..f07165e 100644 --- a/perf/modules/telemetry/telemetry.ts +++ b/perf/modules/telemetry/telemetry.ts
@@ -1,15 +1,20 @@ /** - * @fileoverview This file defines functions for sending frontend telemetry metrics. - * These metrics are used to track user interactions and performance within the - * application. + * @fileoverview This file defines a Telemetry class for sending frontend metrics. + * A singleton instance is exported for application-wide use. These metrics are + * used to track user interactions and performance. + * + * Metrics are buffered on the frontend for 5 seconds before being sent in batches + * to the `/_/fe_telemetry` endpoint. This reduces network traffic. Any pending metrics + * are also sent when the page visibility changes to 'hidden' (e.g., when the user + * navigates away or closes the tab) to prevent data loss. * * To add a new counter metric: - * 1. Add the metric name to the `CountMetricName` type. - * 2. Call the `increaseCounter` function with the new metric name and optional tags. + * 1. Add the metric name to the `CountMetric` enum. + * 2. Call `telemetry.increaseCounter()` with the new metric name and optional tags. * * To add a new summary metric: - * 1. Add the metric name to the `SummaryMetricName` type. - * 2. Call the `recordSummary` function with the new metric name, value, and optional tags. + * 1. Add the metric name to the `SummaryMetric` enum. + * 2. Call `telemetry.recordSummary()` with the new metric name, value, and optional tags. */ interface FrontendMetric { metric_name: string; @@ -18,50 +23,121 @@ metric_type: 'counter' | 'summary'; } -type CountMetricName = - // Counts data request failures - | 'fe_data_fetch_failure' - // Counts the specific triage actions users perform (e.g., filing a bug, ignoring an anomaly). - | 'fe_triage_action_taken'; +export enum CountMetric { + // go/keep-sorted start + DataFetchFailure = 'fe_data_fetch_failure', + TriageActionTaken = 'fe_triage_action_taken', + // go/keep-sorted end +} -type SummaryMetricName = - // Measures the time it takes for google.graph to plot, when data is already fetched. - | 'fe_google_graph_plot_time_s' - // Measures the time taken to fetch and process data when plotting new graphs. - | 'fe_multi_graph_data_load_time_s' - // Measures the time it takes to render each individual graph - | 'fe_single_graph_load_time_s'; +export enum SummaryMetric { + // go/keep-sorted start + GoogleGraphPlotTime = 'fe_google_graph_plot_time_s', + MultiGraphDataLoadTime = 'fe_multi_graph_data_load_time_s', + ReportAnomaliesTableLoadTime = 'fe_report_anomalies_table_load_time_s', + ReportChartContainerLoadTime = 'fe_report_chart_container_load_time_s', + ReportGraphChunkLoadTime = 'fe_report_graph_chunk_load_time_s', + SingleGraphLoadTime = 'fe_single_graph_load_time_s', + // go/keep-sorted end +} -export async function increaseCounter(metricName: CountMetricName, tags = {}) { - sendMetrics([ - { - metric_name: metricName as string, +class Telemetry { + private static readonly BUFFER_FLUSH_INTERVAL_MS = 5000; // 5 seconds + + private static readonly MAX_BUFFER_SIZE = 1000; // Max 1000 metrics in buffer + + private metricsBuffer: FrontendMetric[] = []; + + private timerId: number | null = null; + + constructor() { + // When the page visibility changes, flush the buffer. This helps ensure we + // capture metrics before the user navigates away or closes the tab. + document.addEventListener('visibilitychange', () => { + if (document.visibilityState === 'hidden') { + if (this.timerId) { + clearTimeout(this.timerId); + this.timerId = null; + } + this.sendBufferedMetrics(); + } + }); + } + + // Flushes the metrics buffer by sending the data to the telemetry endpoint. + private async sendBufferedMetrics() { + if (this.metricsBuffer.length === 0) { + return; + } + + const metricsToSend = [...this.metricsBuffer]; + this.metricsBuffer.length = 0; + + try { + await fetch('/_/fe_telemetry', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(metricsToSend), + }); + } catch (e) { + console.error(e, 'Failed to send frontend metrics:', metricsToSend); + this.queueMetrics(metricsToSend); + } + } + + private queueMetric(metric: FrontendMetric) { + this.queueMetrics([metric]); + } + + private queueMetrics(metrics: FrontendMetric[]) { + for (const m of metrics) { + if (this.metricsBuffer.length >= Telemetry.MAX_BUFFER_SIZE) { + console.warn('Frontend metrics buffer full, removing oldest metric to make space.'); + this.metricsBuffer.shift(); // Remove the oldest metric (FIFO) + } + this.metricsBuffer.push(m); + } + + if (!this.timerId) { + this.timerId = window.setTimeout(() => { + this.sendBufferedMetrics(); + this.timerId = null; + }, Telemetry.BUFFER_FLUSH_INTERVAL_MS); + } + } + + increaseCounter(metricName: CountMetric, tags = {}) { + this.queueMetric({ + metric_name: metricName, metric_value: 1, tags: tags, metric_type: 'counter', - }, - ]); -} + }); + } -export async function recordSummary(metricName: SummaryMetricName, val: number, tags = {}) { - sendMetrics([ - { - metric_name: metricName as string, + recordSummary(metricName: SummaryMetric, val: number, tags = {}) { + this.queueMetric({ + metric_name: metricName, metric_value: val, tags: tags, metric_type: 'summary', + }); + } + + // The following are exposed for testing purposes. + _forTesting = { + reset: () => { + this.metricsBuffer.length = 0; + if (this.timerId) { + clearTimeout(this.timerId); + this.timerId = null; + } }, - ]); + getBuffer: () => this.metricsBuffer, + MAX_BUFFER_SIZE: Telemetry.MAX_BUFFER_SIZE, + }; } -async function sendMetrics(metrics: FrontendMetric[]) { - fetch('/_/fe_telemetry', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify(metrics), - }).catch((e) => { - console.error(e, 'Failed to send frontend metrics:', metrics); - }); -} +export const telemetry = new Telemetry();
diff --git a/perf/modules/telemetry/telemetry_test.ts b/perf/modules/telemetry/telemetry_test.ts new file mode 100644 index 0000000..fad5a2e --- /dev/null +++ b/perf/modules/telemetry/telemetry_test.ts
@@ -0,0 +1,163 @@ +import { expect } from 'chai'; +import * as sinon from 'sinon'; +import { CountMetric, SummaryMetric, telemetry } from './telemetry'; + +describe('telemetry', () => { + const BUFFER_FLUSH_INTERVAL_MS = 5000; // 5 seconds + + let fetchStub: sinon.SinonStub; + let setTimeoutStub: sinon.SinonStub; + let clearTimeoutStub: sinon.SinonStub; + + beforeEach(() => { + // Mock fetch to prevent actual network requests + fetchStub = sinon.stub(window, 'fetch').resolves(new Response()); + // Mock setTimeout and clearTimeout to control time in tests + setTimeoutStub = sinon.stub(window, 'setTimeout').returns(123 as any); + clearTimeoutStub = sinon.stub(window, 'clearTimeout'); + + // Reset the internal buffer and timer before each test + telemetry._forTesting.reset(); + }); + + afterEach(() => { + sinon.restore(); + telemetry._forTesting.reset(); + }); + + it('should buffer increaseCounter metrics and send them after the interval', async () => { + telemetry.increaseCounter(CountMetric.DataFetchFailure, { test: 'tag1' }); + telemetry.increaseCounter(CountMetric.TriageActionTaken, { test: 'tag2' }); + + // Expect fetch not to have been called immediately + expect(fetchStub.callCount).to.equal(0); + // Expect setTimeout to have been called to schedule the flush + expect(setTimeoutStub.callCount).to.equal(1); + expect(setTimeoutStub.getCall(0).args[1]).to.equal(BUFFER_FLUSH_INTERVAL_MS); + + // Manually trigger the setTimeout callback + setTimeoutStub.getCall(0).args[0](); + + // Expect fetch to have been called with the buffered metrics + expect(fetchStub.callCount).to.equal(1); + const expectedBody = JSON.stringify([ + { + metric_name: CountMetric.DataFetchFailure, + metric_value: 1, + tags: { test: 'tag1' }, + metric_type: 'counter', + }, + { + metric_name: CountMetric.TriageActionTaken, + metric_value: 1, + tags: { test: 'tag2' }, + metric_type: 'counter', + }, + ]); + expect(fetchStub.getCall(0).args[0]).to.equal('/_/fe_telemetry'); + expect(fetchStub.getCall(0).args[1]).to.deep.equal({ + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: expectedBody, + }); + }); + + it('should buffer recordSummary metrics and send them after the interval', async () => { + telemetry.recordSummary(SummaryMetric.GoogleGraphPlotTime, 100, { test: 'tag3' }); + telemetry.recordSummary(SummaryMetric.MultiGraphDataLoadTime, 200, { test: 'tag4' }); + + expect(fetchStub.callCount).to.equal(0); + expect(setTimeoutStub.callCount).to.equal(1); + expect(setTimeoutStub.getCall(0).args[1]).to.equal(BUFFER_FLUSH_INTERVAL_MS); + + setTimeoutStub.getCall(0).args[0](); + + expect(fetchStub.callCount).to.equal(1); + const expectedBody = JSON.stringify([ + { + metric_name: SummaryMetric.GoogleGraphPlotTime, + metric_value: 100, + tags: { test: 'tag3' }, + metric_type: 'summary', + }, + { + metric_name: SummaryMetric.MultiGraphDataLoadTime, + metric_value: 200, + tags: { test: 'tag4' }, + metric_type: 'summary', + }, + ]); + expect(fetchStub.getCall(0).args[0]).to.equal('/_/fe_telemetry'); + expect(fetchStub.getCall(0).args[1]).to.deep.equal({ + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: expectedBody, + }); + }); + + it('should send buffered metrics when document visibility changes to hidden', async () => { + telemetry.increaseCounter(CountMetric.DataFetchFailure); + expect(fetchStub.callCount).to.equal(0); + expect(setTimeoutStub.callCount).to.equal(1); // Timer started + + // Simulate visibility change to hidden + Object.defineProperty(document, 'visibilityState', { + value: 'hidden', + writable: true, + }); + document.dispatchEvent(new Event('visibilitychange')); + + // Expect clearTimeout to be called + expect(clearTimeoutStub.callCount).to.equal(1); + expect(clearTimeoutStub.getCall(0).args[0]).to.equal(123); // 123 is the mocked timerId + + // Expect fetch to have been called immediately + expect(fetchStub.callCount).to.equal(1); + const expectedBody = JSON.stringify([ + { + metric_name: CountMetric.DataFetchFailure, + metric_value: 1, + tags: {}, + metric_type: 'counter', + }, + ]); + expect(fetchStub.getCall(0).args[0]).to.equal('/_/fe_telemetry'); + expect(fetchStub.getCall(0).args[1]).to.deep.equal({ + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: expectedBody, + }); + }); + + it('should implement FIFO behavior when buffer is full', async () => { + const MAX_BUFFER_SIZE = telemetry._forTesting.MAX_BUFFER_SIZE; + // Fill the buffer to its maximum capacity + for (let i = 0; i < MAX_BUFFER_SIZE; i++) { + telemetry.increaseCounter(CountMetric.DataFetchFailure, { id: `metric_${i}` }); + } + expect(telemetry._forTesting.getBuffer().length).to.equal(MAX_BUFFER_SIZE); + + // Add one more metric, which should trigger FIFO behavior + telemetry.increaseCounter(CountMetric.TriageActionTaken, { id: 'new_metric' }); + + // Expect the buffer size to remain the same + expect(telemetry._forTesting.getBuffer().length).to.equal(MAX_BUFFER_SIZE); + + // Expect the oldest metric (id: metric_0) to be removed + const buffer = telemetry._forTesting.getBuffer(); + + // Expect the second oldest metric (id: metric_1) to now be the oldest + expect(buffer[0].tags.id).to.equal('metric_1'); + }); + + it('should not send metrics if buffer is empty', async () => { + telemetry.increaseCounter(CountMetric.DataFetchFailure); + // Clear the buffer, but the timer is still scheduled. + telemetry._forTesting.getBuffer().length = 0; + // Trigger the timer. + setTimeoutStub.getCall(0).args[0](); + + // Expect fetch not to have been called because the buffer was empty. + expect(fetchStub.callCount).to.equal(0); + }); +});
diff --git a/perf/modules/triage-menu-sk/triage-menu-sk.ts b/perf/modules/triage-menu-sk/triage-menu-sk.ts index 4c76213..5e7d531 100644 --- a/perf/modules/triage-menu-sk/triage-menu-sk.ts +++ b/perf/modules/triage-menu-sk/triage-menu-sk.ts
@@ -18,7 +18,7 @@ import { ExistingBugDialogSk } from '../existing-bug-dialog-sk/existing-bug-dialog-sk'; import { Anomaly } from '../json'; import { AnomalyData } from '../common/anomaly-data'; -import { increaseCounter } from '../telemetry/telemetry'; +import { CountMetric, telemetry } from '../telemetry/telemetry'; import '../new-bug-dialog-sk/new-bug-dialog-sk'; import '../existing-bug-dialog-sk/existing-bug-dialog-sk'; @@ -115,7 +115,7 @@ } fileBug() { - increaseCounter('fe_triage_action_taken', { action: 'file_bug' }); + telemetry.increaseCounter(CountMetric.TriageActionTaken, { action: 'file_bug' }); this.newBugDialog!.fileNewBug(); } @@ -124,12 +124,12 @@ } openExistingBugDialog() { - increaseCounter('fe_triage_action_taken', { action: 'associate_bug' }); + telemetry.increaseCounter(CountMetric.TriageActionTaken, { action: 'associate_bug' }); this.existingBugDialog!.open(); } ignoreAnomaly() { - increaseCounter('fe_triage_action_taken', { action: 'ignore' }); + telemetry.increaseCounter(CountMetric.TriageActionTaken, { action: 'ignore' }); this.makeEditAnomalyRequest(this._anomalies, this._trace_names, 'IGNORE'); }