Skip to content

Commit

Permalink
Add service_name label to insight metrics (#4300)
Browse files Browse the repository at this point in the history
# What this PR does
Adds `service_name` label to insight metrics
NOTE: It is related to [this
PR](#4227) and should be merged no
sooner than two days after the next release (current release version is
1.4.4), because we need to wait for the metrics cache to be updated for
all organizations (uses the new cache structure with `services`)

## Which issue(s) this PR closes
Related to grafana/oncall-private#2610

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
    show up in the autogenerated release notes.
  • Loading branch information
Ferril committed May 22, 2024
1 parent a285955 commit f583da5
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 402 deletions.
2 changes: 2 additions & 0 deletions docs/sources/manage/insights-and-metrics/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ This metric has the following labels:
| `org_id` | ID of Grafana organization |
| `team` | Team name |
| `integration` | OnCall Integration name |
| `service_name` | Value of the alert group's `service_name` label |
| `state` | Alert groups state. May be `firing`, `acknowledged`, `resolved` and `silenced`|

**Query example:**
Expand All @@ -86,6 +87,7 @@ This metric has the following labels:
| `org_id` | ID of Grafana organization |
| `team` | Team name |
| `integration` | OnCall Integration name |
| `service_name` | Value of the alert group's `service_name` label |
| `le` | Histogram bucket value in seconds. May be `60`, `300`, `600`, `3600` and `+Inf`|

**Query example:**
Expand Down
18 changes: 3 additions & 15 deletions engine/apps/metrics_exporter/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,14 +287,7 @@ def metrics_update_alert_groups_state_cache(states_diff: dict, organization_id:
if not integration_alert_groups:
continue
for service_name, service_state_diff in service_data.items():
if "services" in integration_alert_groups:
states_to_update = integration_alert_groups["services"].setdefault(
service_name, get_default_states_dict()
)
else:
# support version of metrics cache without service name. This clause can be removed when all metrics
# cache is updated on prod (~2 days after release)
states_to_update = integration_alert_groups
states_to_update = integration_alert_groups["services"].setdefault(service_name, get_default_states_dict())
for previous_state, counter in service_state_diff["previous_states"].items():
if states_to_update[previous_state] - counter > 0:
states_to_update[previous_state] -= counter
Expand Down Expand Up @@ -329,13 +322,8 @@ def metrics_update_alert_groups_response_time_cache(integrations_response_time:
if not integration_response_time_metrics:
continue
for service_name, response_time_values in service_data.items():
if "services" in integration_response_time_metrics:
integration_response_time_metrics["services"].setdefault(service_name, [])
integration_response_time_metrics["services"][service_name].extend(response_time_values)
else:
# support version of metrics cache without service name. This clause can be removed when all metrics
# cache is updated on prod (~2 days after release)
integration_response_time_metrics["response_time"].extend(response_time_values)
integration_response_time_metrics["services"].setdefault(service_name, [])
integration_response_time_metrics["services"][service_name].extend(response_time_values)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)


Expand Down
57 changes: 16 additions & 41 deletions engine/apps/metrics_exporter/metrics_collectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from apps.metrics_exporter.constants import (
ALERT_GROUPS_RESPONSE_TIME,
ALERT_GROUPS_TOTAL,
SERVICE_LABEL,
USER_WAS_NOTIFIED_OF_ALERT_GROUPS,
AlertGroupsResponseTimeMetricsDict,
AlertGroupsTotalMetricsDict,
Expand Down Expand Up @@ -52,7 +53,7 @@ def __init__(self):
"team",
]
+ self._stack_labels
# + [SERVICE_LABEL] # todo:metrics: uncomment when all metric cache is updated (~2 after release)
+ [SERVICE_LABEL]
)
self._integration_labels_with_state = self._integration_labels + ["state"]
self._user_labels = ["username"] + self._stack_labels
Expand Down Expand Up @@ -100,24 +101,12 @@ def _get_alert_groups_total_metric(self, org_ids):
integration_data["id"], # grafana instance id
]
labels_values = list(map(str, labels_values))
# clause below is needed for compatibility with old metric cache during rollout metrics with services
if "services" in integration_data:
count_per_state = {state.value: 0 for state in AlertGroupState}
for service_name in integration_data["services"]:
for state in AlertGroupState:
count_per_state[state.value] += integration_data["services"][service_name][state.value]
# todo:metrics: with enabling service_name label move "add_metric" under
# "for service_name..." iteration
for state_name, counter in count_per_state.items():
for service_name in integration_data["services"]:
for state in AlertGroupState:
alert_groups_total.add_metric(
labels_values + [state_name],
# todo:metrics: replace [state.value] when all metric cache is updated
# + [service_name, state.value],
counter,
labels_values + [service_name, state.value],
integration_data["services"][service_name][state.value],
)
else:
for state in AlertGroupState:
alert_groups_total.add_metric(labels_values + [state.value], integration_data[state.value])
org_id_from_key = RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0]
processed_org_ids.add(int(org_id_from_key))
missing_org_ids = org_ids - processed_org_ids
Expand Down Expand Up @@ -146,30 +135,16 @@ def _get_response_time_metric(self, org_ids):
]
labels_values = list(map(str, labels_values))

# clause below is needed for compatibility with old metric cache during rollout metrics with services
if "services" in integration_data:
response_time_values = []
# todo:metrics: for service_name, response_time
for _, response_time in integration_data["services"].items():
if not response_time:
continue
response_time_values.extend(response_time)
else:
response_time_values = integration_data["response_time"]

if not response_time_values:
# ignore empty response_time_values
continue

# todo:metrics: with enabling service_name label move "add_metric" under
# "for service_name, response_time..." iteration
buckets, sum_value = self.get_buckets_with_sum(response_time_values)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(
labels_values, # + [service_name] todo:metrics: uncomment when all metric cache is updated
buckets=buckets,
sum_value=sum_value,
)
for service_name, response_time in integration_data["services"].items():
if not response_time:
continue
buckets, sum_value = self.get_buckets_with_sum(response_time)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(
labels_values + [service_name],
buckets=buckets,
sum_value=sum_value,
)
org_id_from_key = RE_ALERT_GROUPS_RESPONSE_TIME.match(org_key).groups()[0]
processed_org_ids.add(int(org_id_from_key))
missing_org_ids = org_ids - processed_org_ids
Expand Down
141 changes: 8 additions & 133 deletions engine/apps/metrics_exporter/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
METRICS_TEST_INSTANCE_SLUG = "test_instance"
METRICS_TEST_INSTANCE_ID = 292 # random number
METRICS_TEST_USER_USERNAME = "Alex"
METRICS_TEST_SERVICE_NAME = "test_service"


@pytest.fixture()
Expand Down Expand Up @@ -45,6 +46,12 @@ def _mock_cache_get(key, *args, **kwargs):
"acknowledged": 3,
"resolved": 5,
},
METRICS_TEST_SERVICE_NAME: {
"firing": 12,
"silenced": 14,
"acknowledged": 13,
"resolved": 15,
},
},
},
2: {
Expand Down Expand Up @@ -72,9 +79,7 @@ def _mock_cache_get(key, *args, **kwargs):
"org_id": 1,
"slug": "Test stack",
"id": 1,
"services": {
NO_SERVICE_VALUE: [2, 10, 200, 650],
},
"services": {NO_SERVICE_VALUE: [2, 10, 200, 650], METRICS_TEST_SERVICE_NAME: [4, 12, 20]},
},
2: {
"integration_name": "Empty integration",
Expand Down Expand Up @@ -108,92 +113,6 @@ def _mock_cache_get_many(keys, *args, **kwargs):
monkeypatch.setattr(cache, "get_many", _mock_cache_get_many)


# todo:metrics: remove later when all cache is updated
@pytest.fixture()  # used for test backwards compatibility with old version of metrics
def mock_cache_get_metrics_for_collector_mixed_versions(monkeypatch):
    """Patch ``cache.get``/``cache.get_many`` with fixture data mixing cache formats.

    Entry ``1`` of each metric uses the old flat structure (no ``services`` key),
    entry ``2`` uses the new per-service structure, so collector tests can cover
    both shapes during the rollout window.
    """
    # Static fixture payloads, keyed by bare metric name.
    fixture_data = {
        ALERT_GROUPS_TOTAL: {
            1: {
                "integration_name": "Test metrics integration",
                "team_name": "Test team",
                "team_id": 1,
                "org_id": 1,
                "slug": "Test stack",
                "id": 1,
                "firing": 2,
                "acknowledged": 3,
                "silenced": 4,
                "resolved": 5,
            },
            2: {
                "integration_name": "Test metrics integration 2",
                "team_name": "Test team",
                "team_id": 1,
                "org_id": 1,
                "slug": "Test stack",
                "id": 1,
                "services": {
                    NO_SERVICE_VALUE: {
                        "firing": 2,
                        "silenced": 4,
                        "acknowledged": 3,
                        "resolved": 5,
                    },
                    "test_service": {
                        "firing": 10,
                        "silenced": 10,
                        "acknowledged": 10,
                        "resolved": 10,
                    },
                },
            },
        },
        ALERT_GROUPS_RESPONSE_TIME: {
            1: {
                "integration_name": "Test metrics integration",
                "team_name": "Test team",
                "team_id": 1,
                "org_id": 1,
                "slug": "Test stack",
                "id": 1,
                "response_time": [2, 10, 200, 650],
            },
            2: {
                "integration_name": "Test metrics integration 2",
                "team_name": "Test team",
                "team_id": 1,
                "org_id": 1,
                "slug": "Test stack",
                "id": 1,
                "services": {NO_SERVICE_VALUE: [2, 10, 200, 650], "test_service": [4, 8, 12]},
            },
        },
        USER_WAS_NOTIFIED_OF_ALERT_GROUPS: {
            1: {
                "org_id": 1,
                "slug": "Test stack",
                "id": 1,
                "user_username": "Alex",
                "counter": 4,
            }
        },
    }

    def fake_cache_get(key, *args, **kwargs):
        # Real cache keys embed extra parts (e.g. the org id) around the metric
        # name; collapse them to the bare metric name before lookup. Check order
        # mirrors the original if/elif chain.
        for metric_name in (ALERT_GROUPS_TOTAL, ALERT_GROUPS_RESPONSE_TIME, USER_WAS_NOTIFIED_OF_ALERT_GROUPS):
            if metric_name in key:
                return fixture_data.get(metric_name)
        return fixture_data.get(key)

    def fake_cache_get_many(keys, *args, **kwargs):
        # Keep only keys that resolve to a truthy payload, like the real get_many.
        found = {}
        for key in keys:
            value = fake_cache_get(key)
            if value:
                found[key] = value
        return found

    monkeypatch.setattr(cache, "get", fake_cache_get)
    monkeypatch.setattr(cache, "get_many", fake_cache_get_many)


@pytest.fixture()
def mock_get_metrics_cache(monkeypatch):
def _mock_cache_get(key, *args, **kwargs):
Expand Down Expand Up @@ -255,50 +174,6 @@ def cache_get(key, *args, **kwargs):
return _make_cache_params


# todo:metrics: remove later when all cache is updated
@pytest.fixture
def make_metrics_cache_params_old_version(monkeypatch):
    """Factory fixture producing a ``cache.get`` stand-in with old-format entries.

    The returned callable builds a ``cache_get(key)`` function that serves empty
    old-structure metric entries (flat state counters / ``response_time`` list,
    no ``services`` key) for a single integration of the given organization.
    """

    def _make_cache_params(integration_id, organization_id, team_name=None, team_id=None):
        resolved_team_name = team_name or "No team"
        resolved_team_id = team_id or "no_team"
        total_key = get_metric_alert_groups_total_key(organization_id)
        response_time_key = get_metric_alert_groups_response_time_key(organization_id)

        def _common_fields():
            # Fields shared by both metric entries; fresh dict per call so
            # callers can mutate results independently.
            return {
                "integration_name": METRICS_TEST_INTEGRATION_NAME,
                "team_name": resolved_team_name,
                "team_id": resolved_team_id,
                "org_id": METRICS_TEST_ORG_ID,
                "slug": METRICS_TEST_INSTANCE_SLUG,
                "id": METRICS_TEST_INSTANCE_ID,
            }

        def cache_get(key, *args, **kwargs):
            # total_key is checked first: in the original dict literal it was
            # listed last, so it won the lookup if both keys ever coincided.
            if key == total_key:
                return {
                    integration_id: dict(
                        _common_fields(), firing=0, acknowledged=0, silenced=0, resolved=0
                    )
                }
            if key == response_time_key:
                return {integration_id: dict(_common_fields(), response_time=[])}
            return {}

        return cache_get

    return _make_cache_params


@pytest.fixture
def make_user_was_notified_metrics_cache_params(monkeypatch):
def _make_cache_params(user_id, organization_id):
Expand Down

0 comments on commit f583da5

Please sign in to comment.