Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add service_name label to insight metrics #4300

Merged
merged 7 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/sources/manage/insights-and-metrics/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ This metric has the following labels:
| `org_id` | ID of Grafana organization |
| `team` | Team name |
| `integration` | OnCall Integration name |
| `service_name` | Value of Alert group `service_name` label |
| `state` | Alert groups state. May be `firing`, `acknowledged`, `resolved` and `silenced`|

**Query example:**
Expand All @@ -86,6 +87,7 @@ This metric has the following labels:
| `org_id` | ID of Grafana organization |
| `team` | Team name |
| `integration` | OnCall Integration name |
| `service_name` | Value of Alert group `service_name` label |
| `le` | Histogram bucket value in seconds. May be `60`, `300`, `600`, `3600` and `+Inf`|

**Query example:**
Expand Down
18 changes: 3 additions & 15 deletions engine/apps/metrics_exporter/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,14 +287,7 @@ def metrics_update_alert_groups_state_cache(states_diff: dict, organization_id:
if not integration_alert_groups:
continue
for service_name, service_state_diff in service_data.items():
if "services" in integration_alert_groups:
states_to_update = integration_alert_groups["services"].setdefault(
service_name, get_default_states_dict()
)
else:
# support version of metrics cache without service name. This clause can be removed when all metrics
# cache is updated on prod (~2 days after release)
states_to_update = integration_alert_groups
states_to_update = integration_alert_groups["services"].setdefault(service_name, get_default_states_dict())
for previous_state, counter in service_state_diff["previous_states"].items():
if states_to_update[previous_state] - counter > 0:
states_to_update[previous_state] -= counter
Expand Down Expand Up @@ -329,13 +322,8 @@ def metrics_update_alert_groups_response_time_cache(integrations_response_time:
if not integration_response_time_metrics:
continue
for service_name, response_time_values in service_data.items():
if "services" in integration_response_time_metrics:
integration_response_time_metrics["services"].setdefault(service_name, [])
integration_response_time_metrics["services"][service_name].extend(response_time_values)
else:
# support version of metrics cache without service name. This clause can be removed when all metrics
# cache is updated on prod (~2 days after release)
integration_response_time_metrics["response_time"].extend(response_time_values)
integration_response_time_metrics["services"].setdefault(service_name, [])
integration_response_time_metrics["services"][service_name].extend(response_time_values)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)


Expand Down
52 changes: 15 additions & 37 deletions engine/apps/metrics_exporter/metrics_collectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from apps.metrics_exporter.constants import (
ALERT_GROUPS_RESPONSE_TIME,
ALERT_GROUPS_TOTAL,
SERVICE_LABEL,
USER_WAS_NOTIFIED_OF_ALERT_GROUPS,
AlertGroupsResponseTimeMetricsDict,
AlertGroupsTotalMetricsDict,
Expand Down Expand Up @@ -52,7 +53,7 @@ def __init__(self):
"team",
]
+ self._stack_labels
# + [SERVICE_LABEL] # todo:metrics: uncomment when all metric cache is updated (~2 after release)
+ [SERVICE_LABEL]
)
self._integration_labels_with_state = self._integration_labels + ["state"]
self._user_labels = ["username"] + self._stack_labels
Expand Down Expand Up @@ -100,24 +101,12 @@ def _get_alert_groups_total_metric(self, org_ids):
integration_data["id"], # grafana instance id
]
labels_values = list(map(str, labels_values))
# clause below is needed for compatibility with old metric cache during rollout metrics with services
if "services" in integration_data:
count_per_state = {state.value: 0 for state in AlertGroupState}
for service_name in integration_data["services"]:
for state in AlertGroupState:
count_per_state[state.value] += integration_data["services"][service_name][state.value]
# todo:metrics: with enabling service_name label move "add_metric" under
# "for service_name..." iteration
for state_name, counter in count_per_state.items():
for service_name in integration_data["services"]:
for state in AlertGroupState:
alert_groups_total.add_metric(
labels_values + [state_name],
# todo:metrics: replace [state.value] when all metric cache is updated
# + [service_name, state.value],
counter,
labels_values + [service_name, state.value],
integration_data["services"][service_name][state.value],
)
else:
for state in AlertGroupState:
alert_groups_total.add_metric(labels_values + [state.value], integration_data[state.value])
org_id_from_key = RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0]
processed_org_ids.add(int(org_id_from_key))
missing_org_ids = org_ids - processed_org_ids
Expand Down Expand Up @@ -146,27 +135,16 @@ def _get_response_time_metric(self, org_ids):
]
labels_values = list(map(str, labels_values))

# clause below is needed for compatibility with old metric cache during rollout metrics with services
if "services" in integration_data:
response_time_values = []
# todo:metrics: for service_name, response_time
for _, response_time in integration_data["services"].items():
if not response_time:
continue
response_time_values.extend(response_time)
else:
response_time_values = integration_data["response_time"]
if not response_time_values:
for service_name, response_time in integration_data["services"].items():
if not response_time:
continue
# todo:metrics: with enabling service_name label move "add_metric" under
# "for service_name, response_time..." iteration
buckets, sum_value = self.get_buckets_with_sum(response_time_values)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(
labels_values, # + [service_name] todo:metrics: uncomment when all metric cache is updated
buckets=buckets,
sum_value=sum_value,
)
buckets, sum_value = self.get_buckets_with_sum(response_time)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(
labels_values + [service_name],
buckets=buckets,
sum_value=sum_value,
)
org_id_from_key = RE_ALERT_GROUPS_RESPONSE_TIME.match(org_key).groups()[0]
processed_org_ids.add(int(org_id_from_key))
missing_org_ids = org_ids - processed_org_ids
Expand Down
141 changes: 8 additions & 133 deletions engine/apps/metrics_exporter/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
METRICS_TEST_INSTANCE_SLUG = "test_instance"
METRICS_TEST_INSTANCE_ID = 292 # random number
METRICS_TEST_USER_USERNAME = "Alex"
METRICS_TEST_SERVICE_NAME = "test_service"


@pytest.fixture()
Expand Down Expand Up @@ -45,84 +46,11 @@ def _mock_cache_get(key, *args, **kwargs):
"acknowledged": 3,
"resolved": 5,
},
},
},
},
ALERT_GROUPS_RESPONSE_TIME: {
1: {
"integration_name": "Test metrics integration",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"services": {
NO_SERVICE_VALUE: [2, 10, 200, 650],
},
}
},
USER_WAS_NOTIFIED_OF_ALERT_GROUPS: {
1: {
"org_id": 1,
"slug": "Test stack",
"id": 1,
"user_username": "Alex",
"counter": 4,
}
},
}
return test_metrics.get(key)

def _mock_cache_get_many(keys, *args, **kwargs):
return {key: _mock_cache_get(key) for key in keys if _mock_cache_get(key)}

monkeypatch.setattr(cache, "get", _mock_cache_get)
monkeypatch.setattr(cache, "get_many", _mock_cache_get_many)


# todo:metrics: remove later when all cache is updated
@pytest.fixture() # used for test backwards compatibility with old version of metrics
def mock_cache_get_metrics_for_collector_mixed_versions(monkeypatch):
def _mock_cache_get(key, *args, **kwargs):
if ALERT_GROUPS_TOTAL in key:
key = ALERT_GROUPS_TOTAL
elif ALERT_GROUPS_RESPONSE_TIME in key:
key = ALERT_GROUPS_RESPONSE_TIME
elif USER_WAS_NOTIFIED_OF_ALERT_GROUPS in key:
key = USER_WAS_NOTIFIED_OF_ALERT_GROUPS
test_metrics = {
ALERT_GROUPS_TOTAL: {
1: {
"integration_name": "Test metrics integration",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"firing": 2,
"acknowledged": 3,
"silenced": 4,
"resolved": 5,
},
2: {
"integration_name": "Test metrics integration 2",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"services": {
NO_SERVICE_VALUE: {
"firing": 2,
"silenced": 4,
"acknowledged": 3,
"resolved": 5,
},
"test_service": {
"firing": 10,
"silenced": 10,
"acknowledged": 10,
"resolved": 10,
METRICS_TEST_SERVICE_NAME: {
"firing": 12,
"silenced": 14,
"acknowledged": 13,
"resolved": 15,
},
},
},
Expand All @@ -135,17 +63,8 @@ def _mock_cache_get(key, *args, **kwargs):
"org_id": 1,
"slug": "Test stack",
"id": 1,
"response_time": [2, 10, 200, 650],
},
2: {
"integration_name": "Test metrics integration 2",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"services": {NO_SERVICE_VALUE: [2, 10, 200, 650], "test_service": [4, 8, 12]},
},
"services": {NO_SERVICE_VALUE: [2, 10, 200, 650], METRICS_TEST_SERVICE_NAME: [4, 12, 20]},
}
},
USER_WAS_NOTIFIED_OF_ALERT_GROUPS: {
1: {
Expand Down Expand Up @@ -227,50 +146,6 @@ def cache_get(key, *args, **kwargs):
return _make_cache_params


# todo:metrics: remove later when all cache is updated
@pytest.fixture
def make_metrics_cache_params_old_version(monkeypatch):
def _make_cache_params(integration_id, organization_id, team_name=None, team_id=None):
team_name = team_name or "No team"
team_id = team_id or "no_team"
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id)

def cache_get(key, *args, **kwargs):
metrics_data = {
metric_alert_groups_response_time_key: {
integration_id: {
"integration_name": METRICS_TEST_INTEGRATION_NAME,
"team_name": team_name,
"team_id": team_id,
"org_id": METRICS_TEST_ORG_ID,
"slug": METRICS_TEST_INSTANCE_SLUG,
"id": METRICS_TEST_INSTANCE_ID,
"response_time": [],
}
},
metric_alert_groups_total_key: {
integration_id: {
"integration_name": METRICS_TEST_INTEGRATION_NAME,
"team_name": team_name,
"team_id": team_id,
"org_id": METRICS_TEST_ORG_ID,
"slug": METRICS_TEST_INSTANCE_SLUG,
"id": METRICS_TEST_INSTANCE_ID,
"firing": 0,
"acknowledged": 0,
"silenced": 0,
"resolved": 0,
}
},
}
return metrics_data.get(key, {})

return cache_get

return _make_cache_params


@pytest.fixture
def make_user_was_notified_metrics_cache_params(monkeypatch):
def _make_cache_params(user_id, organization_id):
Expand Down