Skip to content

Commit

Permalink
Add service_name label to insight metrics (#4300)
Browse files Browse the repository at this point in the history
# What this PR does
Adds `service_name` label to insight metrics
NOTE: It is related to [this
PR](#4227) and should be merged no
sooner than two days after the next release (current release version is
1.4.4), because we need to wait for the metrics cache to be updated for
all organizations (uses the new cache structure with `services`)

## Which issue(s) this PR closes
Related to grafana/oncall-private#2610

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
    show up in the autogenerated release notes.
  • Loading branch information
Ferril committed May 22, 2024
1 parent a285955 commit f583da5
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 402 deletions.
2 changes: 2 additions & 0 deletions docs/sources/manage/insights-and-metrics/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ This metric has the following labels:
| `org_id` | ID of Grafana organization |
| `team` | Team name |
| `integration` | OnCall Integration name |
| `service_name` | Value of the alert group's `service_name` label |
| `state` | Alert groups state. May be `firing`, `acknowledged`, `resolved` and `silenced`|

**Query example:**
Expand All @@ -86,6 +87,7 @@ This metric has the following labels:
| `org_id` | ID of Grafana organization |
| `team` | Team name |
| `integration` | OnCall Integration name |
| `service_name` | Value of the alert group's `service_name` label |
| `le` | Histogram bucket value in seconds. May be `60`, `300`, `600`, `3600` and `+Inf`|

**Query example:**
Expand Down
18 changes: 3 additions & 15 deletions engine/apps/metrics_exporter/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,14 +287,7 @@ def metrics_update_alert_groups_state_cache(states_diff: dict, organization_id:
if not integration_alert_groups:
continue
for service_name, service_state_diff in service_data.items():
if "services" in integration_alert_groups:
states_to_update = integration_alert_groups["services"].setdefault(
service_name, get_default_states_dict()
)
else:
# support version of metrics cache without service name. This clause can be removed when all metrics
# cache is updated on prod (~2 days after release)
states_to_update = integration_alert_groups
states_to_update = integration_alert_groups["services"].setdefault(service_name, get_default_states_dict())
for previous_state, counter in service_state_diff["previous_states"].items():
if states_to_update[previous_state] - counter > 0:
states_to_update[previous_state] -= counter
Expand Down Expand Up @@ -329,13 +322,8 @@ def metrics_update_alert_groups_response_time_cache(integrations_response_time:
if not integration_response_time_metrics:
continue
for service_name, response_time_values in service_data.items():
if "services" in integration_response_time_metrics:
integration_response_time_metrics["services"].setdefault(service_name, [])
integration_response_time_metrics["services"][service_name].extend(response_time_values)
else:
# support version of metrics cache without service name. This clause can be removed when all metrics
# cache is updated on prod (~2 days after release)
integration_response_time_metrics["response_time"].extend(response_time_values)
integration_response_time_metrics["services"].setdefault(service_name, [])
integration_response_time_metrics["services"][service_name].extend(response_time_values)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)


Expand Down
57 changes: 16 additions & 41 deletions engine/apps/metrics_exporter/metrics_collectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from apps.metrics_exporter.constants import (
ALERT_GROUPS_RESPONSE_TIME,
ALERT_GROUPS_TOTAL,
SERVICE_LABEL,
USER_WAS_NOTIFIED_OF_ALERT_GROUPS,
AlertGroupsResponseTimeMetricsDict,
AlertGroupsTotalMetricsDict,
Expand Down Expand Up @@ -52,7 +53,7 @@ def __init__(self):
"team",
]
+ self._stack_labels
# + [SERVICE_LABEL] # todo:metrics: uncomment when all metric cache is updated (~2 after release)
+ [SERVICE_LABEL]
)
self._integration_labels_with_state = self._integration_labels + ["state"]
self._user_labels = ["username"] + self._stack_labels
Expand Down Expand Up @@ -100,24 +101,12 @@ def _get_alert_groups_total_metric(self, org_ids):
integration_data["id"], # grafana instance id
]
labels_values = list(map(str, labels_values))
# clause below is needed for compatibility with old metric cache during rollout metrics with services
if "services" in integration_data:
count_per_state = {state.value: 0 for state in AlertGroupState}
for service_name in integration_data["services"]:
for state in AlertGroupState:
count_per_state[state.value] += integration_data["services"][service_name][state.value]
# todo:metrics: with enabling service_name label move "add_metric" under
# "for service_name..." iteration
for state_name, counter in count_per_state.items():
for service_name in integration_data["services"]:
for state in AlertGroupState:
alert_groups_total.add_metric(
labels_values + [state_name],
# todo:metrics: replace [state.value] when all metric cache is updated
# + [service_name, state.value],
counter,
labels_values + [service_name, state.value],
integration_data["services"][service_name][state.value],
)
else:
for state in AlertGroupState:
alert_groups_total.add_metric(labels_values + [state.value], integration_data[state.value])
org_id_from_key = RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0]
processed_org_ids.add(int(org_id_from_key))
missing_org_ids = org_ids - processed_org_ids
Expand Down Expand Up @@ -146,30 +135,16 @@ def _get_response_time_metric(self, org_ids):
]
labels_values = list(map(str, labels_values))

# clause below is needed for compatibility with old metric cache during rollout metrics with services
if "services" in integration_data:
response_time_values = []
# todo:metrics: for service_name, response_time
for _, response_time in integration_data["services"].items():
if not response_time:
continue
response_time_values.extend(response_time)
else:
response_time_values = integration_data["response_time"]

if not response_time_values:
# ignore empty response_time_values
continue

# todo:metrics: with enabling service_name label move "add_metric" under
# "for service_name, response_time..." iteration
buckets, sum_value = self.get_buckets_with_sum(response_time_values)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(
labels_values, # + [service_name] todo:metrics: uncomment when all metric cache is updated
buckets=buckets,
sum_value=sum_value,
)
for service_name, response_time in integration_data["services"].items():
if not response_time:
continue
buckets, sum_value = self.get_buckets_with_sum(response_time)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(
labels_values + [service_name],
buckets=buckets,
sum_value=sum_value,
)
org_id_from_key = RE_ALERT_GROUPS_RESPONSE_TIME.match(org_key).groups()[0]
processed_org_ids.add(int(org_id_from_key))
missing_org_ids = org_ids - processed_org_ids
Expand Down
141 changes: 8 additions & 133 deletions engine/apps/metrics_exporter/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
METRICS_TEST_INSTANCE_SLUG = "test_instance"
METRICS_TEST_INSTANCE_ID = 292 # random number
METRICS_TEST_USER_USERNAME = "Alex"
METRICS_TEST_SERVICE_NAME = "test_service"


@pytest.fixture()
Expand Down Expand Up @@ -45,6 +46,12 @@ def _mock_cache_get(key, *args, **kwargs):
"acknowledged": 3,
"resolved": 5,
},
METRICS_TEST_SERVICE_NAME: {
"firing": 12,
"silenced": 14,
"acknowledged": 13,
"resolved": 15,
},
},
},
2: {
Expand Down Expand Up @@ -72,9 +79,7 @@ def _mock_cache_get(key, *args, **kwargs):
"org_id": 1,
"slug": "Test stack",
"id": 1,
"services": {
NO_SERVICE_VALUE: [2, 10, 200, 650],
},
"services": {NO_SERVICE_VALUE: [2, 10, 200, 650], METRICS_TEST_SERVICE_NAME: [4, 12, 20]},
},
2: {
"integration_name": "Empty integration",
Expand Down Expand Up @@ -108,92 +113,6 @@ def _mock_cache_get_many(keys, *args, **kwargs):
monkeypatch.setattr(cache, "get_many", _mock_cache_get_many)


# todo:metrics: remove later when all cache is updated
@pytest.fixture()  # used for test backwards compatibility with old version of metrics
def mock_cache_get_metrics_for_collector_mixed_versions(monkeypatch):
    """Patch ``cache.get``/``cache.get_many`` with fixture data mixing cache formats.

    Entry ``1`` of each metric uses the old flat structure (no ``services`` key),
    entry ``2`` uses the new per-service structure, so collector tests can cover
    both shapes during the rollout window.
    """
    # Static fixture payloads, keyed by bare metric name.
    fixture_data = {
        ALERT_GROUPS_TOTAL: {
            1: {
                "integration_name": "Test metrics integration",
                "team_name": "Test team",
                "team_id": 1,
                "org_id": 1,
                "slug": "Test stack",
                "id": 1,
                "firing": 2,
                "acknowledged": 3,
                "silenced": 4,
                "resolved": 5,
            },
            2: {
                "integration_name": "Test metrics integration 2",
                "team_name": "Test team",
                "team_id": 1,
                "org_id": 1,
                "slug": "Test stack",
                "id": 1,
                "services": {
                    NO_SERVICE_VALUE: {
                        "firing": 2,
                        "silenced": 4,
                        "acknowledged": 3,
                        "resolved": 5,
                    },
                    "test_service": {
                        "firing": 10,
                        "silenced": 10,
                        "acknowledged": 10,
                        "resolved": 10,
                    },
                },
            },
        },
        ALERT_GROUPS_RESPONSE_TIME: {
            1: {
                "integration_name": "Test metrics integration",
                "team_name": "Test team",
                "team_id": 1,
                "org_id": 1,
                "slug": "Test stack",
                "id": 1,
                "response_time": [2, 10, 200, 650],
            },
            2: {
                "integration_name": "Test metrics integration 2",
                "team_name": "Test team",
                "team_id": 1,
                "org_id": 1,
                "slug": "Test stack",
                "id": 1,
                "services": {NO_SERVICE_VALUE: [2, 10, 200, 650], "test_service": [4, 8, 12]},
            },
        },
        USER_WAS_NOTIFIED_OF_ALERT_GROUPS: {
            1: {
                "org_id": 1,
                "slug": "Test stack",
                "id": 1,
                "user_username": "Alex",
                "counter": 4,
            }
        },
    }

    def fake_cache_get(key, *args, **kwargs):
        # Real cache keys embed extra parts (e.g. the org id) around the metric
        # name; collapse them to the bare metric name before lookup. Check order
        # mirrors the original if/elif chain.
        for metric_name in (ALERT_GROUPS_TOTAL, ALERT_GROUPS_RESPONSE_TIME, USER_WAS_NOTIFIED_OF_ALERT_GROUPS):
            if metric_name in key:
                return fixture_data.get(metric_name)
        return fixture_data.get(key)

    def fake_cache_get_many(keys, *args, **kwargs):
        # Keep only keys that resolve to a truthy payload, like the real get_many.
        found = {}
        for key in keys:
            value = fake_cache_get(key)
            if value:
                found[key] = value
        return found

    monkeypatch.setattr(cache, "get", fake_cache_get)
    monkeypatch.setattr(cache, "get_many", fake_cache_get_many)


@pytest.fixture()
def mock_get_metrics_cache(monkeypatch):
def _mock_cache_get(key, *args, **kwargs):
Expand Down Expand Up @@ -255,50 +174,6 @@ def cache_get(key, *args, **kwargs):
return _make_cache_params


# todo:metrics: remove later when all cache is updated
@pytest.fixture
def make_metrics_cache_params_old_version(monkeypatch):
    """Factory fixture producing a ``cache.get`` stand-in with old-format entries.

    The returned callable builds a ``cache_get(key)`` function that serves empty
    old-structure metric entries (flat state counters / ``response_time`` list,
    no ``services`` key) for a single integration of the given organization.
    """

    def _make_cache_params(integration_id, organization_id, team_name=None, team_id=None):
        resolved_team_name = team_name or "No team"
        resolved_team_id = team_id or "no_team"
        total_key = get_metric_alert_groups_total_key(organization_id)
        response_time_key = get_metric_alert_groups_response_time_key(organization_id)

        def _common_fields():
            # Fields shared by both metric entries; fresh dict per call so
            # callers can mutate results independently.
            return {
                "integration_name": METRICS_TEST_INTEGRATION_NAME,
                "team_name": resolved_team_name,
                "team_id": resolved_team_id,
                "org_id": METRICS_TEST_ORG_ID,
                "slug": METRICS_TEST_INSTANCE_SLUG,
                "id": METRICS_TEST_INSTANCE_ID,
            }

        def cache_get(key, *args, **kwargs):
            # total_key is checked first: in the original dict literal it was
            # listed last, so it won the lookup if both keys ever coincided.
            if key == total_key:
                return {
                    integration_id: dict(
                        _common_fields(), firing=0, acknowledged=0, silenced=0, resolved=0
                    )
                }
            if key == response_time_key:
                return {integration_id: dict(_common_fields(), response_time=[])}
            return {}

        return cache_get

    return _make_cache_params


@pytest.fixture
def make_user_was_notified_metrics_cache_params(monkeypatch):
def _make_cache_params(user_id, organization_id):
Expand Down

0 comments on commit f583da5

Please sign in to comment.