Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add service_name label to insight metrics #4300

Merged
merged 7 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/sources/manage/insights-and-metrics/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ This metric has the following labels:
| `org_id` | ID of Grafana organization |
| `team` | Team name |
| `integration` | OnCall Integration name |
| `service_name` | Value of Alert group `service_name` label |
| `state` | Alert groups state. May be `firing`, `acknowledged`, `resolved` and `silenced`|

**Query example:**
Expand All @@ -86,6 +87,7 @@ This metric has the following labels:
| `org_id` | ID of Grafana organization |
| `team` | Team name |
| `integration` | OnCall Integration name |
| `service_name` | Value of Alert group `service_name` label |
| `le` | Histogram bucket value in seconds. May be `60`, `300`, `600`, `3600` and `+Inf`|

**Query example:**
Expand Down
18 changes: 3 additions & 15 deletions engine/apps/metrics_exporter/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,14 +287,7 @@ def metrics_update_alert_groups_state_cache(states_diff: dict, organization_id:
if not integration_alert_groups:
continue
for service_name, service_state_diff in service_data.items():
if "services" in integration_alert_groups:
states_to_update = integration_alert_groups["services"].setdefault(
service_name, get_default_states_dict()
)
else:
# support version of metrics cache without service name. This clause can be removed when all metrics
# cache is updated on prod (~2 days after release)
states_to_update = integration_alert_groups
states_to_update = integration_alert_groups["services"].setdefault(service_name, get_default_states_dict())
for previous_state, counter in service_state_diff["previous_states"].items():
if states_to_update[previous_state] - counter > 0:
states_to_update[previous_state] -= counter
Expand Down Expand Up @@ -329,13 +322,8 @@ def metrics_update_alert_groups_response_time_cache(integrations_response_time:
if not integration_response_time_metrics:
continue
for service_name, response_time_values in service_data.items():
if "services" in integration_response_time_metrics:
integration_response_time_metrics["services"].setdefault(service_name, [])
integration_response_time_metrics["services"][service_name].extend(response_time_values)
else:
# support version of metrics cache without service name. This clause can be removed when all metrics
# cache is updated on prod (~2 days after release)
integration_response_time_metrics["response_time"].extend(response_time_values)
integration_response_time_metrics["services"].setdefault(service_name, [])
integration_response_time_metrics["services"][service_name].extend(response_time_values)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)


Expand Down
52 changes: 15 additions & 37 deletions engine/apps/metrics_exporter/metrics_collectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from apps.metrics_exporter.constants import (
ALERT_GROUPS_RESPONSE_TIME,
ALERT_GROUPS_TOTAL,
SERVICE_LABEL,
USER_WAS_NOTIFIED_OF_ALERT_GROUPS,
AlertGroupsResponseTimeMetricsDict,
AlertGroupsTotalMetricsDict,
Expand Down Expand Up @@ -52,7 +53,7 @@ def __init__(self):
"team",
]
+ self._stack_labels
# + [SERVICE_LABEL] # todo:metrics: uncomment when all metric cache is updated (~2 after release)
+ [SERVICE_LABEL]
)
self._integration_labels_with_state = self._integration_labels + ["state"]
self._user_labels = ["username"] + self._stack_labels
Expand Down Expand Up @@ -100,24 +101,12 @@ def _get_alert_groups_total_metric(self, org_ids):
integration_data["id"], # grafana instance id
]
labels_values = list(map(str, labels_values))
# clause below is needed for compatibility with old metric cache during rollout metrics with services
if "services" in integration_data:
count_per_state = {state.value: 0 for state in AlertGroupState}
for service_name in integration_data["services"]:
for state in AlertGroupState:
count_per_state[state.value] += integration_data["services"][service_name][state.value]
# todo:metrics: with enabling service_name label move "add_metric" under
# "for service_name..." iteration
for state_name, counter in count_per_state.items():
for service_name in integration_data["services"]:
for state in AlertGroupState:
alert_groups_total.add_metric(
labels_values + [state_name],
# todo:metrics: replace [state.value] when all metric cache is updated
# + [service_name, state.value],
counter,
labels_values + [service_name, state.value],
integration_data["services"][service_name][state.value],
)
else:
for state in AlertGroupState:
alert_groups_total.add_metric(labels_values + [state.value], integration_data[state.value])
org_id_from_key = RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0]
processed_org_ids.add(int(org_id_from_key))
missing_org_ids = org_ids - processed_org_ids
Expand Down Expand Up @@ -146,27 +135,16 @@ def _get_response_time_metric(self, org_ids):
]
labels_values = list(map(str, labels_values))

# clause below is needed for compatibility with old metric cache during rollout metrics with services
if "services" in integration_data:
response_time_values = []
# todo:metrics: for service_name, response_time
for _, response_time in integration_data["services"].items():
if not response_time:
continue
response_time_values.extend(response_time)
else:
response_time_values = integration_data["response_time"]
if not response_time_values:
for service_name, response_time in integration_data["services"].items():
if not response_time:
continue
# todo:metrics: with enabling service_name label move "add_metric" under
# "for service_name, response_time..." iteration
buckets, sum_value = self.get_buckets_with_sum(response_time_values)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(
labels_values, # + [service_name] todo:metrics: uncomment when all metric cache is updated
buckets=buckets,
sum_value=sum_value,
)
buckets, sum_value = self.get_buckets_with_sum(response_time)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(
labels_values + [service_name],
buckets=buckets,
sum_value=sum_value,
)
org_id_from_key = RE_ALERT_GROUPS_RESPONSE_TIME.match(org_key).groups()[0]
processed_org_ids.add(int(org_id_from_key))
missing_org_ids = org_ids - processed_org_ids
Expand Down
141 changes: 8 additions & 133 deletions engine/apps/metrics_exporter/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
METRICS_TEST_INSTANCE_SLUG = "test_instance"
METRICS_TEST_INSTANCE_ID = 292 # random number
METRICS_TEST_USER_USERNAME = "Alex"
METRICS_TEST_SERVICE_NAME = "test_service"


@pytest.fixture()
Expand Down Expand Up @@ -45,84 +46,11 @@ def _mock_cache_get(key, *args, **kwargs):
"acknowledged": 3,
"resolved": 5,
},
},
},
},
ALERT_GROUPS_RESPONSE_TIME: {
1: {
"integration_name": "Test metrics integration",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"services": {
NO_SERVICE_VALUE: [2, 10, 200, 650],
},
}
},
USER_WAS_NOTIFIED_OF_ALERT_GROUPS: {
1: {
"org_id": 1,
"slug": "Test stack",
"id": 1,
"user_username": "Alex",
"counter": 4,
}
},
}
return test_metrics.get(key)

def _mock_cache_get_many(keys, *args, **kwargs):
return {key: _mock_cache_get(key) for key in keys if _mock_cache_get(key)}

monkeypatch.setattr(cache, "get", _mock_cache_get)
monkeypatch.setattr(cache, "get_many", _mock_cache_get_many)


# todo:metrics: remove later when all cache is updated
@pytest.fixture() # used for test backwards compatibility with old version of metrics
def mock_cache_get_metrics_for_collector_mixed_versions(monkeypatch):
def _mock_cache_get(key, *args, **kwargs):
if ALERT_GROUPS_TOTAL in key:
key = ALERT_GROUPS_TOTAL
elif ALERT_GROUPS_RESPONSE_TIME in key:
key = ALERT_GROUPS_RESPONSE_TIME
elif USER_WAS_NOTIFIED_OF_ALERT_GROUPS in key:
key = USER_WAS_NOTIFIED_OF_ALERT_GROUPS
test_metrics = {
ALERT_GROUPS_TOTAL: {
1: {
"integration_name": "Test metrics integration",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"firing": 2,
"acknowledged": 3,
"silenced": 4,
"resolved": 5,
},
2: {
"integration_name": "Test metrics integration 2",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"services": {
NO_SERVICE_VALUE: {
"firing": 2,
"silenced": 4,
"acknowledged": 3,
"resolved": 5,
},
"test_service": {
"firing": 10,
"silenced": 10,
"acknowledged": 10,
"resolved": 10,
METRICS_TEST_SERVICE_NAME: {
"firing": 12,
"silenced": 14,
"acknowledged": 13,
"resolved": 15,
},
},
},
Expand All @@ -135,17 +63,8 @@ def _mock_cache_get(key, *args, **kwargs):
"org_id": 1,
"slug": "Test stack",
"id": 1,
"response_time": [2, 10, 200, 650],
},
2: {
"integration_name": "Test metrics integration 2",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"services": {NO_SERVICE_VALUE: [2, 10, 200, 650], "test_service": [4, 8, 12]},
},
"services": {NO_SERVICE_VALUE: [2, 10, 200, 650], METRICS_TEST_SERVICE_NAME: [4, 12, 20]},
}
},
USER_WAS_NOTIFIED_OF_ALERT_GROUPS: {
1: {
Expand Down Expand Up @@ -227,50 +146,6 @@ def cache_get(key, *args, **kwargs):
return _make_cache_params


# todo:metrics: remove later when all cache is updated
@pytest.fixture
def make_metrics_cache_params_old_version(monkeypatch):
def _make_cache_params(integration_id, organization_id, team_name=None, team_id=None):
team_name = team_name or "No team"
team_id = team_id or "no_team"
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id)

def cache_get(key, *args, **kwargs):
metrics_data = {
metric_alert_groups_response_time_key: {
integration_id: {
"integration_name": METRICS_TEST_INTEGRATION_NAME,
"team_name": team_name,
"team_id": team_id,
"org_id": METRICS_TEST_ORG_ID,
"slug": METRICS_TEST_INSTANCE_SLUG,
"id": METRICS_TEST_INSTANCE_ID,
"response_time": [],
}
},
metric_alert_groups_total_key: {
integration_id: {
"integration_name": METRICS_TEST_INTEGRATION_NAME,
"team_name": team_name,
"team_id": team_id,
"org_id": METRICS_TEST_ORG_ID,
"slug": METRICS_TEST_INSTANCE_SLUG,
"id": METRICS_TEST_INSTANCE_ID,
"firing": 0,
"acknowledged": 0,
"silenced": 0,
"resolved": 0,
}
},
}
return metrics_data.get(key, {})

return cache_get

return _make_cache_params


@pytest.fixture
def make_user_was_notified_metrics_cache_params(monkeypatch):
def _make_cache_params(user_id, organization_id):
Expand Down