diff --git a/documentation/packages/monitoring/README.md b/documentation/packages/monitoring/README.md index a1aa247b..e6768290 100644 --- a/documentation/packages/monitoring/README.md +++ b/documentation/packages/monitoring/README.md @@ -4,14 +4,15 @@ description: A package for monitoring the platform services # Monitoring -The monitoring package sets up services to monitor the entire deployed stack. This includes the state of the servers involved in the docker swarm, the docker containers themselves and particular applications such as Kafka. +The monitoring package sets up services to monitor the entire deployed stack. This includes the state of the servers involved in the docker swarm, the docker containers themselves and particular applications such as Kafka. It also captures the logs from the various services. This monitoring package uses: * Grafana: for dashboards * Prometheus: for recording metrics * Cadvisor: for reading docker container metrics -* Kafka: for saving a backup of metrics data +* Loki: for storing logs +* Node Exporter: for monitoring host machine metrics like CPU, memory etc To use the monitoring services, include the `monitoring` package id to your list of package ids when standing up the platform. @@ -31,7 +32,20 @@ To use custom metrics for an application, first configure that application to pr - prometheus-job-service=kafka - prometheus-address=kafka-minion:8080 -`prometheus-job` lets Prometheus know to enable monitoring for this container and `prometheus-address` give the endpoint that the monitoring can access the metrics on. By default this is assumed to be at the path `/metrics` by Prometheus. +`prometheus-job-service` lets Prometheus know to enable monitoring for this container and `prometheus-address` gives the endpoint that Prometheus can access the metrics on. By default this is assumed to be at the path `/metrics` by Prometheus. + +By using the `prometheus-job-service` label prometheus will only create a single target for your application even if it is replicated via service config in docker swarm. If you would like to monitor each replica separately (i.e. if metrics are only captured for that replica and not shared to some central location in the application cluster) you can instead used the `prometheus-job-task` label and Prometheus will create a target for each replica. + +A full list od supported labels are listed below: + +* `prometheus-job-service` - indicates this service should be monitored +* `prometheus-job-task` - indicates each task in the replicated service should be monitored separately +* `prometheus-address` - the service address Prometheus can scrape metrics from, can only be used with `prometheus-job-service` +* `prometheus-scheme` - the scheme to use when scaping a task or service (e.g. http or https), defaults to http +* `prometheus-metrics-path` - the path to the metrics endpoint on the target (defaults to /metrics) +* `prometheus-port` - the port of the metrics endpoint. Only usable with `prometheus-job-task`, defaults to all exposed ports for the container if no label is present + +All services must also be on the `prometheus_public` network to be able to be seen by Prometheus for metrics scraping. ## Adding additional dashboards diff --git a/interoperability-layer-openhim/docker-compose.yml b/interoperability-layer-openhim/docker-compose.yml index c2c2347d..f332945d 100644 --- a/interoperability-layer-openhim/docker-compose.yml +++ b/interoperability-layer-openhim/docker-compose.yml @@ -2,7 +2,7 @@ version: '3.9' services: openhim-core: - image: jembi/openhim-core:v8.0.1 + image: jembi/openhim-core:v8.1.1 networks: kafka: hapi-fhir: @@ -10,6 +10,7 @@ services: keycloak: public: default: + prometheus: environment: - mongo_url=${OPENHIM_MONGO_URL} - mongo_atnaUrl=${OPENHIM_MONGO_ATNAURL} @@ -31,6 +32,10 @@ services: reservations: cpus: ${OPENHIM_CPU_RESERVE} memory: ${OPENHIM_MEMORY_RESERVE} + labels: + - prometheus-job-task=openhim + - prometheus-scheme=https + - prometheus-port=8080 openhim-console: image: jembi/openhim-console:v1.18.2 @@ -74,4 +79,7 @@ networks: public: name: openhim_public external: true + prometheus: + name: prometheus_public + external: true default: diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index 2f562a54..afe7d743 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -50,6 +50,10 @@ services: source: kminion-groups_rev1.json - target: /etc/grafana/provisioning/dashboards/applications/kminion-topic_rev1.json source: kminion-topic_rev1.json + - target: /etc/grafana/provisioning/dashboards/applications/openhim_nodejs_dashboard.json + source: openhim_nodejs_dashboard.json + - target: /etc/grafana/provisioning/dashboards/applications/openhim_transactions_dashboard.json + source: openhim_transactions_dashboard.json - target: /etc/grafana/provisioning/dashboards/containers/logging-universal-dashboard_rev1.json source: logging-universal-dashboard_rev1.json networks: @@ -195,6 +199,16 @@ configs: name: kminion-topic_rev1.json-${kminion_topic_rev1_json_DIGEST:?err} labels: name: grafana + openhim_nodejs_dashboard.json: + file: ./grafana/dashboards/applications/openhim_nodejs_dashboard.json + name: openhim_nodejs_dashboard.json-${openhim_nodejs_dashboard_json_DIGEST:?err} + labels: + name: grafana + openhim_transactions_dashboard.json: + file: ./grafana/dashboards/applications/openhim_transactions_dashboard.json + name: openhim_transactions_dashboard.json-${openhim_transactions_dashboard_json_DIGEST:?err} + labels: + name: grafana logging-universal-dashboard_rev1.json: file: ./grafana/dashboards/containers/logging-universal-dashboard_rev1.json name: logging-universal-dashboard_rev1.json-${logging_universal_dashboard_rev1_json_DIGEST:?err} diff --git a/monitoring/grafana/dashboards/applications/openhim_nodejs_dashboard.json b/monitoring/grafana/dashboards/applications/openhim_nodejs_dashboard.json new file mode 100644 index 00000000..a0b3344d --- /dev/null +++ b/monitoring/grafana/dashboards/applications/openhim_nodejs_dashboard.json @@ -0,0 +1,978 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "node.js prometheus client basic metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 11159, + "graphTooltip": 0, + "id": 11, + "links": [], + "liveNow": false, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 10, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "9.2.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "irate(openhim_process_cpu_user_seconds_total{instance=~\"$instance\"}[2m]) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "User CPU - {{instance}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "irate(openhim_process_cpu_system_seconds_total{instance=~\"$instance\"}[2m]) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Sys CPU - {{instance}}", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Process CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 9, + "x": 10, + "y": 0 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "9.2.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "openhim_nodejs_eventloop_lag_seconds{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Event Loop Lag", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 5, + "x": 19, + "y": 0 + }, + "id": 2, + "interval": "", + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "name" + }, + "pluginVersion": "9.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(openhim_nodejs_version_info{instance=~\"$instance\"}) by (version)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{version}}", + "refId": "A" + } + ], + "title": "Node.js Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#F2495C", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 19, + "y": 3 + }, + "id": 4, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(changes(openhim_process_start_time_seconds{instance=~\"$instance\"}[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Process Restart Times", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 16, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "9.2.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "openhim_process_resident_memory_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Process Memory - {{instance}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "openhim_nodejs_heap_size_total_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Heap Total - {{instance}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "openhim_nodejs_heap_size_used_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Heap Used - {{instance}}", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "openhim_nodejs_external_memory_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "External Memory - {{instance}}", + "range": true, + "refId": "D" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Process Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 7 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "9.2.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "openhim_nodejs_active_handles_total{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Active Handler - {{instance}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "openhim_nodejs_active_requests_total{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Active Request - {{instance}}", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Active Handlers/Requests Total", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 14 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "9.2.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "openhim_nodejs_heap_space_size_total_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Heap Total - {{instance}} - {{space}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Heap Total Detail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 14 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "9.2.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "openhim_nodejs_heap_space_size_used_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Heap Used - {{instance}} - {{space}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Heap Used Detail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 14 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "9.2.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "openhim_nodejs_heap_space_size_available_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Heap Used - {{instance}} - {{space}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Heap Available Detail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "refresh": "5s", + "schemaVersion": 37, + "style": "dark", + "tags": [ + "nodejs", + "openhim" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(openhim_nodejs_version_info, instance)", + "hide": 0, + "includeAll": true, + "label": "instance", + "multi": true, + "name": "instance", + "options": [], + "query": { + "query": "label_values(openhim_nodejs_version_info, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "OpenHIM: Node.js", + "uid": "PTSqcpJWk", + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/monitoring/grafana/dashboards/applications/openhim_transactions_dashboard.json b/monitoring/grafana/dashboards/applications/openhim_transactions_dashboard.json new file mode 100644 index 00000000..c2a3e6ce --- /dev/null +++ b/monitoring/grafana/dashboards/applications/openhim_transactions_dashboard.json @@ -0,0 +1,715 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 10, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "range" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "builder", + "expr": "sum(openhim_transactions_total)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total transactions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 11, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false + }, + "pluginVersion": "9.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "builder", + "expr": "sum(rate(openhim_transactions_total[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Current transactions per second", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Successful" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Completed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Completed with error(s)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 9, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true, + "values": [] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "range" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "builder", + "expr": "sum by(status) (openhim_transactions_total)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Transactions by status", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Successful" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Completed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Completed with error(s)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 10, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true, + "values": [] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "range" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "builder", + "expr": "sum by(client) (openhim_transactions_total)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Transactions by client", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "sum by(channel) (rate(openhim_transactions_total[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Transactions per second by channel", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "builder", + "expr": "sum by(client) (rate(openhim_transactions_total[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Transactions per second by client", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "builder", + "expr": "sum by(channel) (rate(openhim_request_duration_sum[$__rate_interval])) / sum by(channel) (rate(openhim_request_duration_count{status=\"Successful\"}[$__rate_interval]))", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Response times by channel", + "type": "timeseries" + } + ], + "refresh": false, + "schemaVersion": 37, + "style": "dark", + "tags": [ + "openhim" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "OpenHIM: Transactions", + "uid": "Cyq97D9Vz", + "version": 5, + "weekStart": "" +} \ No newline at end of file diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml index e0e71c5e..c776a9f8 100644 --- a/monitoring/prometheus/prometheus.yml +++ b/monitoring/prometheus/prometheus.yml @@ -4,14 +4,14 @@ global: scrape_configs: - job_name: "prometheus" static_configs: - - targets: ["prometheus:9090"] + - targets: ["prometheus:9090"] - job_name: "grafana" static_configs: - - targets: ["grafana:3000"] + - targets: ["grafana:3000"] # Create a job for Docker daemons. - - job_name: 'docker' + - job_name: "docker" dockerswarm_sd_configs: - host: unix:///var/run/docker.sock role: nodes @@ -25,23 +25,25 @@ scrape_configs: target_label: instance # Create a job for reading metrics from nodes - - job_name: 'node-exporter' + - job_name: "node-exporter" dns_sd_configs: - - names: - - 'tasks.node-exporter' - type: 'A' - port: 9100 + - names: + - "tasks.node-exporter" + type: "A" + port: 9100 # Create a job for reading metrics from containers - - job_name: 'cadvisor' + - job_name: "cadvisor" dns_sd_configs: - - names: - - 'tasks.cadvisor' - type: 'A' - port: 8080 + - names: + - "tasks.cadvisor" + type: "A" + port: 8080 # Create a job for Docker Swarm services with particular labels (one target for the whole service, possibly multiple containers) - - job_name: 'dockerswarm-services' + - job_name: "dockerswarm-services" + tls_config: + insecure_skip_verify: true dockerswarm_sd_configs: - host: unix:///var/run/docker.sock role: services @@ -56,9 +58,20 @@ scrape_configs: # Use the prometheus-address Swarm label as the target address for swarm service. - source_labels: [__meta_dockerswarm_service_label_prometheus_address] target_label: __address__ + # Use the prometheus-scheme label to set the connection scheme (http or https), defaults to http if no label is present + - source_labels: [__meta_dockerswarm_service_label_prometheus_scheme] + regex: (.+) + target_label: __scheme__ + # Use the prometheus-metrics-path label to set the metric url path, defaults to /metrics if no label is present + - source_labels: + [__meta_dockerswarm_service_label_prometheus_metrics_path] + regex: (.+) + target_label: __metrics_path__ # Create a job for Docker Swarm container with particular labels (one target per container/task) - - job_name: 'dockerswarm-tasks' + - job_name: "dockerswarm-tasks" + tls_config: + insecure_skip_verify: true dockerswarm_sd_configs: - host: unix:///var/run/docker.sock role: tasks @@ -71,6 +84,25 @@ scrape_configs: - source_labels: [__meta_dockerswarm_service_label_prometheus_job_task] regex: .+ action: keep + # Only keep containers on the prometheus_public network. + - source_labels: [__meta_dockerswarm_network_name] + regex: prometheus_public + action: keep # Use the prometheus-job Swarm label as Prometheus job label. - source_labels: [__meta_dockerswarm_service_label_prometheus_job_task] target_label: job + # Use the prometheus-scheme label to set the connection scheme (http or https), defaults to http if no label is present + - source_labels: [__meta_dockerswarm_service_label_prometheus_scheme] + regex: (.+) + target_label: __scheme__ + # Use the prometheus-port label to override the target ports, defaults to all exposed ports for the container if no label is present + - source_labels: + [__address__, __meta_dockerswarm_service_label_prometheus_port] + regex: (.+):.+;(.+) + replacement: $1:$2 + target_label: __address__ + # Use the prometheus-metrics-path label to set the metric url path, defaults to /metrics if no label is present + - source_labels: + [__meta_dockerswarm_service_label_prometheus_metrics_path] + regex: (.+) + target_label: __metrics_path__