diff --git a/.env.cluster b/.env.cluster index 33aec3b6..03a0e74a 100644 --- a/.env.cluster +++ b/.env.cluster @@ -28,7 +28,7 @@ REPMGR_PARTNER_NODES=santempi-psql-1,santempi-psql-2,santempi-psql-3 # Reverse Proxy - Nginx REVERSE_PROXY_INSTANCES=3 DOMAIN_NAME=domain -SUBDOMAINS=openhimcomms.domain,openhimcore.domain,openhimconsole.domain,kibana.domain,reports.domain,santewww.domain,santempi.domain,superset.domain,keycloak.domain,grafana.domain +SUBDOMAINS=openhimcomms.domain,openhimcore.domain,openhimconsole.domain,kibana.domain,reports.domain,santewww.domain,santempi.domain,superset.domain,keycloak.domain,grafana.domain,minio.domain STAGING=true INSECURE=false diff --git a/.github/workflows/run-tests.sh b/.github/workflows/run-tests.sh index 2bff57be..9ea83436 100755 --- a/.github/workflows/run-tests.sh +++ b/.github/workflows/run-tests.sh @@ -49,6 +49,8 @@ else DOCKER_HOST=ssh://ubuntu@$GITHUB_RUN_ID.jembi.cloud yarn test:"$NODE_MODE":hapi elif [[ $folder_name == *"santempi"* ]]; then DOCKER_HOST=ssh://ubuntu@$GITHUB_RUN_ID.jembi.cloud yarn test:"$NODE_MODE":sante + elif [[ $folder_name == *"monitoring"* ]]; then + DOCKER_HOST=ssh://ubuntu@$GITHUB_RUN_ID.jembi.cloud yarn test:"$NODE_MODE":monitoring fi done fi diff --git a/identity-access-manager-keycloak/docker-compose.yml b/identity-access-manager-keycloak/docker-compose.yml index b55d7497..50663bad 100644 --- a/identity-access-manager-keycloak/docker-compose.yml +++ b/identity-access-manager-keycloak/docker-compose.yml @@ -2,8 +2,14 @@ version: '3.9' services: identity-access-manager-keycloak: - image: keycloak/keycloak - command: ["start", "--proxy=edge", "--hostname-url=${KC_FRONTEND_URL}", "--import-realm"] + image: keycloak/keycloak:20.0 + command: + [ + "start", + "--proxy=edge", + "--hostname-url=${KC_FRONTEND_URL}", + "--import-realm" + ] hostname: identity-access-manager-keycloak healthcheck: test: curl --fail http://localhost:8080/health/ready || exit 1 diff --git 
a/infrastructure/ansible/roles/docker/files/docker-daemon.json b/infrastructure/ansible/roles/docker/files/docker-daemon.json index b8ab8c9d..54d65077 100644 --- a/infrastructure/ansible/roles/docker/files/docker-daemon.json +++ b/infrastructure/ansible/roles/docker/files/docker-daemon.json @@ -2,6 +2,7 @@ "log-driver": "json-file", "log-opts": { "max-size": "10m", - "max-file": "3" + "max-file": "3", + "labels-regex": "^.+" } } diff --git a/monitoring/docker-compose.cluster.yml b/monitoring/docker-compose.cluster.yml index 93a25cc2..a1d307d0 100644 --- a/monitoring/docker-compose.cluster.yml +++ b/monitoring/docker-compose.cluster.yml @@ -29,6 +29,96 @@ services: - '--web.console.templates=/etc/prometheus/consoles' - '--web.enable-lifecycle' + minio-01: + deploy: + placement: + constraints: + - "node.labels.name==node-1" + + minio-02: + image: quay.io/minio/minio:RELEASE.2022-10-24T18-35-07Z + entrypoint: sh + command: -c 'mkdir -p /data1/loki /data2/loki && minio server --console-address ":9001" http://minio-0{1...4}/data{1...2}' + environment: + MINIO_ROOT_USER: ${MO_SECURITY_ADMIN_USER} + MINIO_ROOT_PASSWORD: ${MO_SECURITY_ADMIN_PASSWORD} + healthcheck: + test: + [ + "CMD", + "curl", + "-f", + "http://localhost:9000/minio/health/live" + ] + interval: 30s + timeout: 20s + retries: 3 + hostname: minio-02 + volumes: + - minio-02-data1:/data1 + - minio-02-data2:/data2 + deploy: + placement: + constraints: + - "node.labels.name==node-1" + replicas: 1 + + minio-03: + image: quay.io/minio/minio:RELEASE.2022-10-24T18-35-07Z + entrypoint: sh + command: -c 'mkdir -p /data1/loki /data2/loki && minio server --console-address ":9001" http://minio-0{1...4}/data{1...2}' + environment: + MINIO_ROOT_USER: ${MO_SECURITY_ADMIN_USER} + MINIO_ROOT_PASSWORD: ${MO_SECURITY_ADMIN_PASSWORD} + healthcheck: + test: + [ + "CMD", + "curl", + "-f", + "http://localhost:9000/minio/health/live" + ] + interval: 30s + timeout: 20s + retries: 3 + hostname: minio-03 + volumes: + - 
minio-03-data1:/data1 + - minio-03-data2:/data2 + deploy: + placement: + constraints: + - "node.labels.name==node-2" + replicas: 1 + + minio-04: + image: quay.io/minio/minio:RELEASE.2022-10-24T18-35-07Z + entrypoint: sh + command: -c 'mkdir -p /data1/loki /data2/loki && minio server --console-address ":9001" http://minio-0{1...4}/data{1...2}' + environment: + MINIO_ROOT_USER: ${MO_SECURITY_ADMIN_USER} + MINIO_ROOT_PASSWORD: ${MO_SECURITY_ADMIN_PASSWORD} + healthcheck: + test: + [ + "CMD", + "curl", + "-f", + "http://localhost:9000/minio/health/live" + ] + interval: 30s + timeout: 20s + retries: 3 + hostname: minio-04 + volumes: + - minio-04-data1:/data1 + - minio-04-data2:/data2 + deploy: + placement: + constraints: + - "node.labels.name==node-3" + replicas: 1 + configs: prometheus.yml: file: ./prometheus/prometheus.yml @@ -38,3 +128,9 @@ configs: volumes: prometheus_data_backup: + minio-02-data1: + minio-02-data2: + minio-03-data1: + minio-03-data2: + minio-04-data1: + minio-04-data2: diff --git a/monitoring/docker-compose.dev.yml b/monitoring/docker-compose.dev.yml index c5138a68..38b4c708 100644 --- a/monitoring/docker-compose.dev.yml +++ b/monitoring/docker-compose.dev.yml @@ -12,3 +12,15 @@ services: - target: 9090 published: 9090 mode: host + + loki: + ports: + - target: 3100 + published: 3100 + mode: host + + minio-01: + ports: + - target: 9001 + published: 9005 + mode: host diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index c74cc214..67037abe 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -4,7 +4,7 @@ services: grafana: image: grafana/grafana-oss:9.2.3 volumes: - - grafana_data:/var/lib/grafana + - grafana-data:/var/lib/grafana environment: GF_SECURITY_ADMIN_USER: ${GF_SECURITY_ADMIN_USER} GF_SECURITY_ADMIN_PASSWORD: ${GF_SECURITY_ADMIN_PASSWORD} @@ -49,12 +49,14 @@ services: source: kminion-groups_rev1.json - target: /etc/grafana/provisioning/dashboards/applications/kminion-topic_rev1.json 
source: kminion-topic_rev1.json + - target: /etc/grafana/provisioning/dashboards/containers/logging-universal-dashboard_rev1.json + source: logging-universal-dashboard_rev1.json prometheus: image: prom/prometheus:v2.38.0 user: root volumes: - - prometheus_data:/prometheus + - prometheus-data:/prometheus - /var/run/docker.sock:/var/run/docker.sock:ro configs: - target: /etc/prometheus/prometheus.yml @@ -88,11 +90,58 @@ services: deploy: mode: global - prometheus-kafka-adapter: - image: telefonica/prometheus-kafka-adapter:1.8.0 + loki: + image: grafana/loki:2.6.1 + volumes: + - loki-data:/tmp/loki + environment: + MO_SECURITY_ADMIN_USER: ${MO_SECURITY_ADMIN_USER} + MO_SECURITY_ADMIN_PASSWORD: ${MO_SECURITY_ADMIN_PASSWORD} + configs: + - target: /etc/loki/loki-config.yml + source: loki-config.yml + command: -config.file=/etc/loki/loki-config.yml -config.expand-env=true + deploy: + labels: + - prometheus-job-service=loki + - prometheus-address=loki:3100 + + promtail: + image: grafana/promtail:2.6.1 + volumes: + - /var/lib/docker/containers:/host/containers + - /var/log:/var/log:ro + configs: + - target: /etc/promtail/promtail-config.yml + source: promtail-config.yml + command: -config.file=/etc/promtail/promtail-config.yml + deploy: + mode: global + + minio-01: + image: quay.io/minio/minio:RELEASE.2022-10-24T18-35-07Z + entrypoint: sh + command: -c 'mkdir -p /data1/loki /data2/loki && minio server --console-address ":9001" http://minio-0{1...${NUM_MINIO_SERVERS}}/data{1...2}' environment: - - KAFKA_BROKER_LIST=kafka:9092 - - KAFKA_COMPRESSION=gzip + MINIO_ROOT_USER: ${MO_SECURITY_ADMIN_USER} + MINIO_ROOT_PASSWORD: ${MO_SECURITY_ADMIN_PASSWORD} + healthcheck: + test: + [ + "CMD", + "curl", + "-f", + "http://localhost:9000/minio/health/live" + ] + interval: 30s + timeout: 20s + retries: 3 + hostname: minio-01 + volumes: + - minio-01-data1:/data1 + - minio-01-data2:/data2 + deploy: + replicas: 1 configs: grafana.ini: @@ -135,12 +184,30 @@ configs: name: 
kminion-topic_rev1.json-${kminion_topic_rev1_json_DIGEST:?err} labels: name: grafana + logging-universal-dashboard_rev1.json: + file: ./grafana/dashboards/containers/logging-universal-dashboard_rev1.json + name: logging-universal-dashboard_rev1.json-${logging_universal_dashboard_rev1_json_DIGEST:?err} + labels: + name: grafana prometheus.yml: file: ./prometheus/prometheus.yml name: prometheus.yml-${prometheus_yml_DIGEST:?err} labels: name: prometheus + loki-config.yml: + file: ./loki/loki-config.yml + name: loki-config.yml-${loki_config_yml_DIGEST:?err} + labels: + name: loki + promtail-config.yml: + file: ./promtail/promtail-config.yml + name: promtail-config.yml-${promtail_config_yml_DIGEST:?err} + labels: + name: promtail volumes: - prometheus_data: - grafana_data: + prometheus-data: + grafana-data: + loki-data: + minio-01-data1: + minio-01-data2: diff --git a/monitoring/grafana/dashboards/containers/logging-universal-dashboard_rev1.json b/monitoring/grafana/dashboards/containers/logging-universal-dashboard_rev1.json new file mode 100644 index 00000000..1229002d --- /dev/null +++ b/monitoring/grafana/dashboards/containers/logging-universal-dashboard_rev1.json @@ -0,0 +1,926 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Universal and flexible dashboard for logging", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 12611, + "graphTooltip": 0, + "id": 11, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "description": "Total count of log lines in the specified time range", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + 
"mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "rgb(31, 255, 7)", + "text": "0" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(31, 255, 7)", + "value": null + }, + { + "color": "rgb(31, 255, 7)", + "value": 10 + }, + { + "color": "rgb(31, 255, 7)", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 11, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "sum" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"})[$__interval]))", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "Total count of logs", + "type": "stat" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "description": "Total Count: of $searchable_pattern in the specified time range", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "rgb(222, 15, 43)", + "text": "0" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(222, 15, 43)", + "value": null + }, + { + "color": "rgb(222, 15, 43)", + "value": 10 + }, + { + "color": "rgb(222, 15, 43)", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 6, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + 
"justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "sum" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\")[$__interval]))", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "Total Count: of \"$searchable_pattern\"", + "type": "stat" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "description": "Live logs is a like 'tail -f | grep' in a real time", + "gridPos": { + "h": 22, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 2, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": true, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "{swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\"", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "Live logs (filtered by \"$searchable_pattern\")", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 0, + "y": 25 + }, + "id": 19, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", 
+ "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "7.0.4", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"})[$__interval])) by (stream)", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "Total count of stderr / stdout pie", + "type": "piechart" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 7, + "y": 25 + }, + "id": 20, + "interval": "1m", + "links": [], + "maxDataPoints": "", + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "6.4.3", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\")[$__interval])) by (swarm_task_name)", + "queryType": "range", + "refId": "A" + } + ], + "title": "Matched word: \"$searchable_pattern\" donut", + "type": "piechart" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "#299c46", + "text": "0" + } + }, + "type": 
"special" + } + ], + "max": 100, + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "#C4162A", + "value": 50 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 19, + "y": 25 + }, + "id": 9, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\")[$__interval])) * 100 / sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"})[$__interval]))", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "\"$searchable_pattern\" Percentage for specified time", + "type": "gauge" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Count", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 18, + "interval": "1m", + "links": [], + "maxDataPoints": "", + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\")[$__interval])) by (swarm_task_name)", + "queryType": "range", + "refId": "A" + } + ], + "title": "Matched word: \"$searchable_pattern\" historical", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + 
"x": 12, + "y": 32 + }, + "id": 10, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(rate(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\")[30s])) by (swarm_task_name)", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "\"$searchable_pattern\" Rate per Task", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 6, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "{stream=\"stderr\"} stderr" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C4162A", + "mode": "fixed" + } + }, + { + "id": "custom.lineWidth", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "{stream=\"stdout\"} stdout" + }, + "properties": [ + { 
+ "id": "color", + "value": { + "fixedColor": "#56A64B", + "mode": "fixed" + } + }, + { + "id": "custom.lineWidth", + "value": 2 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 7, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"})[$__interval])) by (stream)", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "Count of stderr / stdout historical", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 37, + "style": "dark", + "tags": [ + "Loki", + "logging" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "instant_prometheus-kafka-adapter", + "value": "instant_prometheus-kafka-adapter" + }, + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "definition": "label_values({swarm_service_name=~\".+\"}, swarm_service_name)", + "hide": 0, + "includeAll": false, + "label": "Service", + "multi": false, + "name": "service_name", + "options": [], + "query": "label_values({swarm_service_name=~\".+\"}, swarm_service_name)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "definition": "label_values({swarm_service_name=\"$service_name\"}, swarm_task_name)", + "hide": 0, + "includeAll": true, + "label": "Task", + "multi": true, + "name": 
"task_name", + "options": [], + "query": "label_values({swarm_service_name=\"$service_name\"}, swarm_task_name)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "definition": "label_values({swarm_service_name=\"$service_name\"}, stream)", + "hide": 0, + "includeAll": true, + "label": "Stream", + "multi": false, + "name": "stream", + "options": [], + "query": "label_values({swarm_service_name=\"$service_name\"}, stream)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "error", + "value": "error" + }, + "hide": 0, + "label": "Search (case insensitive)", + "name": "searchable_pattern", + "options": [ + { + "selected": true, + "text": "error", + "value": "error" + } + ], + "query": "error", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Container logs", + "uid": "fRIvzUZMf", + "version": 4, + "weekStart": "" +} diff --git a/monitoring/loki/loki-config.yml b/monitoring/loki/loki-config.yml new file mode 100644 index 00000000..40fd5c11 --- /dev/null +++ b/monitoring/loki/loki-config.yml @@ -0,0 +1,63 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + grpc_server_max_concurrent_streams: 0 + +ingester: + wal: + enabled: true + dir: /tmp/wal + lifecycler: + address: 127.0.0.1 + ring: + kvstore: + store: inmemory + replication_factor: 1 + final_sleep: 0s + chunk_idle_period: 1h + max_chunk_age: 1h + 
chunk_target_size: 1048576 + chunk_retain_period: 30s + max_transfer_retries: 0 + +schema_config: + configs: + - from: 2020-07-01 + store: boltdb-shipper + object_store: aws + schema: v11 + index: + prefix: index_ + period: 24h + +storage_config: + boltdb_shipper: + active_index_directory: /loki/boltdb-shipper-active + cache_location: /loki/boltdb-shipper-cache + resync_interval: 30s + shared_store: s3 + aws: + s3: http://${MO_SECURITY_ADMIN_USER}:${MO_SECURITY_ADMIN_PASSWORD}@minio-01.:9000/loki + s3forcepathstyle: true + +compactor: + working_directory: /loki/boltdb-shipper-compactor + shared_store: s3 + +limits_config: + reject_old_samples: true + reject_old_samples_max_age: 168h + ingestion_rate_mb: 100 + ingestion_burst_size_mb: 150 + max_concurrent_tail_requests: 200 + max_cache_freshness_per_query: 10m + max_streams_per_user: 500 + +chunk_store_config: + max_look_back_period: 0s + +table_manager: + retention_deletes_enabled: false + retention_period: 0s diff --git a/monitoring/package-metadata.json b/monitoring/package-metadata.json index 69d9b388..2a2077a9 100644 --- a/monitoring/package-metadata.json +++ b/monitoring/package-metadata.json @@ -4,7 +4,7 @@ "description": "A package for monitoring the platform services", "type": "infrastructure", "version": "0.0.1", - "dependencies": ["message-bus-kafka"], + "dependencies": [], "environmentVariables": { "GF_SECURITY_ADMIN_USER": "admin", "GF_SECURITY_ADMIN_PASSWORD": "dev_password_only", @@ -19,6 +19,8 @@ "KC_GRAFANA_CLIENT_ID": "grafana-oauth", "KC_GRAFANA_CLIENT_SECRET": "CV14QfwnpYFj1IH5dK5lScPNCYAIYP1c", "GF_SERVER_DOMAIN": "localhost", - "GF_SERVER_ROOT_URL": "http://localhost:3000" + "GF_SERVER_ROOT_URL": "http://localhost:3000", + "MO_SECURITY_ADMIN_USER": "admin", + "MO_SECURITY_ADMIN_PASSWORD": "dev_password_only" } } diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml index b327a3bf..e0e71c5e 100644 --- a/monitoring/prometheus/prometheus.yml +++ 
b/monitoring/prometheus/prometheus.yml @@ -74,6 +74,3 @@ scrape_configs: # Use the prometheus-job Swarm label as Prometheus job label. - source_labels: [__meta_dockerswarm_service_label_prometheus_job_task] target_label: job - -remote_write: - - url: "http://prometheus-kafka-adapter:8080/receive" diff --git a/monitoring/promtail/promtail-config.yml b/monitoring/promtail/promtail-config.yml new file mode 100644 index 00000000..1cce8fe2 --- /dev/null +++ b/monitoring/promtail/promtail-config.yml @@ -0,0 +1,47 @@ +server: + http_listen_address: 0.0.0.0 + http_listen_port: 9080 + +positions: + filename: /tmp/positions.yaml + +clients: +- url: http://loki:3100/loki/api/v1/push + +scrape_configs: + +- job_name: containers + static_configs: + - targets: + - localhost + labels: + job: containerlogs + __path__: /host/containers/*/*log + + pipeline_stages: + - json: + expressions: + log: log + stream: stream + time: time + tag: attrs.tag + stack_name: attrs."com.docker.stack.namespace" + swarm_service_name: attrs."com.docker.swarm.service.name" + swarm_task_name: attrs."com.docker.swarm.task.name" + swarm_node_id: attrs."com.docker.swarm.node.id" + - regex: + expression: "^/host/containers/(?P<container_id>.{12}).+/.+-json.log$" + source: filename + - timestamp: + format: RFC3339Nano + source: time + - labels: + stream: + container_id: + tag: + stack_name: + swarm_service_name: + swarm_task_name: + swarm_node_id: + - output: + source: log diff --git a/monitoring/swarm.sh b/monitoring/swarm.sh index ce4ba5dc..6f6c9f0f 100644 --- a/monitoring/swarm.sh +++ b/monitoring/swarm.sh @@ -21,18 +21,23 @@ function init_vars() { SCALED_SERVICES=( "grafana" "prometheus" - "prometheus-kafka-adapter" + "loki" + "minio-01" ) if [[ "${CLUSTERED_MODE}" == "true" ]]; then SCALED_SERVICES=( "${SCALED_SERVICES[@]}" "prometheus_backup" + "minio-02" + "minio-03" + "minio-04" ) fi SERVICE_NAMES=( "${SCALED_SERVICES[@]}" "cadvisor" "node-exporter" + "promtail" ) readonly ACTION @@ -56,6 +61,9 @@ function
initialize_package() { if [[ "${CLUSTERED_MODE}" == "true" ]]; then monitoring_cluster_compose_filename="docker-compose.cluster.yml" + export NUM_MINIO_SERVERS=4 + else + export NUM_MINIO_SERVERS=1 fi if [[ "${MODE}" == "dev" ]]; then @@ -77,19 +85,21 @@ function initialize_package() { function scale_services_down() { docker::scale_services_down "${SCALED_SERVICES[@]}" - docker::service_destroy "cadvisor" "node-exporter" + docker::service_destroy "cadvisor" "node-exporter" "promtail" } function destroy_package() { docker::service_destroy "${SERVICE_NAMES[@]}" - docker::try_remove_volume prometheus_data grafana_data + docker::try_remove_volume prometheus-data grafana-data minio-01-data1 minio-01-data2 prometheus_data_backup loki-data if [[ $CLUSTERED_MODE == "true" ]]; then + sleep 5 + docker::try_remove_volume minio-02-data1 minio-02-data2 log warn "Volumes are only deleted on the host on which the command is run. Monitoring volumes on other nodes are not deleted" fi - docker::prune_configs "grafana" "prometheus" + docker::prune_configs "grafana" "prometheus" "promtail" "loki" } main() { diff --git a/reverse-proxy-nginx/package-conf-insecure/http-minio-insecure.conf b/reverse-proxy-nginx/package-conf-insecure/http-minio-insecure.conf new file mode 100644 index 00000000..f0f6be12 --- /dev/null +++ b/reverse-proxy-nginx/package-conf-insecure/http-minio-insecure.conf @@ -0,0 +1,9 @@ +server { + listen 9001; + + location / { + resolver 127.0.0.11 valid=30s; + set $upstream_minio minio-01; + proxy_pass http://$upstream_minio:9001; + } +} diff --git a/reverse-proxy-nginx/package-conf-secure/http-minio-secure.conf b/reverse-proxy-nginx/package-conf-secure/http-minio-secure.conf new file mode 100644 index 00000000..3defe27e --- /dev/null +++ b/reverse-proxy-nginx/package-conf-secure/http-minio-secure.conf @@ -0,0 +1,31 @@ +server { + listen 80; + server_name minio.*; + + location /.well-known/acme-challenge/ { + resolver 127.0.0.11 valid=30s; + set $upstream_certbot 
certbot; + proxy_pass http://$upstream_certbot$request_uri; + } + + location / { + return 301 https://$host$request_uri; + } +} +server { + listen 443 ssl; + listen [::]:443 ssl; + server_name minio.*; + + location /.well-known/acme-challenge/ { + resolver 127.0.0.11 valid=30s; + set $upstream_certbot certbot; + proxy_pass http://$upstream_certbot$request_uri; + } + + location / { + resolver 127.0.0.11 valid=30s; + set $upstream_minio minio-01; + proxy_pass http://$upstream_minio:9001; + } +} diff --git a/reverse-proxy-nginx/package-metadata.json b/reverse-proxy-nginx/package-metadata.json index cbd0d183..a5904958 100644 --- a/reverse-proxy-nginx/package-metadata.json +++ b/reverse-proxy-nginx/package-metadata.json @@ -16,6 +16,6 @@ "RENEWAL_EMAIL": "dummy@jembi.org", "STAGING": "true", "INSECURE": "true", - "INSECURE_PORTS": "5001:5001-80:80-8080:8080-5601:5601-5488:5488-3000:3000-9200:9200-8089:8089" + "INSECURE_PORTS": "5001:5001-80:80-8080:8080-5601:5601-5488:5488-3000:3000-9200:9200-8089:8089-9001:9001" } } diff --git a/test/cucumber/features/cluster-mode/kafka-packages.cluster.feature b/test/cucumber/features/cluster-mode/kafka-packages.cluster.feature index fd64d186..9b09deb4 100644 --- a/test/cucumber/features/cluster-mode/kafka-packages.cluster.feature +++ b/test/cucumber/features/cluster-mode/kafka-packages.cluster.feature @@ -1,5 +1,5 @@ Feature: Kafka and its dependent packages? - Does Kafka and its dependent packages work as expected + Does Kafka and its dependent packages work as expected Scenario: Init Message Bus Kafka Given I use parameters "package init -n=message-bus-kafka --dev --env-file=.env.cluster" @@ -13,19 +13,8 @@ Feature: Kafka and its dependent packages? 
And The service "message-bus-kafka-config-importer" should be removed And There should be 6 services - Scenario: Init Monitoring - Given I use parameters "package init -n=monitoring --only --dev --env-file=.env.cluster" - When I launch the platform with params - Then The service "grafana" should be started with 1 replica - And The service "prometheus" should be started with 1 replica - And The service "prometheus-kafka-adapter" should be started with 1 replica - And The service "prometheus_backup" should be started with 1 replica - And The service "cadvisor" should be started with 3 replicas - And The service "node-exporter" should be started with 3 replicas - And The service "cadvisor" should have healthy containers - Scenario: Destroy Kafka and its dependent packages - Given I use parameters "package destroy -n=monitoring --dev --env-file=.env.cluster" + Given I use parameters "package destroy -n=message-bus-kafka --dev --env-file=.env.cluster" When I launch the platform with params Then The service "zookeeper-1" should be removed And The service "zookeeper-2" should be removed @@ -33,12 +22,6 @@ Feature: Kafka and its dependent packages? And The service "kafka" should be removed And The service "kafdrop" should be removed And The service "kafka-minion" should be removed - And The service "grafana" should be removed - And The service "prometheus" should be removed - And The service "prometheus-kafka-adapter" should be removed - And The service "prometheus_backup" should be removed - And The service "cadvisor" should be removed - And The service "node-exporter" should be removed And There should be 0 service And There should be 0 volume And There should be 0 config diff --git a/test/cucumber/features/cluster-mode/monitoring.feature b/test/cucumber/features/cluster-mode/monitoring.feature new file mode 100644 index 00000000..7170a50a --- /dev/null +++ b/test/cucumber/features/cluster-mode/monitoring.feature @@ -0,0 +1,36 @@ +Feature: Monitoring package? 
+ Does the Monitoring package work as expected + + Scenario: Init Monitoring + Given I use parameters "package init -n=monitoring --only --dev --env-file=.env.cluster" + When I launch the platform with params + Then The service "grafana" should be started with 1 replica + And The service "prometheus" should be started with 1 replica + And The service "cadvisor" should be started with 3 replicas + And The service "node-exporter" should be started with 3 replicas + And The service "loki" should be started with 1 replica + And The service "promtail" should be started with 3 replicas + And The service "minio-01" should be started with 1 replica + And The service "minio-02" should be started with 1 replica + And The service "minio-03" should be started with 1 replica + And The service "minio-04" should be started with 1 replica + And The service "prometheus_backup" should be started with 1 replica + And There should be 7 volumes + + Scenario: Destroy Monitoring package + Given I use parameters "package destroy -n=monitoring --dev --env-file=.env.cluster" + When I launch the platform with params + Then The service "grafana" should be removed + And The service "prometheus" should be removed + And The service "cadvisor" should be removed + And The service "node-exporter" should be removed + And The service "loki" should be removed + And The service "promtail" should be removed + And The service "minio-01" should be removed + And The service "minio-02" should be removed + And The service "minio-03" should be removed + And The service "minio-04" should be removed + And The service "prometheus_backup" should be removed + And There should be 0 service + And There should be 0 volume + And There should be 0 config diff --git a/test/cucumber/features/single-mode/kafka-packages.feature b/test/cucumber/features/single-mode/kafka-packages.feature index caced467..774eddf8 100644 --- a/test/cucumber/features/single-mode/kafka-packages.feature +++
b/test/cucumber/features/single-mode/kafka-packages.feature @@ -1,5 +1,5 @@ Feature: Kafka and its dependent packages? - Does Kafka and its dependent packages work as expected + Does Kafka and its dependent packages work as expected Scenario: Init Message Bus Kafka Given I use parameters "package init -n=message-bus-kafka --dev --env-file=.env.local" @@ -22,19 +22,8 @@ Feature: Kafka and its dependent packages? When I launch the platform with params Then The service "kafka-unbundler-consumer" should be started with 1 replica - Scenario: Init Monitoring - Given I use parameters "package init -n=monitoring --only --dev --env-file=.env.local" - When I launch the platform with params - Then The service "grafana" should be started with 1 replica - And The service "prometheus" should be started with 1 replica - And The service "prometheus-kafka-adapter" should be started with 1 replica - And The service "cadvisor" should be started with 1 replica - And The service "node-exporter" should be started with 1 replica - And The service "cadvisor" should have healthy containers - And There should be 4 volumes - Scenario: Destroy Kafka and its dependent packages - Given I use parameters "package destroy -n=kafka-mapper-consumer,kafka-unbundler-consumer,monitoring --dev --env-file=.env.local" + Given I use parameters "package destroy -n=kafka-mapper-consumer,kafka-unbundler-consumer --dev --env-file=.env.local" When I launch the platform with params Then The service "zookeeper-1" should be removed And The service "kafka" should be removed @@ -42,11 +31,6 @@ Feature: Kafka and its dependent packages? 
And The service "kafka-minion" should be removed And The service "kafka-mapper-consumer" should be removed And The service "kafka-unbundler-consumer" should be removed - And The service "grafana" should be removed - And The service "prometheus" should be removed - And The service "prometheus-kafka-adapter" should be removed - And The service "cadvisor" should be removed - And The service "node-exporter" should be removed And There should be 0 service And There should be 0 volume And There should be 0 config diff --git a/test/cucumber/features/single-mode/monitoring.feature b/test/cucumber/features/single-mode/monitoring.feature new file mode 100644 index 00000000..80a12ddf --- /dev/null +++ b/test/cucumber/features/single-mode/monitoring.feature @@ -0,0 +1,29 @@ +Feature: Monitoring package? + Does the Monitoring package work as expected + + Scenario: Init Monitoring + Given I use parameters "package init -n=monitoring --only --dev --env-file=.env.local" + When I launch the platform with params + Then The service "grafana" should be started with 1 replica + And The service "prometheus" should be started with 1 replica + And The service "cadvisor" should be started with 1 replica + And The service "node-exporter" should be started with 1 replica + And The service "cadvisor" should have healthy containers + And The service "loki" should be started with 1 replica + And The service "promtail" should be started with 1 replica + And The service "minio-01" should be started with 1 replica + And There should be 6 volumes + + Scenario: Destroy Monitoring package + Given I use parameters "package destroy -n=monitoring --dev --env-file=.env.local" + When I launch the platform with params + Then The service "grafana" should be removed + And The service "prometheus" should be removed + And The service "cadvisor" should be removed + And The service "node-exporter" should be removed + And The service "loki" should be removed + And The service "promtail" should be removed + And
The service "minio-01" should be removed + And There should be 0 service + And There should be 0 volume + And There should be 0 config diff --git a/test/cucumber/package.json b/test/cucumber/package.json index e819aa85..0bbf4878 100644 --- a/test/cucumber/package.json +++ b/test/cucumber/package.json @@ -23,7 +23,9 @@ "test:cluster:keycloak": "cucumber-js 'features/cluster-mode/keycloak.cluster.feature'", "test:single:jsreport": "cucumber-js 'features/single-mode/jsreport.feature'", "test:single:superset": "cucumber-js 'features/single-mode/superset.feature'", - "test:single:mpi-mediator": "cucumber-js 'features/single-mode/mpi-mediator.feature'" + "test:single:mpi-mediator": "cucumber-js 'features/single-mode/mpi-mediator.feature'", + "test:single:monitoring": "cucumber-js 'features/single-mode/monitoring.feature'", + "test:cluster:monitoring": "cucumber-js 'features/cluster-mode/monitoring.feature'" }, "devDependencies": { "@cucumber/cucumber": "8.5.0",