apiVersion: v1 kind: Template labels: template: grafana-prometheus message: See https://github.com/nmasse-itix/OpenShift-Docker-Images/tree/master/grafana for more details. metadata: annotations: description: |- The prometheus datasource for Grafana. openshift.io/display-name: Prometheus tags: instant-app template.openshift.io/documentation-url: https://github.com/nmasse-itix/OpenShift-Docker-Images/tree/master/grafana template.openshift.io/long-description: Prometheus datasource for Grafana template.openshift.io/provider-display-name: Nicolas Massé template.openshift.io/support-url: https://github.com/nmasse-itix/OpenShift-Docker-Images/issues name: grafana-prometheus objects: - kind: ConfigMap apiVersion: v1 metadata: name: grafana-dashboards namespace: ${NAMESPACE} labels: template: grafana-prometheus data: openshift-cluster.json: |- { "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "description": "Monitors Kubernetes cluster using Prometheus. Shows overall cluster CPU / Memory / Filesystem usage as well as individual pod, containers, systemd services statistics. Uses cAdvisor metrics only.", "editable": true, "gnetId": 315, "graphTooltip": 0, "hideControls": false, "id": 2, "links": [], "refresh": "10s", "rows": [ { "collapse": false, "height": "200px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "fill": 1, "grid": {}, "height": "200px", "id": 32, "legend": { "alignAsTable": false, "avg": true, "current": true, "max": false, "min": false, "rightSide": false, "show": false, "sideWidth": 200, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum (rate (container_network_receive_bytes_total{id=\"/\"}[5m]))", "format": "time_series", "interval": "10s", "intervalFactor": 1, "legendFormat": "Received", "metric": "network", "refId": "A", "step": 10 }, { "expr": "- sum (rate (container_network_transmit_bytes_total{id=\"/\"}[5m]))", "format": "time_series", "interval": "10s", "intervalFactor": 1, "legendFormat": "Sent", "metric": "network", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Network I/O pressure", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "transparent": false, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Network I/O pressure", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "prometheus", "editable": true, "error": false, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "height": "180px", "id": 4, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 } ], "thresholds": "65, 90", "title": "Cluster memory usage", "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "height": "180px", "id": 6, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\"}[5m])) / sum (machine_cpu_cores{}) * 100", "format": "time_series", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 } ], "thresholds": "65, 90", "title": "Cluster CPU usage (5m avg)", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "height": "180px", "id": 7, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum (container_fs_usage_bytes{device=~\"^/dev/([sv]d[a-z][1-9]|mapper/[-a-zA-Z0-9]+)$\",id=\"/\"}) / sum (container_fs_limit_bytes{device=~\"^/dev/([sv]d[a-z][1-9]|mapper/[-a-zA-Z0-9]+)$\",id=\"/\"}) * 100", "format": "time_series", "interval": "10s", "intervalFactor": 1, "legendFormat": "", "metric": "", "refId": "A", "step": 10 } ], "thresholds": "65, 90", "title": "Cluster filesystem usage", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "id": 9, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "20%", "prefix": "", "prefixFontSize": "20%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 } ], "thresholds": "", "title": "Used", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "id": 10, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 } ], "thresholds": "", "title": "Total", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "id": 11, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": " cores", "postfixFontSize": "30%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\"}[5m]))", "format": "time_series", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 } ], "thresholds": "", "title": "Used", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "id": 12, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": " cores", "postfixFontSize": "30%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 } ], "thresholds": "", "title": "Total", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "id": 13, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum (container_fs_usage_bytes{device=~\"^/dev/([sv]d[a-z][1-9]|mapper/[-a-zA-Z0-9]+)$\",id=\"/\"})", "format": "time_series", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 } ], "thresholds": "", "title": "Used", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "id": 14, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum (container_fs_limit_bytes{device=~\"^/dev/([sv]d[a-z][1-9]|mapper/[-a-zA-Z0-9]+)$\",id=\"/\"})", "format": "time_series", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 } ], "thresholds": "", "title": "Total", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Total usage", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 3, "editable": true, "error": false, "fill": 0, "grid": {}, "height": "", "id": 17, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\"}[5m])) by (pod_name)", "format": "time_series", "interval": "10s", "intervalFactor": 1, "legendFormat": "{{ pod_name }}", "metric": "container_cpu", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Pods CPU usage (1m avg)", "tooltip": { "msResolution": true, "shared": true, "sort": 2, "value_type": "cumulative" }, "transparent": false, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "cores", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Pods CPU usage", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 3, "editable": true, "error": false, "fill": 0, "grid": {}, "height": "", "id": 23, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum (rate (container_cpu_usage_seconds_total{id=~\"^/system.slice.*\"}[5m])) by (systemd_service_name)", "format": "time_series", "hide": false, "interval": "10s", "intervalFactor": 1, "legendFormat": "{{ systemd_service_name }}", "metric": "container_cpu", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "System services CPU usage (5m avg)", "tooltip": { "msResolution": true, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "cores", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "System services CPU usage", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 3, "editable": true, "error": false, "fill": 0, "grid": {}, "height": "", "id": 24, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": false, "hideZero": false, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",container_name!=\"POD\"}[5m])) by (container_name, pod_name)", "format": "time_series", "hide": false, "interval": "10s", "intervalFactor": 1, "legendFormat": "pod: {{ pod_name }} | {{ container_name }}", "metric": "container_cpu", "refId": "A", "step": 10 }, { "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",name!~\"^k8s_.*\"}[5m])) by (kubernetes_io_hostname, name, image)", "format": "time_series", "hide": false, "interval": "10s", "intervalFactor": 1, "legendFormat": "docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", "metric": "container_cpu", "refId": "B", "step": 10 }, { "expr": "sum (rate (container_cpu_usage_seconds_total{rkt_container_name!=\"\"}[5m])) by (kubernetes_io_hostname, rkt_container_name)", "format": "time_series", "interval": "10s", "intervalFactor": 1, "legendFormat": "rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", "metric": "container_cpu", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Containers CPU usage (5m avg)", "tooltip": { "msResolution": true, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "cores", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Containers CPU usage", "titleSize": "h6" }, { "collapse": true, "height": "500px", "panels": [ { "aliasColors": {}, "bars": false, "datasource": "prometheus", "decimals": 3, "editable": true, "error": false, "fill": 0, "grid": {}, "id": 20, "isNew": true, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 12, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum (rate (container_cpu_usage_seconds_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)", "hide": false, "interval": "10s", "intervalFactor": 1, "legendFormat": "{{ id }}", "metric": "container_cpu", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "All processes CPU usage (1m avg)", "tooltip": { "msResolution": true, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "show": true }, "yaxes": [ { "format": "none", "label": "cores", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "All processes CPU usage", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "fill": 0, "grid": {}, "id": 25, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": 200, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (pod_name)", "interval": "10s", "intervalFactor": 1, "legendFormat": "{{ pod_name }}", "metric": "container_memory_usage:sort_desc", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Pods memory usage", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Pods memory usage", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "fill": 0, "grid": {}, "id": 26, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": 200, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum (container_memory_working_set_bytes{id=~\"^/system.slice/.*\"}) by (id)", "format": "time_series", "interval": "10s", "intervalFactor": 1, "legendFormat": "{{ id }}", "metric": "container_memory_usage:sort_desc", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "System services memory usage", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "System services memory usage", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "fill": 0, "grid": {}, "id": 27, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": 200, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}) by (container_name, pod_name)", "interval": "10s", "intervalFactor": 1, "legendFormat": "pod: {{ pod_name }} | {{ container_name }}", "metric": "container_memory_usage:sort_desc", "refId": "A", "step": 10 }, { "expr": "sum (container_memory_working_set_bytes{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, name, image)", "interval": "10s", "intervalFactor": 1, "legendFormat": "docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", "metric": "container_memory_usage:sort_desc", "refId": "B", "step": 10 }, { "expr": "sum (container_memory_working_set_bytes{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, rkt_container_name)", "interval": "10s", "intervalFactor": 1, "legendFormat": "rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", "metric": "container_memory_usage:sort_desc", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Containers memory usage", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Containers memory usage", "titleSize": "h6" }, { "collapse": false, "height": "500px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "fill": 0, "grid": {}, "id": 28, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "sideWidth": 200, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum (container_memory_working_set_bytes{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) by (id)", "interval": "10s", "intervalFactor": 1, "legendFormat": "{{ id }}", "metric": "container_memory_usage:sort_desc", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "All processes memory usage", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "All processes memory usage", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "fill": 1, "grid": {}, "id": 16, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": 200, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\"}[5m])) by (pod_name)", "format": "time_series", "interval": "10s", "intervalFactor": 1, "legendFormat": "-> {{ pod_name }}", "metric": "network", "refId": "A", "step": 10 }, { "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\"}[5m])) by (pod_name)", "format": "time_series", "interval": "10s", "intervalFactor": 1, "legendFormat": "<- {{ pod_name }}", "metric": "network", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Pods network I/O (5m avg)", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Pods network I/O", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "fill": 1, "grid": {}, "id": 30, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": 200, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\"}[5m])) by (container_name, pod_name)", "format": "time_series", "hide": false, "interval": "10s", "intervalFactor": 1, "legendFormat": "-> pod: {{ pod_name }} | {{ container_name }}", "metric": "network", "refId": "B", "step": 10 }, { "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\"}[5m])) by (container_name, pod_name)", "format": "time_series", "hide": false, "interval": "10s", "intervalFactor": 1, "legendFormat": "<- pod: {{ pod_name }} | {{ container_name }}", "metric": "network", "refId": "D", "step": 10 }, { "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name!~\"^k8s_.*\"}[5m])) by (kubernetes_io_hostname, name, image)", "format": "time_series", "hide": false, "interval": "10s", "intervalFactor": 1, "legendFormat": "-> docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", "metric": "network", "refId": "A", "step": 10 }, { "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name!~\"^k8s_.*\"}[5m])) by (kubernetes_io_hostname, name, image)", "format": "time_series", "hide": false, "interval": "10s", "intervalFactor": 1, "legendFormat": "<- docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", "metric": "network", "refId": "C", "step": 10 }, { "expr": "sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\"}[5m])) by (kubernetes_io_hostname, rkt_container_name)", "format": "time_series", "hide": false, "interval": "10s", "intervalFactor": 1, "legendFormat": "-> rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", "metric": "network", "refId": "E", "step": 10 }, { "expr": "- sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\"}[5m])) by (kubernetes_io_hostname, rkt_container_name)", "format": "time_series", "hide": false, "interval": "10s", "intervalFactor": 1, "legendFormat": "<- rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", "metric": "network", "refId": "F", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Containers network I/O (5m avg)", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Containers network I/O", "titleSize": "h6" }, { "collapse": false, "height": "500px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 2, "editable": true, "error": false, "fill": 1, "grid": {}, "id": 29, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "sideWidth": 200, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum (rate (container_network_receive_bytes_total{id!=\"/\"}[5m])) by (id)", "format": "time_series", "interval": "10s", "intervalFactor": 1, "legendFormat": "-> {{ id }}", "metric": "network", "refId": "A", "step": 10 }, { "expr": "- sum (rate (container_network_transmit_bytes_total{id!=\"/\"}[5m])) by (id)", "format": "time_series", "interval": "10s", "intervalFactor": 1, "legendFormat": "<- {{ id }}", "metric": "network", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "All processes network I/O (1m avg)", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "All processes network I/O", "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [ "kubernetes" ], "templating": { "list": [ { "allValue": ".*", "current": { "text": "All", "value": "$__all" }, "datasource": "prometheus", "hide": 0, "includeAll": true, "label": null, "multi": false, "name": "Node", "options": [], "query": "label_values(kubernetes_io_hostname)", "refresh": 1, "regex": "", "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-5m", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Kubernetes cluster monitoring (via Prometheus)", "version": 2 } - kind: ConfigMap apiVersion: v1 metadata: name: grafana-datasources namespace: ${NAMESPACE} labels: template: grafana-prometheus data: prometheus.yaml: |- datasources: - name: prometheus type: prometheus access: proxy url: http://prometheus:9090 version: 1 editable: false - apiVersion: v1 kind: Secret metadata: name: prometheus-proxy namespace: "${NAMESPACE}" labels: template: grafana-prometheus stringData: session_secret: "${SESSION_SECRET}=" - apiVersion: v1 kind: Secret metadata: name: alerts-proxy namespace: "${NAMESPACE}" labels: template: grafana-prometheus stringData: session_secret: "${SESSION_SECRET}=" - apiVersion: v1 kind: ServiceAccount metadata: name: prometheus namespace: ${NAMESPACE} labels: template: grafana-prometheus annotations: serviceaccounts.openshift.io/oauth-redirectreference.prom: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}' serviceaccounts.openshift.io/oauth-redirectreference.alerts: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}' secrets: - apiVersion: v1 kind: ClusterRoleBinding metadata: name: prometheus-is-cluster-reader groupNames: - system:cluster-readers roleRef: name: cluster-reader subjects: - kind: ServiceAccount name: prometheus namespace: ${NAMESPACE} userNames: - system:serviceaccount:${NAMESPACE}:prometheus - apiVersion: v1 kind: Service metadata: labels: app: prometheus template: grafana-prometheus name: prometheus namespace: ${NAMESPACE} annotations: service.alpha.openshift.io/serving-cert-secret-name: prometheus-tls prometheus.io/scrape: 'true' prometheus.io/scheme: https spec: ports: - name: oauth-proxy port: 443 protocol: TCP targetPort: 8443 - name: prometheus port: 9090 protocol: TCP targetPort: 9090 selector: app: prometheus sessionAffinity: None type: ClusterIP - apiVersion: v1 kind: Service metadata: labels: app: prometheus template: grafana-prometheus name: alerts namespace: ${NAMESPACE} annotations: service.alpha.openshift.io/serving-cert-secret-name: prometheus-alerts-tls spec: ports: - name: oauth-proxy port: 443 protocol: TCP targetPort: 9443 selector: app: prometheus sessionAffinity: None type: ClusterIP - apiVersion: v1 kind: Route metadata: labels: app: prometheus template: grafana-prometheus name: prometheus namespace: ${NAMESPACE} spec: host: ${PROMETHEUS_ROUTE_HOSTNAME} port: targetPort: oauth-proxy to: kind: Service name: prometheus weight: 100 wildcardPolicy: None tls: termination: reencrypt - apiVersion: v1 kind: Route metadata: labels: app: prometheus template: grafana-prometheus name: alerts namespace: ${NAMESPACE} spec: host: ${ALERTS_ROUTE_HOSTNAME} port: targetPort: oauth-proxy to: kind: Service name: alerts weight: 100 wildcardPolicy: None tls: termination: reencrypt - apiVersion: apps/v1beta1 kind: StatefulSet metadata: name: prometheus namespace: ${NAMESPACE} labels: app: prometheus spec: updateStrategy: type: RollingUpdate podManagementPolicy: Parallel selector: provider: openshift matchLabels: app: prometheus template: metadata: name: prometheus labels: app: prometheus spec: serviceAccountName: prometheus containers: # Deploy Prometheus behind an oauth proxy - name: prom-proxy image: "${PROXY_IMAGE}" imagePullPolicy: IfNotPresent ports: - containerPort: 8443 name: web args: - -provider=openshift - -https-address=:8443 - -http-address= - -email-domain=* - -upstream=http://localhost:9090 - -client-id=system:serviceaccount:${NAMESPACE}:prometheus - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "${NAMESPACE}", "namespace": "${NAMESPACE}"}' - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "${NAMESPACE}", "namespace": "${NAMESPACE}"}}' - -tls-cert=/etc/tls/private/tls.crt - -tls-key=/etc/tls/private/tls.key - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-ca=/etc/pki/tls/cert.pem - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt - -skip-auth-regex=^/metrics volumeMounts: - mountPath: /etc/tls/private name: prometheus-tls - mountPath: /etc/proxy/secrets name: prometheus-secrets - mountPath: /prometheus name: prometheus-data - name: prometheus args: - --storage.tsdb.retention=6h - --storage.tsdb.min-block-duration=2m - --config.file=/etc/prometheus/prometheus.yml - --web.listen-address=0.0.0.0:9090 image: "${PROMETHEUS_IMAGE}" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /etc/prometheus name: prometheus-config - mountPath: /prometheus name: prometheus-data # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy - name: alerts-proxy image: "${PROXY_IMAGE}" imagePullPolicy: IfNotPresent ports: - containerPort: 9443 name: web args: - -provider=openshift - -https-address=:9443 - -http-address= - -email-domain=* - -upstream=http://localhost:9099 - -client-id=system:serviceaccount:${NAMESPACE}:prometheus - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "${NAMESPACE}", "namespace": "${NAMESPACE}"}' - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "${NAMESPACE}", "namespace": "${NAMESPACE}"}}' - -tls-cert=/etc/tls/private/tls.crt - -tls-key=/etc/tls/private/tls.key - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-ca=/etc/pki/tls/cert.pem - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt volumeMounts: - mountPath: /etc/tls/private name: alerts-tls - mountPath: /etc/proxy/secrets name: alerts-secrets - name: alert-buffer args: - --storage-path=/alert-buffer/messages.db image: "${ALERTBUFFER_IMAGE}" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /alert-buffer name: alert-buffer-data ports: - containerPort: 9099 name: alert-buf - name: alertmanager args: - -config.file=/etc/alertmanager/alertmanager.yml image: "${ALERTMANAGER_IMAGE}" imagePullPolicy: IfNotPresent ports: - containerPort: 9093 name: web volumeMounts: - mountPath: /etc/alertmanager name: alertmanager-config - mountPath: /alertmanager name: alertmanager-data restartPolicy: Always volumes: - name: prometheus-config configMap: defaultMode: 420 name: prometheus - name: prometheus-secrets secret: secretName: prometheus-proxy - name: prometheus-tls secret: secretName: prometheus-tls - name: prometheus-data persistentVolumeClaim: claimName: prometheus - name: alertmanager-config configMap: defaultMode: 420 name: prometheus-alerts - name: alerts-secrets secret: secretName: alerts-proxy - name: alerts-tls secret: secretName: prometheus-alerts-tls - name: alertmanager-data persistentVolumeClaim: claimName: prometheus-alertmanager - name: alert-buffer-data persistentVolumeClaim: claimName: prometheus-alertbuffer - kind: ConfigMap apiVersion: v1 metadata: name: prometheus namespace: ${NAMESPACE} labels: template: grafana-prometheus data: prometheus.rules: |- groups: - name: example-rules interval: 30s # defaults to global interval rules: prometheus.yml: |- global: scrape_interval: 30s evaluation_interval: 30s rule_files: - '*.rules' # A scrape configuration for running Prometheus on a Kubernetes cluster. # This uses separate scrape configs for cluster components (i.e. API server, node) # and services to allow each to use different authentication configs. # # Kubernetes labels will be added as Prometheus labels on metrics via the # `labelmap` relabeling action. # Scrape config for API servers. # # Kubernetes exposes API servers as endpoints to the default/kubernetes # service so this uses `endpoints` role and uses relabelling to only keep # the endpoints associated with the default/kubernetes service using the # default named port `https`. This works for single API server deployments as # well as HA API server deployments. scrape_configs: - job_name: 'kubernetes-apiservers' kubernetes_sd_configs: - role: endpoints scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token # Keep only the default/kubernetes service endpoints for the https port. This # will add targets for each API server which Kubernetes adds an endpoint to # the default/kubernetes service. relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: default;kubernetes;https # Scrape config for nodes. # # Each node exposes a /metrics endpoint that contains operational metrics for # the Kubelet and other components. - job_name: 'kubernetes-nodes' scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) # Scrape config for controllers. # # Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for # the controllers. # # TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via # endpoints. - job_name: 'kubernetes-controllers' scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - role: endpoints # Keep only the default/kubernetes service endpoints for the https port, and then # set the port to 8444. This is the default configuration for the controllers on OpenShift # masters. relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: default;kubernetes;https - source_labels: [__address__] action: replace target_label: __address__ regex: (.+)(?::\d+) replacement: $1:8444 # Scrape config for cAdvisor. # # Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that # reports container metrics for each running pod. Scrape those by default. - job_name: 'kubernetes-cadvisor' scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token metrics_path: /metrics/cadvisor kubernetes_sd_configs: - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) # Scrape config for service endpoints. # # The relabeling allows the actual service scrape endpoint to be configured # via the following annotations: # # * `prometheus.io/scrape`: Only scrape services that have a value of `true` # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need # to set this to `https` & most likely set the `tls_config` of the scrape config. # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. # * `prometheus.io/port`: If the metrics are exposed on a different port to the # service then set this appropriately. - job_name: 'kubernetes-service-endpoints' tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt # TODO: this should be per target insecure_skip_verify: true kubernetes_sd_configs: - role: endpoints relabel_configs: - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] action: keep regex: true - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] action: replace target_label: __scheme__ regex: (https?) - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] action: replace target_label: __address__ regex: (.+)(?::\d+);(\d+) replacement: $1:$2 - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username] action: replace target_label: __basic_auth_username__ regex: (.+) - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password] action: replace target_label: __basic_auth_password__ regex: (.+) - action: labelmap regex: __meta_kubernetes_service_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_service_name] action: replace target_label: kubernetes_name alerting: alertmanagers: - scheme: http static_configs: - targets: - "localhost:9093" - kind: ConfigMap apiVersion: v1 metadata: name: prometheus-alerts namespace: ${NAMESPACE} labels: template: grafana-prometheus data: alertmanager.yml: |- global: # The root route on which each incoming alert enters. route: # default route if none match receiver: alert-buffer-wh # The labels by which incoming alerts are grouped together. For example, # multiple alerts coming in for cluster=A and alertname=LatencyHigh would # be batched into a single group. # TODO: group_by: [] # All the above attributes are inherited by all child routes and can # overwritten on each. receivers: - name: alert-buffer-wh webhook_configs: - url: http://localhost:9099/topics/alerts ## ## Template Parameters ## parameters: - description: The Docker image to use for the OAuth Proxy. displayName: OAuth Proxy image name: PROXY_IMAGE value: openshift3/oauth-proxy:v3.7 - description: The Docker image to use for Prometheus. displayName: Prometheus image name: PROMETHEUS_IMAGE value: openshift3/prometheus:v3.7 - description: The Docker image to use for the Alert Buffer. displayName: Alert Buffer image name: ALERTBUFFER_IMAGE value: openshift3/prometheus-alert-buffer:v3.7 - description: The Docker image to use for the Alert Manager. displayName: Alert Manager image name: ALERTMANAGER_IMAGE value: openshift3/prometheus-alertmanager:v3.7 - description: The desired hostname of the route to the Prometheus service. displayName: Hostname of the Prometheus Service name: PROMETHEUS_ROUTE_HOSTNAME - description: The desired hostname of the route to the Prometheus Alert service. displayName: Hostname of the Prometheus Alert Service name: ALERTS_ROUTE_HOSTNAME - description: The session secret for the proxy name: SESSION_SECRET generate: expression from: "[a-zA-Z0-9]{43}" - description: The namespace to instantiate this template under. Defaults to 'openshift-metrics'. name: NAMESPACE value: openshift-metrics