Пример настроенных уведомлений для Grafana

Пример правил уведомлений для Grafana Alerting

# Version of the provisioning file format.
apiVersion: 1
# List of alert rule groups; each group carries its own `interval` and `rules`.
groups:
    # Rule group for the virtual servers (name: "Мониторинг виртуальных серверов").
    - orgId: 1
      name: Мониторинг виртуальных серверов
      folder: Уведомления
      # Interval configured for this group (one minute).
      interval: 1m
      rules:
        # Alert "LXC использование ОЗУ": memory usage of LXC guests in percent.
        # Evaluation chain: A (Flux query) -> B (reduce: last) -> C (threshold).
        - uid: ddyilw68adaf4f
          title: LXC использование ОЗУ
          condition: C
          for: 1m
          noDataState: OK
          execErrState: Error
          isPaused: false
          dashboardUid: IfgdXjtnk
          panelId: 29
          labels:
            LXC: ОЗУ
          annotations:
            __dashboardUid__: IfgdXjtnk
            __panelId__: '29'
            description: 'На виртуальном сервере "{{ $labels.host }}" заканчивается ОЗУ, использовано {{ humanize $values.B.Value }}%.'
          data:
            # A: joins the "mem" and "maxmem" series on time and host and
            # emits mem / maxmem * 100, grouped by host.
            - refId: A
              datasourceUid: fdi0ylj8epn9ca
              relativeTimeRange: {from: 3600, to: 0}
              model:
                datasource: {type: influxdb, uid: fdi0ylj8epn9ca}
                intervalMs: 1000
                maxDataPoints: 43200
                query: |
                  mem = from(bucket: "proxmox")
                    |> range(start: v.timeRangeStart, stop: v.timeRangeStop)
                    |> filter(fn: (r) => r["_measurement"] == "system")
                    |> filter(fn: (r) => r["_field"] == "mem")
                    |> filter(fn: (r) => r["object"] == "lxc")
                    |> filter(fn: (r) => r["nodename"] == "proxmox")
                    |> filter(fn: (r) => r["_value"] > 0)
                    |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)

                  maxmem = from(bucket: "proxmox")
                    |> range(start: v.timeRangeStart, stop: v.timeRangeStop)
                    |> filter(fn: (r) => r["_measurement"] == "system")
                    |> filter(fn: (r) => r["_field"] == "maxmem")
                    |> filter(fn: (r) => r["object"] == "lxc")
                    |> filter(fn: (r) => r["_value"] > 0)
                    |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)

                  join(
                      tables: {mem:mem, maxmem:maxmem},
                      on: ["_time","_stop","_start","host"]

                  )

                  |> map(fn: (r) => ({
                        _time: r._time,
                        _value: float(v:r._value_mem) / float(v:r._value_maxmem) * float(v:100),
                        host: r.host
                      })
                    )

                  |> group(columns: ["host"])
                refId: A
            # B: reduces each series of A with the "last" reducer.
            - refId: B
              datasourceUid: __expr__
              relativeTimeRange: {from: 3600, to: 0}
              model:
                refId: B
                type: reduce
                expression: A
                reducer: last
                datasource: {name: Expression, type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                # NOTE(review): looks like a leftover of the classic-condition
                # editor; presumably unused by a reduce expression - confirm.
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: [0, 0]
                    operator: {type: and}
                    query:
                      params: []
                    reducer:
                      type: avg
                      params: []
            # C: threshold on B; evaluator is "gt" with first parameter 80.
            - refId: C
              datasourceUid: __expr__
              relativeTimeRange: {from: 3600, to: 0}
              model:
                refId: C
                type: threshold
                expression: B
                datasource: {name: Expression, type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: [80, 0]
                    operator: {type: and}
                    query:
                      params: []
                    reducer:
                      type: avg
                      params: []
        # Alert "Количество запущенных контейнеров": number of running containers.
        # Evaluation chain: A (PromQL) -> B (reduce: last) -> C (threshold).
        - uid: bdylwouk68x6oe
          title: Количество запущенных контейнеров
          condition: C
          for: 0s
          noDataState: OK
          execErrState: Error
          isPaused: false
          dashboardUid: edx4j6dcg38xsc
          panelId: 9
          labels:
            Docker: Количество
          annotations:
            __dashboardUid__: edx4j6dcg38xsc
            __panelId__: '9'
            description: 'Количество запущенных docker на сервере "{{ reReplaceAll "(.*).local:8090" "$1" $labels.instance }}" контейнеров уменьшилось.'
          data:
            # A: per instance, the count of container series seen within 1m
            # minus the count seen within 5m.
            - refId: A
              datasourceUid: PBFA97CFB590B2093
              relativeTimeRange: {from: 900, to: 0}
              model:
                refId: A
                datasource: {type: prometheus, uid: PBFA97CFB590B2093}
                editorMode: code
                expr: 'count(count_over_time(container_memory_usage_bytes{image!=""}[1m])) by (instance) - count(count_over_time(container_memory_usage_bytes{image!=""}[5m])) by (instance)'
                legendFormat: containers
                instant: false
                range: true
                interval: ''
                intervalFactor: 2
                intervalMs: 15000
                maxDataPoints: 43200
                step: 2
            # B: reduces each series of A with the "last" reducer.
            - refId: B
              datasourceUid: __expr__
              relativeTimeRange: {from: 900, to: 0}
              model:
                refId: B
                type: reduce
                expression: A
                reducer: last
                datasource: {type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                # NOTE(review): looks like a leftover of the classic-condition
                # editor; presumably unused by a reduce expression - confirm.
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: []
                    operator: {type: and}
                    query:
                      params: [B]
                    reducer:
                      type: last
                      params: []
            # C: threshold on B; evaluator is "lt" with first parameter 0.
            - refId: C
              datasourceUid: __expr__
              model:
                refId: C
                type: threshold
                expression: B
                datasource: {name: Expression, type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                conditions:
                  - type: query
                    evaluator:
                      type: lt
                      params: [0, 0]
                    operator: {type: and}
                    query:
                      params: []
                    reducer:
                      type: avg
                      params: []
        # Alert "Использование дискового пространства": filesystem usage in percent.
        # Evaluation chain: A (PromQL) -> B (reduce: last) -> C (threshold).
        - uid: ddykla6xrp0xsd
          title: Использование дискового пространства
          condition: C
          data:
            # A: (size - free) * 100 / (avail + (size - free)) for ext*/xfs
            # filesystems, excluding the mount points listed in the matcher.
            - refId: A
              relativeTimeRange:
                from: 300
                to: 0
              datasourceUid: PBFA97CFB590B2093
              model:
                datasource:
                    type: prometheus
                    uid: PBFA97CFB590B2093
                editorMode: code
                expr: "(\r\n  node_filesystem_size_bytes{fstype=~\"ext.*|xfs\", mountpoint !~\".*pod.*|/var/log.*|/var/log.hdd.*\"} \r\n  - node_filesystem_free_bytes{fstype=~\"ext.*|xfs\", mountpoint !~\".*pod.*|/var/log.*|/var/log.hdd.*\"}\r\n) * 100 / (\r\n  node_filesystem_avail_bytes{fstype=~\"ext.*|xfs\", mountpoint !~\".*pod.*|/var/log.*|/var/log.hdd.*\"} \r\n  + (\r\n    node_filesystem_size_bytes{fstype=~\"ext.*|xfs\", mountpoint !~\".*pod.*|/var/log.*|/var/log.hdd.*\"} \r\n    - node_filesystem_free_bytes{fstype=~\"ext.*|xfs\", mountpoint !~\".*pod.*|/var/log.*|/var/log.hdd.*\"}\r\n  )\r\n)"
                format: time_series
                instant: false
                interval: ""
                intervalFactor: 1
                intervalMs: 15000
                legendFormat: '{{mountpoint}}'
                maxDataPoints: 43200
                range: false
                refId: A
            # B: reduces each series of A with the "last" reducer.
            - refId: B
              relativeTimeRange:
                from: 300
                to: 0
              datasourceUid: __expr__
              model:
                # NOTE(review): looks like a leftover of the classic-condition
                # editor; presumably unused by a reduce expression - confirm.
                conditions:
                    - evaluator:
                        params: []
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                reducer: last
                refId: B
                type: reduce
            # C: threshold on B; evaluator is "gt" with parameter 80.
            - refId: C
              relativeTimeRange:
                from: 300
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 80
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - D
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: B
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          noDataState: NoData
          execErrState: Error
          for: 1m
          annotations:
            # `match` runs a regular-expression test on the mount point: only
            # paths starting with /mnt/ are reported as a named disk, every
            # other mount point is reported as the main disk.
            description: |-
                На сервере "{{ reReplaceAll "(.*):9100" "$1" $labels.instance }}" заканчивается место {{ if (match "^/mnt/.+" $labels.mountpoint) -}} на диске "{{ reReplaceAll "/mnt/(.*)" "$1" $labels.mountpoint }}"
                {{ else -}} на основном диске {{- end }} занято {{ humanize $values.B.Value }}%.
            Занято: занято {{ humanize $values.B.Value }}%.
          labels:
            LXC: Диск
          isPaused: false
        # Alert "Использование ОЗУ docker контейнерами": memory per container.
        # Evaluation chain: A (PromQL) -> B (reduce: last) -> C (threshold).
        - uid: fdylyfjaoacxse
          title: Использование ОЗУ docker контейнерами
          condition: C
          for: 1m
          noDataState: NoData
          execErrState: Error
          isPaused: false
          dashboardUid: edx4j6dcg38xsc
          panelId: 11
          labels:
            Docker: ОЗУ
          annotations:
            __dashboardUid__: edx4j6dcg38xsc
            __panelId__: '11'
            description: 'Контейнер "{{ $labels.name}}" на сервере "{{ reReplaceAll "(.*).local:8090" "$1" $labels.instance }}" использует много ОЗУ, занято {{ humanize $values.B.Value }}Mb.'
          data:
            # A: (memory usage - memory cache) per (name, instance), divided
            # by 1024 * 1024 and rounded with round(..., 2).
            - refId: A
              datasourceUid: PBFA97CFB590B2093
              relativeTimeRange: {from: 900, to: 0}
              model:
                refId: A
                datasource: {type: prometheus, uid: PBFA97CFB590B2093}
                editorMode: code
                expr: 'round((sum by (name, instance)(container_memory_usage_bytes{image!=""}) -  sum by (name, instance) (container_memory_cache{image!=""})) / (1024 * 1024), 2) '
                legendFormat: '{{ name }}'
                metric: container_memory_usage
                instant: false
                range: true
                interval: ''
                intervalFactor: 1
                intervalMs: 15000
                maxDataPoints: 43200
                step: 1
            # B: reduces each series of A with the "last" reducer.
            - refId: B
              datasourceUid: __expr__
              relativeTimeRange: {from: 900, to: 0}
              model:
                refId: B
                type: reduce
                expression: A
                reducer: last
                datasource: {type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                # NOTE(review): looks like a leftover of the classic-condition
                # editor; presumably unused by a reduce expression - confirm.
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: []
                    operator: {type: and}
                    query:
                      params: [B]
                    reducer:
                      type: last
                      params: []
            # C: threshold on B; evaluator is "gt" with parameter 1024.
            - refId: C
              datasourceUid: __expr__
              relativeTimeRange: {from: 900, to: 0}
              model:
                refId: C
                type: threshold
                expression: B
                datasource: {type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: [1024]
                    operator: {type: and}
                    query:
                      params: [C]
                    reducer:
                      type: last
                      params: []
    # Rule group for the physical servers (name: "Мониторинг физических серверов").
    - orgId: 1
      name: Мониторинг физических серверов
      folder: Уведомления
      # Interval configured for this group (five minutes).
      interval: 5m
      rules:
        # Alert "Proxmox использование ОЗУ": used memory of the Proxmox host.
        # Evaluation chain: A (Flux) -> B (reduce: last) -> C (threshold).
        # D (Flux) -> E (reduce: last) is referenced only by the description.
        - uid: adyihsmeykef4b
          title: Proxmox использование ОЗУ
          condition: C
          for: 5m
          noDataState: NoData
          execErrState: Error
          isPaused: false
          dashboardUid: IfgdXjtnk
          panelId: 24
          labels:
            Proxmox: ОЗУ
          annotations:
            __dashboardUid__: IfgdXjtnk
            __panelId__: '24'
            description: 'Заканчивается оперативная память на хосте Proxmox, использовано {{ humanize $values.B.Value }} из {{ humanize $values.E.Value }} Гб.'
          data:
            # A: "memused" of host "proxmox", divided by 1073741824.0.
            - refId: A
              datasourceUid: fdi0ylj8epn9ca
              relativeTimeRange: {from: 300, to: 0}
              model:
                datasource: {type: influxdb, uid: fdi0ylj8epn9ca}
                intervalMs: 1000
                maxDataPoints: 43200
                query: |-
                  from(bucket: "proxmox")
                    |> range(start: v.timeRangeStart, stop:v.timeRangeStop)
                    |> filter(fn: (r) =>
                      r._measurement == "memory" and
                      r._field == "memused"
                    )
                    |> filter(fn: (r) => r["host"] == "proxmox")
                    |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)
                    |> map(fn: (r) => ({
                          r with
                          _value: float(v: r._value) / 1073741824.0
                        })
                      )
                    |> last()
                refId: A
            # D: "memtotal" of host "proxmox" (first row), divided by
            # 1073741824.0.
            - refId: D
              datasourceUid: fdi0ylj8epn9ca
              relativeTimeRange: {from: 600, to: 0}
              model:
                refId: D
                datasource: {type: influxdb, uid: fdi0ylj8epn9ca}
                intervalMs: 1000
                maxDataPoints: 43200
                query: "from(bucket: \"proxmox\")\r\n  |> range(start: v.timeRangeStart, stop:v.timeRangeStop)\r\n  |> filter(fn: (r) =>\r\n    r._measurement == \"memory\" and\r\n    r._field == \"memtotal\"\r\n  )\r\n  |> filter(fn: (r) => r[\"host\"] == \"proxmox\")\r\n  |> limit(n:1)\r\n  |> map(fn: (r) => ({\r\n      r with\r\n      _value: float(v: r._value) / 1073741824.0\r\n    })\r\n  )"
            # B: reduces each series of A with the "last" reducer.
            - refId: B
              datasourceUid: __expr__
              relativeTimeRange: {from: 300, to: 0}
              model:
                refId: B
                type: reduce
                expression: A
                reducer: last
                datasource: {type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                # NOTE(review): looks like a leftover of the classic-condition
                # editor; presumably unused by a reduce expression - confirm.
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: []
                    operator: {type: and}
                    query:
                      params: [B]
                    reducer:
                      type: last
                      params: []
            # C: threshold on B; evaluator is "gt" with parameter 14.
            - refId: C
              datasourceUid: __expr__
              relativeTimeRange: {from: 300, to: 0}
              model:
                refId: C
                type: threshold
                expression: B
                datasource: {type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: [14]
                    operator: {type: and}
                    query:
                      params: [C]
                    reducer:
                      type: last
                      params: []
            # E: reduces each series of D with the "last" reducer.
            - refId: E
              datasourceUid: __expr__
              model:
                refId: E
                type: reduce
                expression: D
                reducer: last
                datasource: {name: Expression, type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                # NOTE(review): same classic-condition leftover as in B - confirm.
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: [0, 0]
                    operator: {type: and}
                    query:
                      params: []
                    reducer:
                      type: avg
                      params: []
        # Alert "Proxmox использование дисков": storage usage in percent per host.
        # Evaluation chain: A (Flux query) -> B (reduce: last) -> C (threshold).
        - uid: ddyituh1mrrwgd
          title: Proxmox использование дисков
          condition: C
          for: 5m
          noDataState: NoData
          execErrState: Error
          isPaused: false
          dashboardUid: IfgdXjtnk
          panelId: 14
          labels:
            Proxmox: Диск
          annotations:
            __dashboardUid__: IfgdXjtnk
            __panelId__: '14'
            description: 'На сервере Proxmox заполнен пул памяти "{{ $labels.host }}" занято {{ humanize $values.B.Value }}%'
          data:
            # A: joins the "used" and "avail" series on time and host and
            # emits used / (used + avail) * 100, grouped by host.
            - refId: A
              datasourceUid: fdi0ylj8epn9ca
              relativeTimeRange: {from: 300, to: 0}
              model:
                datasource: {type: influxdb, uid: fdi0ylj8epn9ca}
                intervalMs: 1000
                maxDataPoints: 43200
                query: |
                  used = from(bucket: "proxmox")
                    |> range(start: v.timeRangeStart, stop: v.timeRangeStop)
                    |> filter(fn: (r) => r["_measurement"] == "system")
                    |> filter(fn: (r) => r["_field"] == "used")
                    |> filter(fn: (r) => r["nodename"] == "proxmox")
                    |> filter(fn: (r) => r["_value"] > 0)
                    |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)

                  avail = from(bucket: "proxmox")
                    |> range(start: v.timeRangeStart, stop: v.timeRangeStop)
                    |> filter(fn: (r) => r["_measurement"] == "system")
                    |> filter(fn: (r) => r["_field"] == "avail")
                    |> filter(fn: (r) => r["nodename"] == "proxmox")
                    |> filter(fn: (r) => r["_value"] > 0)
                    |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)

                  join(
                      tables: {used: used, avail: avail},
                      on: ["_time","_stop","_start","host"]
                  )
                  |> map(fn: (r) => ({
                        _time: r._time,
                        _value: float(v: r._value_used) / (float(v: r._value_used) + float(v: r._value_avail)) * 100.0,
                        host: r.host
                      })
                    )
                  |> group(columns: ["host"])
                  |> keep(columns: ["_time", "host", "_value"])
                refId: A
            # B: reduces each series of A with the "last" reducer.
            - refId: B
              datasourceUid: __expr__
              relativeTimeRange: {from: 300, to: 0}
              model:
                refId: B
                type: reduce
                expression: A
                reducer: last
                datasource: {name: Expression, type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                # NOTE(review): looks like a leftover of the classic-condition
                # editor; presumably unused by a reduce expression - confirm.
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: [0, 0]
                    operator: {type: and}
                    query:
                      params: []
                    reducer:
                      type: avg
                      params: []
            # C: threshold on B; evaluator is "gt" with first parameter 80.
            - refId: C
              datasourceUid: __expr__
              relativeTimeRange: {from: 300, to: 0}
              model:
                refId: C
                type: threshold
                expression: B
                datasource: {name: Expression, type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: [80, 0]
                    operator: {type: and}
                    query:
                      params: []
                    reducer:
                      type: avg
                      params: []
        # Alert "Proxmox нагрузка процессора": CPU load of the Proxmox host.
        # Evaluation chain: A (Flux query) -> D (reduce: last) -> E (threshold).
        - uid: edykj0nbfmbr4f
          title: Proxmox нагрузка процессора
          condition: E
          for: 5m
          noDataState: NoData
          execErrState: Error
          isPaused: false
          dashboardUid: IfgdXjtnk
          panelId: 21
          labels:
            Proxmox: CPU
          annotations:
            __dashboardUid__: IfgdXjtnk
            __panelId__: '21'
            description: 'На сервере Proxmox высокая нагрузка на процессор.'
          data:
            # A: "cpu" field of measurement "cpustat" for hosts matching
            # /proxmox/, multiplied by 100.
            - refId: A
              datasourceUid: fdi0ylj8epn9ca
              relativeTimeRange: {from: 300, to: 0}
              model:
                datasource: {type: influxdb, uid: fdi0ylj8epn9ca}
                intervalMs: 1000
                maxDataPoints: 43200
                query: |
                  from(bucket: "proxmox")
                    |> range(start: v.timeRangeStart, stop:v.timeRangeStop)
                    |> filter(fn: (r) =>
                      r._measurement == "cpustat" and
                      r._field == "cpu"
                    )
                    |> filter(fn: (r) => r["host"] =~ /proxmox/)
                    |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)
                    |> map(fn: (r) => ({
                          r with
                          _time: r._time,
                          _value: float(v: r._value) * float(v: 100)
                        })
                      )
                refId: A
            # D: reduces each series of A with the "last" reducer.
            - refId: D
              datasourceUid: __expr__
              relativeTimeRange: {from: 300, to: 0}
              model:
                refId: D
                type: reduce
                expression: A
                reducer: last
                datasource: {type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                # NOTE(review): looks like a leftover of the classic-condition
                # editor; presumably unused by a reduce expression - confirm.
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: []
                    operator: {type: and}
                    query:
                      params: [D]
                    reducer:
                      type: last
                      params: []
            # E: threshold on D; evaluator is "gt" with parameter 80.
            - refId: E
              datasourceUid: __expr__
              relativeTimeRange: {from: 300, to: 0}
              model:
                refId: E
                type: threshold
                expression: D
                datasource: {type: __expr__, uid: __expr__}
                intervalMs: 1000
                maxDataPoints: 43200
                conditions:
                  - type: query
                    evaluator:
                      type: gt
                      params: [80]
                    operator: {type: and}
                    query:
                      params: [E]
                    reducer:
                      type: last
                      params: []
        # Alert "Контроль температуры": hardware temperature sensors.
        # The instant PromQL query A is itself the alert condition.
        - uid: bdyinc12k5gqof
          title: Контроль температуры
          condition: A
          data:
            # A: per (instance, chip), returns the maximum temperature when it
            # is above 30 and above 1.1 times the average of the last hour.
            - refId: A
              relativeTimeRange:
                from: 300
                to: 0
              datasourceUid: PBFA97CFB590B2093
              model:
                datasource:
                    type: prometheus
                    uid: PBFA97CFB590B2093
                editorMode: code
                exemplar: false
                expr: (max by (instance, chip)(node_hwmon_temp_celsius{job=~"Servers"}) > 30) > avg by (instance, chip)(avg_over_time(node_hwmon_temp_celsius{job=~"Servers"}[1h])) * 1.1
                instant: true
                interval: ""
                intervalMs: 15000
                legendFormat: '{{chip}}'
                maxDataPoints: 43200
                range: false
                refId: A
          dashboardUid: xfpJB9FGz
          panelId: 198
          noDataState: OK
          execErrState: Error
          for: 5m
          annotations:
            __dashboardUid__: xfpJB9FGz
            __panelId__: "198"
            # The sensor kind is chosen with `match`, a regular-expression
            # test on the chip label; a chip matching none of the three
            # patterns gets no device name in the message.
            description: |-
                Перегрев сервера "{{ reReplaceAll "(.*):9100" "$1" $labels.instance }}", текущая температура {{ if (match "thermal_thermal_zone(.*)" $labels.chip) -}}
                процессора{{ else if (match "target1:0(.*)" $labels.chip) -}}
                HDD{{ else if (match "target0:0(.*)" $labels.chip) -}}
                SSD{{- end }} {{ humanize $values.A.Value }}°C.
          labels:
            Servers: Перегрев
          isPaused: false