ansible-collection-prometheus/roles/prometheus/defaults/main.yml

---
# Default Prometheus version. Quoted so the value is always read as a string,
# even if it is later shortened to something number-like.
prometheus_version: '2.27.0'
# NOTE(review): presumably a local directory holding pre-fetched binaries;
# empty by default - confirm against the role tasks.
prometheus_binary_local_dir: ''
# NOTE(review): presumably disables the installation step when true - confirm.
prometheus_skip_install: false

# Configuration and database directories.
prometheus_config_dir: '/etc/prometheus'
prometheus_db_dir: '/var/lib/prometheus'
# No read-only directories are configured by default.
prometheus_read_only_dirs: []

# Default listen address: port 9090 on every IPv4 interface.
prometheus_web_listen_address: '0.0.0.0:9090'
# External URL; empty by default.
prometheus_web_external_url: ''
# Web configuration; all three sections are empty by default. Format reference:
# https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md
prometheus_web_config:
  tls_server_config: {}
  http_server_config: {}
  basic_auth_users: {}

# Default retention period.
prometheus_storage_retention: '30d'
# Available since Prometheus 2.7.0.
# [EXPERIMENTAL] Maximum number of bytes that can be stored for blocks.
# Supported units: KB, MB, GB, TB, PB.
prometheus_storage_retention_size: '0'

# Extra config flags; none by default. Example:
prometheus_config_flags_extra: {}
# prometheus_config_flags_extra:
#   storage.tsdb.retention: 15d
#   alertmanager.timeout: 10s

# Alertmanager configuration entries; empty list by default.
# Example of a populated value:
prometheus_alertmanager_config: []
# prometheus_alertmanager_config:
#   - scheme: https
#     path_prefix: alertmanager/
#     basic_auth:
#       username: user
#       password: pass
#     static_configs:
#       - targets: ["127.0.0.1:9093"]
#     proxy_url: "127.0.0.2"

# Alert relabel configuration entries; empty list by default.
# Example of a populated value:
prometheus_alert_relabel_configs: []
# prometheus_alert_relabel_configs:
#   - action: labeldrop
#     regex: replica

# Global interval/timeout defaults.
# NOTE(review): presumably rendered into the `global` section of the generated
# Prometheus configuration - confirm against the config template.
prometheus_global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s

# Remote write entries; empty list by default.
# Example of a populated value:
prometheus_remote_write: []
# prometheus_remote_write:
#   - url: https://dev.kausal.co/prom/push
#     basic_auth:
#       password: FOO

# Remote read entries; empty list by default.
# Example of a populated value:
prometheus_remote_read: []
# prometheus_remote_read:
#   - url: https://demo.cloudalchemy.org:9201/read
#     basic_auth:
#       password: FOO

# External labels. `environment` resolves to the first defined of
# ansible_fqdn, ansible_host and inventory_hostname.
prometheus_external_labels:
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"

# Targets keyed by name; empty mapping by default.
# Example of a populated value (entries go under `prometheus_targets`):
prometheus_targets: {}
#  node:
#    - targets:
#        - localhost:9100
#      labels:
#        env: test

# Default scrape jobs.
# NOTE(review): prometheus_metrics_path is not defined in the lines visible
# here; it has to be provided elsewhere in the role or by the caller.
prometheus_scrape_configs:
  # Scrape job named "prometheus" with a single static target: the first
  # defined of ansible_fqdn / ansible_host / 'localhost'.
  # NOTE(review): port 9090 is hard-coded here, while
  # prometheus_web_listen_address is configurable - confirm they stay in sync.
  - job_name: "prometheus"
    metrics_path: "{{ prometheus_metrics_path }}"
    static_configs:
      - targets:
          - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090"
  # Scrape job named "node"; targets are read from file_sd/node.yml under
  # prometheus_config_dir.
  - job_name: "node"
    file_sd_configs:
      - files:
          - "{{ prometheus_config_dir }}/file_sd/node.yml"

# Name of the configuration template; an alternative name is looked up in the
# ansible templates path.
prometheus_config_file: 'prometheus.yml.j2'

# Glob patterns for alert rule files. Quoted so the `*` is always plain text.
prometheus_alert_rules_files:
  - 'prometheus/rules/*.rules'

# Glob patterns for static target files (YAML and JSON).
prometheus_static_targets_files:
  - 'prometheus/targets/*.yml'
  - 'prometheus/targets/*.json'

# Default alert rules. Annotation texts that contain {{ ... }} placeholders are
# wrapped in {% raw %} ... {% endraw %} so Jinja leaves them untouched.
prometheus_alert_rules:
  # Always-firing alert (`vector(1)`), held for 10 minutes.
  - alert: Watchdog
    expr: vector(1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: |-
        This is an alert meant to ensure that the entire alerting pipeline is functional.
        This alert is always firing, therefore it should always be firing in Alertmanager
        and always fire against a receiver. There are integrations with various notification
        mechanisms that send a notification when this alert is not firing. For example the
        "DeadMansSnitch" integration in PagerDuty.
      summary: 'Ensure entire alerting pipeline is functional'
  # Fires once `up == 0` has held for 5 minutes.
  - alert: InstanceDown
    expr: 'up == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'
  # Fires as soon as node_reboot_required is above 0 (no `for` delay).
  - alert: RebootRequired
    expr: 'node_reboot_required > 0'
    labels:
      severity: warning
    annotations:
      description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}'
  # Warning: under 40% space available, predicted (from the last 6h) to reach
  # zero within 24 hours, filesystem not read-only; held for 1 hour.
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}'
      summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  # Critical: under 20% space available, predicted (from the last 6h) to reach
  # zero within 4 hours, filesystem not read-only; held for 1 hour.
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}'
      summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  # Warning: under 5% space available on a filesystem that is not read-only;
  # held for 1 hour.
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
      summary: 'Filesystem has less than 5% space left.'
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  # Critical: under 3% space available on a filesystem that is not read-only;
  # held for 1 hour.
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
      summary: 'Filesystem has less than 3% space left.'
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  # Warning: under 40% inodes free, predicted (from the last 6h) to reach zero
  # within 24 hours, filesystem not read-only; held for 1 hour.
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}'
      summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  # Critical: under 20% inodes free, predicted (from the last 6h) to reach zero
  # within 4 hours, filesystem not read-only; held for 1 hour.
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}'
      summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  # Warning: under 5% inodes free on a filesystem that is not read-only;
  # held for 1 hour.
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 5% inodes left.'
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  # Critical: under 3% inodes free on a filesystem that is not read-only;
  # held for 1 hour.
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 3% inodes left.'
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  # More than 10 receive errors over a 2-minute window, held for 1 hour.
  - alert: NodeNetworkReceiveErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many receive errors.'
    expr: |
      increase(node_network_receive_errs_total[2m]) > 10
    for: 1h
    labels:
      severity: warning
  # More than 10 transmit errors over a 2-minute window, held for 1 hour.
  - alert: NodeNetworkTransmitErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many transmit errors.'
    expr: |
      increase(node_network_transmit_errs_total[2m]) > 10
    for: 1h
    labels:
      severity: warning
  # Conntrack entries above 75% of the limit; fires immediately (no `for`).
  - alert: NodeHighNumberConntrackEntriesUsed
    annotations:
      description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}'
      summary: 'Number of conntrack are getting close to the limit'
    expr: |
      (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
    labels:
      severity: warning
  # Fires when the clock offset is beyond 0.05s in either direction and is not
  # shrinking (its 5m derivative has the same sign as the offset, or is zero),
  # held for 10 minutes.
  - alert: NodeClockSkewDetected
    annotations:
      # The threshold quoted in the text matches the 0.05s used in `expr`;
      # it previously claimed 300s, which the expression never checked.
      message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.{% endraw %}'
      summary: 'Clock skew detected.'
    expr: |
      (
        node_timex_offset_seconds > 0.05
      and
        deriv(node_timex_offset_seconds[5m]) >= 0
      )
      or
      (
        node_timex_offset_seconds < -0.05
      and
        deriv(node_timex_offset_seconds[5m]) <= 0
      )
    for: 10m
    labels:
      severity: warning
  # Fires when node_timex_sync_status had a minimum of 0 over the last
  # 5 minutes, held for 10 minutes.
  - alert: NodeClockNotSynchronising
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}'
      summary: 'Clock not synchronising.'
    expr: |
      min_over_time(node_timex_sync_status[5m]) == 0
    for: 10m
    labels:
      severity: warning
initial migration of roles from cloudalchemy Signed-off-by: Paweł Krupa (paulfantom) <pawel@krupa.net.pl> 2022-09-22 14:38:42 +00:00			`---`
			`prometheus_version: 2.27.0`
			`prometheus_binary_local_dir: ''`
			`prometheus_skip_install: false`

			`prometheus_config_dir: /etc/prometheus`
			`prometheus_db_dir: /var/lib/prometheus`
			`prometheus_read_only_dirs: []`

			`prometheus_web_listen_address: "0.0.0.0:9090"`
			`prometheus_web_external_url: ''`
			`# See https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md`
			`prometheus_web_config:`
			`tls_server_config: {}`
			`http_server_config: {}`
			`basic_auth_users: {}`

			`prometheus_storage_retention: "30d"`
			`# Available since Prometheus 2.7.0`
			`# [EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units`
			`# supported: KB, MB, GB, TB, PB.`
			`prometheus_storage_retention_size: "0"`

			`prometheus_config_flags_extra: {}`
			`# prometheus_config_flags_extra:`
			`# storage.tsdb.retention: 15d`
			`# alertmanager.timeout: 10s`

			`prometheus_alertmanager_config: []`
			`# prometheus_alertmanager_config:`
			`# - scheme: https`
			`# path_prefix: alertmanager/`
			`# basic_auth:`
			`# username: user`
			`# password: pass`
			`# static_configs:`
			`# - targets: ["127.0.0.1:9093"]`
			`# proxy_url: "127.0.0.2"`

			`prometheus_alert_relabel_configs: []`
			`# prometheus_alert_relabel_configs:`
			`# - action: labeldrop`
			`# regex: replica`

			`prometheus_global:`
			`scrape_interval: 15s`
			`scrape_timeout: 10s`
			`evaluation_interval: 15s`

			`prometheus_remote_write: []`
			`# prometheus_remote_write:`
			`# - url: https://dev.kausal.co/prom/push`
			`# basic_auth:`
			`# password: FOO`

			`prometheus_remote_read: []`
			`# prometheus_remote_read:`
			`# - url: https://demo.cloudalchemy.org:9201/read`
			`# basic_auth:`
			`# password: FOO`

			`prometheus_external_labels:`
			`environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"`

			`prometheus_targets: {}`
			`# node:`
			`# - targets:`
			`# - localhost:9100`
			`# labels:`
			`# env: test`

			`prometheus_scrape_configs:`
			`- job_name: "prometheus"`
			`metrics_path: "{{ prometheus_metrics_path }}"`
			`static_configs:`
			`- targets:`
			`- "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090"`
			`- job_name: "node"`
			`file_sd_configs:`
			`- files:`
			`- "{{ prometheus_config_dir }}/file_sd/node.yml"`

			`# Alternative config file name, searched in ansible templates path.`
			`prometheus_config_file: 'prometheus.yml.j2'`

			`prometheus_alert_rules_files:`
			`- prometheus/rules/*.rules`

			`prometheus_static_targets_files:`
			`- prometheus/targets/*.yml`
			`- prometheus/targets/*.json`

			`prometheus_alert_rules:`
			`- alert: Watchdog`
			`expr: vector(1)`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty."`
			`summary: 'Ensure entire alerting pipeline is functional'`
			`- alert: InstanceDown`
			`expr: 'up == 0'`
			`for: 5m`
			`labels:`
			`severity: critical`
			`annotations:`
			`description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'`
			`summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'`
			`- alert: RebootRequired`
			`expr: 'node_reboot_required > 0'`
			`labels:`
			`severity: warning`
			`annotations:`
			`description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}'`
			`summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}'`
			`- alert: NodeFilesystemSpaceFillingUp`
			`annotations:`
			`description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}'`
			`summary: 'Filesystem is predicted to run out of space within the next 24 hours.'`
			`expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"`
			`for: 1h`
			`labels:`
			`severity: warning`
			`- alert: NodeFilesystemSpaceFillingUp`
			`annotations:`
			`description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}'`
			`summary: 'Filesystem is predicted to run out of space within the next 4 hours.'`
			`expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"`
			`for: 1h`
			`labels:`
			`severity: critical`
			`- alert: NodeFilesystemAlmostOutOfSpace`
			`annotations:`
			`description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'`
			`summary: 'Filesystem has less than 5% space left.'`
			`expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"`
			`for: 1h`
			`labels:`
			`severity: warning`
			`- alert: NodeFilesystemAlmostOutOfSpace`
			`annotations:`
			`description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'`
			`summary: 'Filesystem has less than 3% space left.'`
			`expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"`
			`for: 1h`
			`labels:`
			`severity: critical`
			`- alert: NodeFilesystemFilesFillingUp`
			`annotations:`
			`description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}'`
			`summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'`
			`expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"`
			`for: 1h`
			`labels:`
			`severity: warning`
			`- alert: NodeFilesystemFilesFillingUp`
			`annotations:`
			`description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}'`
			`summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'`
			`expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"`
			`for: 1h`
			`labels:`
			`severity: critical`
			`- alert: NodeFilesystemAlmostOutOfFiles`
			`annotations:`
			`description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'`
			`summary: 'Filesystem has less than 5% inodes left.'`
			`expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"`
			`for: 1h`
			`labels:`
			`severity: warning`
			`- alert: NodeFilesystemAlmostOutOfFiles`
			`annotations:`
			`description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'`
			`summary: 'Filesystem has less than 3% inodes left.'`
			`expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"`
			`for: 1h`
			`labels:`
			`severity: critical`
			`- alert: NodeNetworkReceiveErrs`
			`annotations:`
			`description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}'`
			`summary: 'Network interface is reporting many receive errors.'`
			`expr: "increase(node_network_receive_errs_total[2m]) > 10\n"`
			`for: 1h`
			`labels:`
			`severity: warning`
			`- alert: NodeNetworkTransmitErrs`
			`annotations:`
			`description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}'`
			`summary: 'Network interface is reporting many transmit errors.'`
			`expr: "increase(node_network_transmit_errs_total[2m]) > 10\n"`
			`for: 1h`
			`labels:`
			`severity: warning`
			`- alert: NodeHighNumberConntrackEntriesUsed`
			`annotations:`
			`description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}'`
			`summary: 'Number of conntrack are getting close to the limit'`
			`expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n"`
			`labels:`
			`severity: warning`
			`- alert: NodeClockSkewDetected`
			`annotations:`
			`message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}'`
			`summary: 'Clock skew detected.'`
			`expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n"`
			`for: 10m`
			`labels:`
			`severity: warning`
			`- alert: NodeClockNotSynchronising`
			`annotations:`
			`message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}'`
			`summary: 'Clock not synchronising.'`
			`expr: "min_over_time(node_timex_sync_status[5m]) == 0\n"`
			`for: 10m`
			`labels:`
			`severity: warning`