---
prometheus_version: 2.54.1
prometheus_binary_url: "https://github.com/{{ _prometheus_repo }}/releases/download/v{{ prometheus_version }}/\
                        prometheus-{{ prometheus_version }}.{{ ansible_system | lower }}-{{ _prometheus_go_ansible_arch }}.tar.gz"
prometheus_checksums_url: "https://github.com/{{ _prometheus_repo }}/releases/download/v{{ prometheus_version }}/sha256sums.txt"

prometheus_binary_install_dir: /usr/local/bin
prometheus_config_dir: /etc/prometheus
prometheus_db_dir: /var/lib/prometheus
prometheus_read_only_dirs: []

prometheus_web_listen_address: "0.0.0.0:9090"
prometheus_web_external_url: ''
prometheus_metrics_path: "/{{ (prometheus_web_external_url + '/metrics') | regex_replace('^(.*://)?(.*?)/') }}"

# See https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md
prometheus_web_config:
  tls_server_config: {}
  http_server_config: {}
  basic_auth_users: {}

prometheus_storage_retention: "30d"
# Available since Prometheus 2.7.0
# [EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units
# supported: KB, MB, GB, TB, PB.
prometheus_storage_retention_size: "0"

# The Agent mode optimizes Prometheus for the remote write use case:
# https://prometheus.io/blog/2021/11/16/agent/
prometheus_agent_mode: false

prometheus_config_flags_extra: {}
# prometheus_config_flags_extra:
#   storage.tsdb.retention: 15d
#   alertmanager.timeout: 10s

prometheus_alertmanager_config: []
# prometheus_alertmanager_config:
#   - scheme: https
#     path_prefix: alertmanager/
#     basic_auth:
#       username: user
#       password: pass
#     static_configs:
#       - targets: ["127.0.0.1:9093"]
#     proxy_url: "127.0.0.2"

prometheus_alert_relabel_configs: []
# prometheus_alert_relabel_configs:
#   - action: labeldrop
#     regex: replica

prometheus_global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s

prometheus_remote_write: []
# prometheus_remote_write:
#   - url: https://dev.kausal.co/prom/push
#     basic_auth:
#       password: FOO

prometheus_remote_read: []
# prometheus_remote_read:
#   - url: https://prometheus.demo.do.prometheus.io:9201/read
#     basic_auth:
#       password: FOO

prometheus_external_labels:
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"

prometheus_targets: {}
# node:
#   - targets:
#       - localhost:9100
#     labels:
#       env: test

prometheus_scrape_configs:
  - job_name: "prometheus"
    metrics_path: "{{ prometheus_metrics_path }}"
    static_configs:
      - targets:
          - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090"
  - job_name: "node"
    file_sd_configs:
      - files:
          - "{{ prometheus_config_dir }}/file_sd/node.yml"

# Alternative config file name, searched in ansible templates path.
prometheus_config_file: 'prometheus.yml.j2'

prometheus_alert_rules_files:
  - prometheus/rules/*.rules

prometheus_static_targets_files:
  - prometheus/targets/*.yml
  - prometheus/targets/*.json

prometheus_scrape_config_files:
  - prometheus/targets/*.yml
  - prometheus/targets/*.json

# yamllint disable rule:line-length
prometheus_alert_rules:  # noqa yaml[line-length] # noqa line-length
  - alert: Watchdog
    expr: vector(1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty."
      summary: 'Ensure entire alerting pipeline is functional'
  - alert: InstanceDown
    expr: 'up == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'
  - alert: RebootRequired
    expr: 'node_reboot_required > 0'
    labels:
      severity: warning
    annotations:
      description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}'
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}'
      summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
    expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}'
      summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
    expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
      summary: 'Filesystem has less than 5% space left.'
    expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
      summary: 'Filesystem has less than 3% space left.'
    expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}'
      summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
    expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}'
      summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
    expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 5% inodes left.'
    expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 3% inodes left.'
    expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeNetworkReceiveErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many receive errors.'
    expr: "increase(node_network_receive_errs_total[2m]) > 10\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeNetworkTransmitErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many transmit errors.'
    expr: "increase(node_network_transmit_errs_total[2m]) > 10\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeHighNumberConntrackEntriesUsed
    annotations:
      description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}'
      summary: 'Number of conntrack are getting close to the limit'
    expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n"
    labels:
      severity: warning
  - alert: NodeClockSkewDetected
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}'
      summary: 'Clock skew detected.'
    expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n"
    for: 10m
    labels:
      severity: warning
  - alert: NodeClockNotSynchronising
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}'
      summary: 'Clock not synchronising.'
    expr: "min_over_time(node_timex_sync_status[5m]) == 0\n"
    for: 10m
    labels:
      severity: warning
# yamllint enable rule:line-length

prometheus_system_group: 'prometheus'
prometheus_system_user: "{{ prometheus_system_group }}"

prometheus_stop_timeout: '600s'

# Local path to stash the archive and its extraction
prometheus_local_cache_path: "/tmp/prometheus-{{ ansible_system | lower }}-{{ _prometheus_go_ansible_arch }}/{{ prometheus_version }}"