Add nvidia_gpu_exporter

Signed-off-by: Jack <jack4zhang@gmail.com>
This commit is contained in:
Jack 2024-10-29 15:32:29 +08:00
parent e2d8b745dc
commit 5e6c551656
18 changed files with 483 additions and 0 deletions

View file

@ -0,0 +1,54 @@
<p><img src="https://www.circonus.com/wp-content/uploads/2015/03/sol-icon-itOps.png" alt="graph logo" title="graph" align="right" height="60" /></p>
# Ansible Role: Nvidia GPU exporter
## Description
Deploy the Prometheus [Nvidia GPU exporter](https://github.com/utkuozdemir/nvidia_gpu_exporter) using Ansible.
## Requirements
- Ansible >= 2.9 (It might work on previous versions, but we cannot guarantee it)
- gnu-tar on Mac deployer host (`brew install gnu-tar`)
- Passlib is required when using the basic authentication feature (`pip install passlib[bcrypt]`)
## Role Variables
All variables which can be overridden are stored in [defaults/main.yml](defaults/main.yml) file as well as in [meta/argument_specs.yml](meta/argument_specs.yml).
Please refer to the [collection docs](https://prometheus-community.github.io/ansible/branch/main/nvidia_gpu_exporter_role.html) for description and default values of the variables.
## Example
### Playbook
Use it in a playbook as follows:
```yaml
- hosts: all
roles:
- prometheus.prometheus.nvidia_gpu_exporter
```
### Demo site
We provide an example site that demonstrates a full monitoring solution based on prometheus and grafana. The repository with code and links to running instances is [available on github](https://github.com/prometheus/demo-site) and the site is hosted on [DigitalOcean](https://digitalocean.com).
## Local Testing
The preferred way of locally testing the role is to use Docker and [molecule](https://github.com/ansible-community/molecule) (v3.x). You will have to install Docker on your system. See "Get started" for a Docker package suitable for your system. Running your tests is as simple as executing `molecule test`.
## Continuous Integration
Combining molecule and CircleCI allows us to test how new PRs will behave when used with multiple Ansible versions and multiple operating systems. This also allows us to create test scenarios for different role configurations. As a result we have quite a large test matrix which can take more time than local testing, so please be patient.
## Contributing
See [contributor guideline](CONTRIBUTING.md).
## Troubleshooting
See [troubleshooting](TROUBLESHOOTING.md).
## License
This project is licensed under MIT License. See [LICENSE](/LICENSE) for more details.

View file

@ -0,0 +1,16 @@
---
# User-overridable defaults for the nvidia_gpu_exporter role.

# Release to install; quoted so it is always handled as a string.
nvidia_gpu_exporter_version: "1.2.1"

# Download locations of the release archive and its checksum list.
nvidia_gpu_exporter_binary_url: "https://github.com/{{ _nvidia_gpu_exporter_repo }}/releases/download/\
  v{{ nvidia_gpu_exporter_version }}/nvidia_gpu_exporter_{{ nvidia_gpu_exporter_version }}_\
  {{ ansible_system | lower }}_{{ _nvidia_gpu_exporter_go_ansible_arch }}.tar.gz"
nvidia_gpu_exporter_checksums_url: "https://github.com/{{ _nvidia_gpu_exporter_repo }}/releases/download/v{{ nvidia_gpu_exporter_version }}/checksums.txt"

# Values passed to the exporter's --web.* command-line flags.
nvidia_gpu_exporter_web_listen_address: "0.0.0.0:9835"
nvidia_gpu_exporter_web_telemetry_path: "/metrics"

# Installation layout and the account the service runs as.
nvidia_gpu_exporter_binary_install_dir: "/usr/local/bin"
nvidia_gpu_exporter_config_dir: "/etc/nvidia_gpu_exporter"
nvidia_gpu_exporter_system_group: "nvidia-gpu-exp"
nvidia_gpu_exporter_system_user: "{{ nvidia_gpu_exporter_system_group }}"

# Local path to stash the archive and its extraction
nvidia_gpu_exporter_local_cache_path: "/tmp/nvidia_gpu_exporter-{{ ansible_system | lower }}-{{ _nvidia_gpu_exporter_go_ansible_arch }}/{{ nvidia_gpu_exporter_version }}"

View file

@ -0,0 +1,10 @@
---
# Handler notified through the "restart nvidia_gpu_exporter" topic. It reloads
# systemd's unit definitions and restarts the service; skipped in check mode.
- name: Restart nvidia_gpu_exporter
  listen: "restart nvidia_gpu_exporter"
  become: true
  when: not ansible_check_mode
  ansible.builtin.systemd:
    name: nvidia_gpu_exporter
    state: restarted
    daemon_reload: true

View file

@ -0,0 +1,46 @@
---
# yamllint disable rule:line-length
# Argument specification for the role's "main" entry point. The defaults listed
# here are meant to mirror the values in defaults/main.yml.
argument_specs:
  main:
    short_description: "Prometheus Nvidia GPU Exporter"
    description:
      - "Deploy prometheus L(Nvidia GPU exporter,https://github.com/utkuozdemir/nvidia_gpu_exporter) using ansible"
    author:
      - "Prometheus Community"
    options:
      nvidia_gpu_exporter_version:
        description: "Nvidia GPU exporter package version. Also accepts latest as parameter."
        default: "1.2.1"
      nvidia_gpu_exporter_binary_url:
        description: "URL of the Nvidia GPU exporter binaries .tar.gz file"
        default: "https://github.com/{{ _nvidia_gpu_exporter_repo }}/releases/download/v{{ nvidia_gpu_exporter_version }}/nvidia_gpu_exporter_{{ nvidia_gpu_exporter_version }}_{{ ansible_system | lower }}_{{ _nvidia_gpu_exporter_go_ansible_arch }}.tar.gz"
      nvidia_gpu_exporter_checksums_url:
        description: "URL of the Nvidia GPU exporter checksums file"
        # Must name the same file as defaults/main.yml (checksums.txt).
        default: "https://github.com/{{ _nvidia_gpu_exporter_repo }}/releases/download/v{{ nvidia_gpu_exporter_version }}/checksums.txt"
      nvidia_gpu_exporter_web_listen_address:
        description: "Address on which Nvidia GPU exporter will listen"
        default: "0.0.0.0:9835"
      nvidia_gpu_exporter_web_telemetry_path:
        description: "Path under which to expose metrics"
        default: "/metrics"
      nvidia_gpu_exporter_binary_install_dir:
        description:
          - "I(Advanced)"
          - "Directory to install nvidia_gpu_exporter binary"
        default: "/usr/local/bin"
      nvidia_gpu_exporter_system_group:
        description:
          - "I(Advanced)"
          - "System group for Nvidia GPU exporter"
        default: "nvidia-gpu-exp"
      nvidia_gpu_exporter_system_user:
        description:
          - "I(Advanced)"
          - "Nvidia GPU exporter user"
        default: "nvidia-gpu-exp"
      nvidia_gpu_exporter_local_cache_path:
        description: "Local path to stash the archive and its extraction"
        default: "/tmp/nvidia_gpu_exporter-{{ ansible_system | lower }}-{{ _nvidia_gpu_exporter_go_ansible_arch }}/{{ nvidia_gpu_exporter_version }}"
      nvidia_gpu_exporter_config_dir:
        description: "Path to directory with nvidia_gpu_exporter configuration"
        default: "/etc/nvidia_gpu_exporter"

View file

@ -0,0 +1,25 @@
---
# Ansible Galaxy metadata for the role.
galaxy_info:
  author: "Prometheus Community"
  description: "Nvidia GPU exporter"
  license: "Apache"
  min_ansible_version: "2.9"

  # Distributions and releases the role declares support for.
  platforms:
    - name: "Ubuntu"
      versions: ["focal", "jammy", "noble"]
    - name: "Debian"
      versions: ["bullseye"]
    - name: "EL"
      versions: ["8", "9"]

  galaxy_tags:
    - "monitoring"
    - "prometheus"
    - "exporter"
    - "metrics"
    - "system"

View file

@ -0,0 +1,14 @@
---
# Molecule overrides for the "alternative" scenario: a custom prepare playbook,
# a list of listen addresses and an explicit download URL.
provisioner:
  playbooks:
    prepare: "${MOLECULE_PROJECT_DIRECTORY}/../../.config/molecule/alternative/prepare.yml"
  inventory:
    group_vars:
      all:
        nvidia_gpu_exporter_version: "1.2.1"
        nvidia_gpu_exporter_local_cache_path: "/tmp/nvidia_gpu_exporter-linux_x86_64"
        # A list here makes the unit template emit one flag per address.
        nvidia_gpu_exporter_web_listen_address:
          - "127.0.0.1:9835"
          - "127.0.1.1:9835"
        nvidia_gpu_exporter_binary_url: "https://github.com/utkuozdemir/nvidia_gpu_exporter/releases/download/\
          v{{ nvidia_gpu_exporter_version }}/nvidia_gpu_exporter_{{ nvidia_gpu_exporter_version }}_linux_x86_64.tar.gz"

View file

@ -0,0 +1,33 @@
# Testinfra checks for the "alternative" molecule scenario.
from __future__ import (absolute_import, division, print_function)
# Make classes defined in this module new-style when run under Python 2.
__metaclass__ = type
from testinfra_helpers import get_target_hosts
import pytest
# Hosts for this module's tests, as returned by the project's helper
# (presumably consumed by testinfra via this module-level name -- confirm).
testinfra_hosts = get_target_hosts()
def test_service(host):
s = host.service("nvidia_gpu_exporter")
try:
assert s.is_running
except AssertionError:
# Capture service logs
journal_output = host.run('journalctl -u nvidia_gpu_exporter --since "1 hour ago"')
print("\n==== journalctl -u nvidia_gpu_exporter Output ====\n")
print(journal_output)
print("\n============================================\n")
raise # Re-raise the original assertion error
def test_protecthome_property(host):
s = host.service("nvidia_gpu_exporter")
p = s.systemd_properties
assert p.get("ProtectHome") == "yes"
@pytest.mark.parametrize(
    "sockets",
    ["tcp://127.0.1.1:9835"],
)
def test_socket(host, sockets):
    """Check that the parametrized socket address is in listening state."""
    endpoint = host.socket(sockets)
    assert endpoint.is_listening

View file

@ -0,0 +1,6 @@
---
# Molecule overrides for the default scenario: listen on loopback only.
provisioner:
  inventory:
    group_vars:
      all:
        nvidia_gpu_exporter_web_listen_address: '127.0.0.1:9835'

View file

@ -0,0 +1,66 @@
# Testinfra checks for the default molecule scenario.
from __future__ import (absolute_import, division, print_function)
# Make classes defined in this module new-style when run under Python 2.
__metaclass__ = type
from testinfra_helpers import get_target_hosts
# Hosts for this module's tests, as returned by the project's helper
# (presumably consumed by testinfra via this module-level name -- confirm).
testinfra_hosts = get_target_hosts()
def test_files(host):
files = [
"/etc/systemd/system/nvidia_gpu_exporter.service",
"/usr/local/bin/nvidia_gpu_exporter"
]
for file in files:
f = host.file(file)
assert f.exists
assert f.is_file
def test_permissions_didnt_change(host):
dirs = [
"/etc",
"/root",
"/usr",
"/var"
]
for file in dirs:
f = host.file(file)
assert f.exists
assert f.is_directory
assert f.user == "root"
assert f.group == "root"
def test_user(host):
assert host.group("nvidia-gpu-exp").exists
assert "nvidia-gpu-exp" in host.user("nvidia-gpu-exp").groups
assert host.user("nvidia-gpu-exp").shell == "/usr/sbin/nologin"
def test_service(host):
s = host.service("nvidia_gpu_exporter")
try:
assert s.is_running
except AssertionError:
# Capture service logs
journal_output = host.run('journalctl -u nvidia_gpu_exporter --since "1 hour ago"')
print("\n==== journalctl -u nvidia_gpu_exporter Output ====\n")
print(journal_output)
print("\n============================================\n")
raise # Re-raise the original assertion error
def test_protecthome_property(host):
s = host.service("nvidia_gpu_exporter")
p = s.systemd_properties
assert p.get("ProtectHome") == "yes"
def test_socket(host):
sockets = [
"tcp://127.0.0.1:9835"
]
for socket in sockets:
s = host.socket(socket)
assert s.is_listening

View file

@ -0,0 +1,6 @@
---
# Molecule overrides for the "latest" scenario: the preflight tasks resolve the
# literal version "latest" to a concrete release tag.
provisioner:
  inventory:
    group_vars:
      all:
        nvidia_gpu_exporter_version: 'latest'

View file

@ -0,0 +1,41 @@
# Testinfra checks for the "latest" molecule scenario.
from __future__ import (absolute_import, division, print_function)
# Make classes defined in this module new-style when run under Python 2.
__metaclass__ = type
from testinfra_helpers import get_target_hosts
import pytest
# Hosts for this module's tests, as returned by the project's helper
# (presumably consumed by testinfra via this module-level name -- confirm).
testinfra_hosts = get_target_hosts()
@pytest.mark.parametrize(
    "files",
    [
        "/etc/systemd/system/nvidia_gpu_exporter.service",
        "/usr/local/bin/nvidia_gpu_exporter",
    ],
)
def test_files(host, files):
    """Check that the parametrized path exists and is a regular file."""
    entry = host.file(files)
    assert entry.exists
    assert entry.is_file
def test_service(host):
s = host.service("nvidia_gpu_exporter")
try:
assert s.is_running
except AssertionError:
# Capture service logs
journal_output = host.run('journalctl -u nvidia_gpu_exporter --since "1 hour ago"')
print("\n==== journalctl -u nvidia_gpu_exporter Output ====\n")
print(journal_output)
print("\n============================================\n")
raise # Re-raise the original assertion error
def test_protecthome_property(host):
s = host.service("nvidia_gpu_exporter")
p = s.systemd_properties
assert p.get("ProtectHome") == "yes"
def test_socket(host):
s = host.socket("tcp://0.0.0.0:9835")
assert s.is_listening

View file

@ -0,0 +1,69 @@
---
# Role entry point: run the preflight checks, then the shared install, SELinux
# and configure task files of the _common role, and finally enable and start
# the service.

- name: Preflight
  tags:
    - nvidia_gpu_exporter
    - install
    - configure
    - run
    - nvidia_gpu_exporter_install
    - nvidia_gpu_exporter_configure
    - nvidia_gpu_exporter_run
  ansible.builtin.include_tasks:
    file: preflight.yml

- name: Install
  tags:
    - nvidia_gpu_exporter
    - install
    - nvidia_gpu_exporter_install
  ansible.builtin.include_role:
    name: prometheus.prometheus._common
    tasks_from: install.yml
  vars:
    _common_binaries: "{{ _nvidia_gpu_exporter_binaries }}"
    _common_binary_url: "{{ nvidia_gpu_exporter_binary_url }}"
    _common_checksums_url: "{{ nvidia_gpu_exporter_checksums_url }}"
    _common_local_cache_path: "{{ nvidia_gpu_exporter_local_cache_path }}"
    _common_binary_install_dir: "{{ nvidia_gpu_exporter_binary_install_dir }}"
    _common_config_dir: "{{ nvidia_gpu_exporter_config_dir }}"
    _common_system_user: "{{ nvidia_gpu_exporter_system_user }}"
    _common_system_group: "{{ nvidia_gpu_exporter_system_group }}"

- name: SELinux
  when: ansible_selinux.status == "enabled"
  tags:
    - nvidia_gpu_exporter
    - configure
    - nvidia_gpu_exporter_configure
  ansible.builtin.include_role:
    name: prometheus.prometheus._common
    tasks_from: selinux.yml
  vars:
    # NOTE(review): urlsplit is applied to the raw variable, which the
    # "alternative" scenario sets to a list -- confirm this works for lists.
    _common_selinux_port: "{{ nvidia_gpu_exporter_web_listen_address | urlsplit('port') }}"

- name: Configure
  tags:
    - nvidia_gpu_exporter
    - configure
    - nvidia_gpu_exporter_configure
  ansible.builtin.include_role:
    name: prometheus.prometheus._common
    tasks_from: configure.yml
  vars:
    _common_config_dir: "{{ nvidia_gpu_exporter_config_dir }}"
    _common_system_user: "{{ nvidia_gpu_exporter_system_user }}"
    _common_system_group: "{{ nvidia_gpu_exporter_system_group }}"

- name: Ensure Nvidia GPU Exporter is enabled on boot
  become: true
  when: not ansible_check_mode
  tags:
    - nvidia_gpu_exporter
    - run
    - nvidia_gpu_exporter_run
  ansible.builtin.systemd:
    name: nvidia_gpu_exporter
    enabled: true
    state: started
    daemon_reload: true

View file

@ -0,0 +1,31 @@
---
# Preflight: run the shared checks, validate the listen address(es) and, when
# the requested version is "latest", resolve it to a concrete release tag.

- name: Common preflight
  ansible.builtin.include_role:
    name: prometheus.prometheus._common
    tasks_from: preflight.yml

# The variable may be a single string or a list; wrapping it in a list and
# flattening handles both. Every entry must end in ":<digits>".
- name: Naive assertion of proper listen address
  ansible.builtin.assert:
    that:
      - >-
        [nvidia_gpu_exporter_web_listen_address] |
        flatten |
        reject('match', '.+:\\d+$') |
        list |
        length == 0

# The API URL is built with "~" concatenation: a "{{ ... }}" placed inside a
# string literal of an expression is not a nested template and should not be
# relied upon to expand.
- name: Discover latest version
  ansible.builtin.set_fact:
    nvidia_gpu_exporter_version: "{{ (lookup('url', 'https://api.github.com/repos/' ~ _nvidia_gpu_exporter_repo ~ '/releases/latest',
                                 headers=_nvidia_gpu_exporter_github_api_headers,
                                 split_lines=False) | from_json).get('tag_name') | replace('v', '') }}"
  run_once: true
  # Retry until the looked-up value parses as a version.
  until: nvidia_gpu_exporter_version is version('0.0.0', '>=')
  retries: 10
  when:
    - nvidia_gpu_exporter_version == "latest"
  tags:
    - nvidia_gpu_exporter
    - install
    - nvidia_gpu_exporter_install
    - download
    - nvidia_gpu_exporter_download

View file

@ -0,0 +1,45 @@
{# Systemd unit template for the nvidia_gpu_exporter service. #}
{{ ansible_managed | comment }}
[Unit]
Description=Nvidia GPU Exporter
After=network-online.target
[Service]
Type=simple
User={{ nvidia_gpu_exporter_system_user }}
Group={{ nvidia_gpu_exporter_system_group }}
{# One --web.listen-address flag is emitted per entry when the variable is a list; a plain string yields a single flag. #}
ExecStart={{ nvidia_gpu_exporter_binary_install_dir }}/nvidia_gpu_exporter \
{% if nvidia_gpu_exporter_web_listen_address is iterable and
nvidia_gpu_exporter_web_listen_address is not mapping and
nvidia_gpu_exporter_web_listen_address is not string %}
{% for address in nvidia_gpu_exporter_web_listen_address %}
'--web.listen-address={{ address }}' \
{% endfor %}
{% else %}
'--web.listen-address={{ nvidia_gpu_exporter_web_listen_address }}' \
{% endif %}
'--web.telemetry-path={{ nvidia_gpu_exporter_web_telemetry_path }}'
SyslogIdentifier=nvidia_gpu_exporter
Restart=always
RestartSec=1
StartLimitInterval=0
{# ProtectHome starts as 'yes' and becomes 'read-only' when any mount point begins with /home. #}
{% set ns = namespace(protect_home = 'yes') %}
{% for m in ansible_mounts if m.mount.startswith('/home') %}
{% set ns.protect_home = 'read-only' %}
{% endfor %}
ProtectHome={{ ns.protect_home }}
NoNewPrivileges=yes
{# The stricter sandboxing directives are only emitted when the installed systemd package is version 232 or newer. #}
{% if (ansible_facts.packages.systemd | first).version is version('232', '>=') %}
ProtectSystem=strict
ProtectControlGroups=true
ProtectKernelModules=true
ProtectKernelTunables=yes
{% else %}
ProtectSystem=full
{% endif %}
[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,9 @@
---
# Internal variables of the role (underscore-prefixed).

# Upstream GitHub repository and the binaries shipped in its release archive.
_nvidia_gpu_exporter_repo: "utkuozdemir/nvidia_gpu_exporter"
_nvidia_gpu_exporter_binaries:
  - "nvidia_gpu_exporter"

# Translate ansible_architecture into the architecture label used in release
# file names; unknown architectures are passed through unchanged.
_nvidia_gpu_exporter_go_ansible_arch: "{{ {
    'i386': 'i386',
    'x86_64': 'x86_64',
    'aarch64': 'arm64',
    'armv7l': 'armv7',
    'armv6l': 'armv6'
  }.get(ansible_architecture, ansible_architecture) }}"

# Headers for GitHub API lookups: populated only when GITHUB_TOKEN is set in the
# controller's environment, otherwise empty.
# NOTE(review): the header is named GITHUB_TOKEN -- confirm the API honours it.
_nvidia_gpu_exporter_github_api_headers: "{{ {'GITHUB_TOKEN': lookup('ansible.builtin.env', 'GITHUB_TOKEN')} if (lookup('ansible.builtin.env', 'GITHUB_TOKEN')) else {} }}"

View file

@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Derive the collection root (.../ansible_collections/<namespace>/<name>) from
# the current working directory, then source the shared molecule runner.
collection_root="$(grep -oP ".+\/ansible_collections\/\w+?\/\w+" <<< "$(pwd)")"
source "${collection_root}/tests/integration/molecule.sh"

View file

@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Derive the collection root (.../ansible_collections/<namespace>/<name>) from
# the current working directory, then source the shared molecule runner.
collection_root="$(grep -oP ".+\/ansible_collections\/\w+?\/\w+" <<< "$(pwd)")"
source "${collection_root}/tests/integration/molecule.sh"

View file

@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Derive the collection root (.../ansible_collections/<namespace>/<name>) from
# the current working directory, then source the shared molecule runner.
collection_root="$(grep -oP ".+\/ansible_collections\/\w+?\/\w+" <<< "$(pwd)")"
source "${collection_root}/tests/integration/molecule.sh"