feat: use exponential backoff algorithm when polling actions (#524)

##### SUMMARY

Replace the constant poll interval of 1 second with a truncated
exponential backoff algorithm with jitter.

Below is a series of poll intervals (in seconds) generated by the new
algorithm:
```
1.49
2.14
5.46
6.51
6.57
5.57
5.98
7.13
6.59
7.10
5.54
5.03
6.56
5.96
6.72
7.21
7.05
5.31
5.60
6.33
6.82
5.42
6.08
6.60
TOTAL: 140.77
```
This commit is contained in:
Jonas L 2024-07-04 15:07:05 +02:00 committed by GitHub
parent ecaeac1175
commit 19e586fa22
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 56 additions and 11 deletions

View file

@ -0,0 +1,2 @@
minor_changes:
- Use a truncated exponential backoff algorithm when polling actions from the API.

View file

@ -3,6 +3,7 @@
from __future__ import annotations from __future__ import annotations
from contextlib import contextmanager from contextlib import contextmanager
from random import random
from ansible.module_utils.basic import missing_required_lib from ansible.module_utils.basic import missing_required_lib
@ -106,3 +107,22 @@ class Client(ClientBase):
yield yield
finally: finally:
self._requests_session = requests.Session() self._requests_session = requests.Session()
def exponential_backoff_poll_interval(*, base: float, multiplier: int, cap: float, jitter: float):
    """
    Return a poll interval function, implementing a truncated exponential backoff with jitter.

    The returned function takes the number of retries already performed and
    returns the interval to wait before the next poll:
    ``min(cap, base * multiplier**retries)`` plus a random share of that
    value, between 0 and ``jitter`` times the truncated interval.

    :param base: Base for the exponential backoff algorithm.
    :param multiplier: Multiplier for the exponential backoff algorithm.
    :param cap: Value at which the interval is truncated.
    :param jitter: Proportion of the interval to add as random jitter.
    """

    def func(retries: int) -> float:
        try:
            interval = base * multiplier**retries  # Exponential backoff
        except OverflowError:
            # The exponential term no longer fits in a float (for example
            # 2**1024). Assuming a growing backoff, the value is far beyond
            # the cap, so truncate instead of failing the polling loop.
            interval = cap
        interval = min(cap, interval)  # Cap backoff
        interval += random() * interval * jitter  # Add jitter
        return interval

    return func

View file

@ -15,7 +15,12 @@ from ansible.module_utils.common.validation import (
check_required_one_of, check_required_one_of,
) )
from .client import ClientException, client_check_required_lib, client_get_by_name_or_id from .client import (
ClientException,
client_check_required_lib,
client_get_by_name_or_id,
exponential_backoff_poll_interval,
)
from .vendor.hcloud import APIException, Client, HCloudException from .vendor.hcloud import APIException, Client, HCloudException
from .vendor.hcloud.actions import ActionException from .vendor.hcloud.actions import ActionException
from .version import version from .version import version
@ -81,6 +86,9 @@ class AnsibleHCloud:
api_endpoint=self.module.params["api_endpoint"], api_endpoint=self.module.params["api_endpoint"],
application_name="ansible-module", application_name="ansible-module",
application_version=version, application_version=version,
# Total waiting time before timeout is > 117.0
poll_interval=exponential_backoff_poll_interval(base=1.0, multiplier=2, cap=5.0, jitter=0.5),
poll_max_retries=25,
) )
def _client_get_by_name_or_id(self, resource: str, param: str | int): def _client_get_by_name_or_id(self, resource: str, param: str | int):

View file

@ -204,7 +204,7 @@ class AnsibleHCloudCertificate(AnsibleHCloud):
resp = self.client.certificates.create_managed(**params) resp = self.client.certificates.create_managed(**params)
# Action should take 60 to 90 seconds on average, wait for 5m to # Action should take 60 to 90 seconds on average, wait for 5m to
# allow DNS or Let's Encrypt slowdowns. # allow DNS or Let's Encrypt slowdowns.
resp.action.wait_until_finished(max_retries=300) resp.action.wait_until_finished(max_retries=62) # 62 retries >= 302 seconds
except HCloudException as exception: except HCloudException as exception:
self.fail_json_hcloud(exception) self.fail_json_hcloud(exception)

View file

@ -471,7 +471,7 @@ class AnsibleHCloudServer(AnsibleHCloud):
self.result["root_password"] = resp.root_password self.result["root_password"] = resp.root_password
# Action should take 60 to 90 seconds on average, but can be >10m when creating a # Action should take 60 to 90 seconds on average, but can be >10m when creating a
# server from a custom images # server from a custom images
resp.action.wait_until_finished(max_retries=1800) resp.action.wait_until_finished(max_retries=362) # 362 retries >= 1802 seconds
for action in resp.next_actions: for action in resp.next_actions:
action.wait_until_finished() action.wait_until_finished()
@ -671,17 +671,18 @@ class AnsibleHCloudServer(AnsibleHCloud):
self.stop_server_if_forced() self.stop_server_if_forced()
upgrade_disk = self.module.params.get("upgrade_disk")
# Upgrading a server takes 160 seconds on average, upgrading the disk should
# take more time
upgrade_timeout = 600 if upgrade_disk else 180
if not self.module.check_mode: if not self.module.check_mode:
upgrade_disk = self.module.params.get("upgrade_disk")
action = self.hcloud_server.change_type( action = self.hcloud_server.change_type(
server_type=self._get_server_type(), server_type=self._get_server_type(),
upgrade_disk=upgrade_disk, upgrade_disk=upgrade_disk,
) )
action.wait_until_finished(max_retries=upgrade_timeout) # Upgrading a server takes 160 seconds on average, upgrading the disk should
# take more time
# 122 retries >= 602 seconds
# 38 retries >= 182 seconds
action.wait_until_finished(max_retries=122 if upgrade_disk else 38)
self._mark_as_changed() self._mark_as_changed()
def _update_server_ip(self, kind: Literal["ipv4", "ipv6"]) -> None: def _update_server_ip(self, kind: Literal["ipv4", "ipv6"]) -> None:
@ -867,9 +868,9 @@ class AnsibleHCloudServer(AnsibleHCloud):
try: try:
if not self.module.check_mode: if not self.module.check_mode:
image = self._get_image(self.hcloud_server.server_type) image = self._get_image(self.hcloud_server.server_type)
# When we rebuild the server progress takes some more time.
resp = self.client.servers.rebuild(self.hcloud_server, image) resp = self.client.servers.rebuild(self.hcloud_server, image)
resp.action.wait_until_finished(1000) # When we rebuild the server progress takes some more time.
resp.action.wait_until_finished(max_retries=202) # 202 retries >= 1002 seconds
self._mark_as_changed() self._mark_as_changed()
self._get_server() self._get_server()

View file

@ -0,0 +1,14 @@
from __future__ import annotations
from ansible_collections.hetzner.hcloud.plugins.module_utils.client import (
exponential_backoff_poll_interval,
)
def test_exponential_backoff_poll_interval():
    """Check the jitter-free backoff sequence: it doubles up to the cap, then stays constant."""
    max_retries = 25
    compute_interval = exponential_backoff_poll_interval(
        base=1.0,
        multiplier=2,
        cap=5.0,
        jitter=0.0,
    )

    # With jitter disabled the sequence is deterministic.
    intervals = list(map(compute_interval, range(max_retries)))

    assert sum(intervals) == 117.0
    assert intervals[:6] == [1.0, 2.0, 4.0, 5.0, 5.0, 5.0]