From 816388325262e5c7dd681c8ac6392248b6ca9f0e Mon Sep 17 00:00:00 2001
From: Akihiko Odaki
Date: Tue, 31 May 2022 21:33:23 +0900
Subject: [PATCH] utils: Use exclusive load to wake up from WFE

Commit 9c795fbdbf445d144d331ff2a19a0f42fe0fc190 introduced the pair of
WFE and SEV for the spinlock, but it caused delays of tens of seconds.

A possible explanation for the delay is the lack of a data
synchronization barrier between the store instruction and the SEV
instruction. The Arm Architecture Reference Manual for A-profile
architecture (issue H.a) says:

> Arm recommends that software includes a Data Synchronization Barrier
> (DSB) instruction before any SEV instruction. The DSB instruction
> ensures that no instructions, including any SEV instructions, that
> appear in program order after the DSB instruction, can execute until
> the DSB instruction has completed.

However, inserting a DSB instruction still didn't resolve the delay.

The exclusive load is an alternative to the SEV instruction. The manual
says:

> ...However, in Armv8, when the global monitor for a PE changes from
> Exclusive Access state to Open Access state, an event is generated.
> This is equivalent to issuing an SEVL instruction on the PE for which
> the monitor state has changed. It removes the need for spinlock code
> to include an SEV instruction after clearing a spinlock.

As an additional benefit, the event generated by the exclusive load is
local to the PE, so other PEs no longer receive spurious events.
Trusted Firmware-A v2.6 employs the same algorithm.

Signed-off-by: Akihiko Odaki
---
 src/utils.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/utils.c b/src/utils.c
index 314ba0d8..293f75b9 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -124,19 +124,28 @@ void spin_init(spinlock_t *lock)
 
 void spin_lock(spinlock_t *lock)
 {
+    s64 tmp;
     s64 me = smp_id();
     if (__atomic_load_n(&lock->lock, __ATOMIC_ACQUIRE) == me) {
         lock->count++;
         return;
     }
 
-    s64 free = -1;
-
-    while (!__atomic_compare_exchange_n(&lock->lock, &free, me, false, __ATOMIC_ACQUIRE,
-                                        __ATOMIC_RELAXED)) {
-        free = -1;
-        sysop("wfe");
-    }
+    __asm__ volatile("1:\n"
+                     "mov\t%0, -1\n"
+                     "2:\n"
+                     "\tcasa\t%0, %2, %1\n"
+                     "\tcmn\t%0, 1\n"
+                     "\tbeq\t3f\n"
+                     "\tldxr\t%0, %1\n"
+                     "\tcmn\t%0, 1\n"
+                     "\tbeq\t2b\n"
+                     "\twfe\n"
+                     "\tb\t1b\n"
+                     "3:"
+                     : "=&r"(tmp), "+m"(lock->lock)
+                     : "r"(me)
+                     : "cc", "memory");
 
     assert(__atomic_load_n(&lock->lock, __ATOMIC_RELAXED) == me);
     lock->count++;
@@ -149,7 +158,6 @@ void spin_unlock(spinlock_t *lock)
     assert(lock->count > 0);
     if (!--lock->count)
         __atomic_store_n(&lock->lock, -1L, __ATOMIC_RELEASE);
-    sysop("sev");
 }
 
 bool is_heap(void *addr)
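
For reference, the loop that the inline assembly in spin_lock() above
implements can be sketched in C roughly as follows. This is illustrative
only and not part of the patch: the spin_lock_sketch name is hypothetical,
and it assumes m1n1's utils.h provides spinlock_t, s64 and smp_id().

/*
 * Illustrative sketch only -- not part of the patch.  Shows the control
 * flow of the CASA/LDXR/WFE loop used in spin_lock() above.
 */
#include "utils.h"

static void spin_lock_sketch(spinlock_t *lock)
{
    s64 me = smp_id();
    s64 cur;

    for (;;) {
        /* Try to claim the lock while it is free (-1), with acquire
         * semantics (the CASA instruction in the patch). */
        s64 free = -1;
        if (__atomic_compare_exchange_n(&lock->lock, &free, me, false,
                                        __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
            return;

        /* Exclusive load (LDXR) arms the local monitor, so the owner's
         * later store to lock->lock generates a local event.  This is
         * why spin_unlock() no longer needs SEV. */
        __asm__ volatile("ldxr %0, %1" : "=r"(cur) : "Q"(lock->lock));
        if (cur == -1)
            continue; /* already free again: retry the CAS right away */

        /* Sleep until an event arrives (monitor clear, SEV or interrupt). */
        __asm__ volatile("wfe");
    }
}

Because the LDXR executes before the WFE, a release that happens between
the two still takes the monitor from Exclusive Access to Open Access and
generates the event, so the WFE does not sleep past the wake-up.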