Merge branch '2021-09-24-arm64-optimized-str-funcs' into next

- Bring in, but disable by default, asm optimized string functions for arm64.
2024-09-29 19:02:12 +00:00 · 2021-09-24 10:13:44 -04:00 · 2021-09-24 10:13:44 -04:00 · 7d1fcaea12
commit 7d1fcaea12
parent 2c14ff5879 4e062fc955
6 changed files with 532 additions and 8 deletions
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@ -455,8 +455,8 @@ config ARM_CORTEX_CPU_IS_UP

 config USE_ARCH_MEMCPY
 	bool "Use an assembly optimized implementation of memcpy"
-	default y
-	depends on !ARM64
+	default y if !ARM64
+	depends on !ARM64 || (ARM64 && (GCC_VERSION >= 90400))
 	help
 	  Enable the generation of an optimized version of memcpy.
 	  Such an implementation may be faster under some conditions
@ -465,7 +465,7 @@ config USE_ARCH_MEMCPY
 config SPL_USE_ARCH_MEMCPY
 	bool "Use an assembly optimized implementation of memcpy for SPL"
 	default y if USE_ARCH_MEMCPY
-	depends on !ARM64 && SPL
+	depends on SPL
 	help
 	  Enable the generation of an optimized version of memcpy.
 	  Such an implementation may be faster under some conditions
@ -474,16 +474,43 @@ config SPL_USE_ARCH_MEMCPY
 config TPL_USE_ARCH_MEMCPY
 	bool "Use an assembly optimized implementation of memcpy for TPL"
 	default y if USE_ARCH_MEMCPY
-	depends on !ARM64 && TPL
+	depends on TPL
 	help
 	  Enable the generation of an optimized version of memcpy.
 	  Such an implementation may be faster under some conditions
 	  but may increase the binary size.

+config USE_ARCH_MEMMOVE
+	bool "Use an assembly optimized implementation of memmove" if !ARM64
+	default USE_ARCH_MEMCPY if ARM64
+	depends on ARM64
+	help
+	  Enable the generation of an optimized version of memmove.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config SPL_USE_ARCH_MEMMOVE
+	bool "Use an assembly optimized implementation of memmove for SPL" if !ARM64
+	default SPL_USE_ARCH_MEMCPY if ARM64
+	depends on SPL && ARM64
+	help
+	  Enable the generation of an optimized version of memmove.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config TPL_USE_ARCH_MEMMOVE
+	bool "Use an assembly optimized implementation of memmove for TPL" if !ARM64
+	default TPL_USE_ARCH_MEMCPY if ARM64
+	depends on TPL && ARM64
+	help
+	  Enable the generation of an optimized version of memmove.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
 config USE_ARCH_MEMSET
 	bool "Use an assembly optimized implementation of memset"
-	default y
-	depends on !ARM64
+	default y if !ARM64
+	depends on !ARM64 || (ARM64 && (GCC_VERSION >= 90400))
 	help
 	  Enable the generation of an optimized version of memset.
 	  Such an implementation may be faster under some conditions
@ -492,7 +519,7 @@ config USE_ARCH_MEMSET
 config SPL_USE_ARCH_MEMSET
 	bool "Use an assembly optimized implementation of memset for SPL"
 	default y if USE_ARCH_MEMSET
-	depends on !ARM64 && SPL
+	depends on SPL
 	help
 	  Enable the generation of an optimized version of memset.
 	  Such an implementation may be faster under some conditions
@ -501,7 +528,7 @@ config SPL_USE_ARCH_MEMSET
 config TPL_USE_ARCH_MEMSET
 	bool "Use an assembly optimized implementation of memset for TPL"
 	default y if USE_ARCH_MEMSET
-	depends on !ARM64 && TPL
+	depends on TPL
 	help
 	  Enable the generation of an optimized version of memset.
 	  Such an implementation may be faster under some conditions
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@ -19,7 +19,11 @@ extern char * strchr(const char * s, int c);
 #endif
 extern void * memcpy(void *, const void *, __kernel_size_t);

+#if CONFIG_IS_ENABLED(USE_ARCH_MEMMOVE)
+#define __HAVE_ARCH_MEMMOVE
+#else
 #undef __HAVE_ARCH_MEMMOVE
+#endif
 extern void * memmove(void *, const void *, __kernel_size_t);

 #undef __HAVE_ARCH_MEMCHR
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@ -39,8 +39,13 @@ obj-$(CONFIG_$(SPL_TPL_)FRAMEWORK) += spl.o
 obj-$(CONFIG_SPL_FRAMEWORK) += zimage.o
 obj-$(CONFIG_OF_LIBFDT) += bootm-fdt.o
 endif
+ifdef CONFIG_ARM64
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset-arm64.o
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy-arm64.o
+else
 obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset.o
 obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy.o
+endif
 obj-$(CONFIG_SEMIHOSTING) += semihosting.o

 obj-y	+= bdinfo.o
--- a/arch/arm/lib/asmdefs.h
+++ b/arch/arm/lib/asmdefs.h
@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Macros for asm code.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+#if defined(__aarch64__)
+
+/* Branch Target Identitication support.  */
+#define BTI_C		hint	34
+#define BTI_J		hint	36
+/* Return address signing support (pac-ret).  */
+#define PACIASP		hint	25; .cfi_window_save
+#define AUTIASP		hint	29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h.  */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note.  */
+#define GNU_PROPERTY(type, value)	\
+  .section .note.gnu.property, "a";	\
+  .p2align 3;				\
+  .word 4;				\
+  .word 16;				\
+  .word 5;				\
+  .asciz "GNU";				\
+  .word type;				\
+  .word 4;				\
+  .word value;				\
+  .word 0;				\
+  .text
+
+/* If set then the GNU Property Note section will be added to
+   mark objects to support BTI and PAC-RET.  */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files.  */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment)	\
+  .global name;		\
+  .type name,%function;	\
+  .align alignment;		\
+  name:			\
+  .cfi_startproc;	\
+  BTI_C;
+
+#else
+
+#define END_FILE
+
+#define ENTRY_ALIGN(name, alignment)	\
+  .global name;		\
+  .type name,%function;	\
+  .align alignment;		\
+  name:			\
+  .cfi_startproc;
+
+#endif
+
+#define ENTRY(name)	ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name)	\
+  .global name;		\
+  .type name,%function;	\
+  name:
+
+#define END(name)	\
+  .cfi_endproc;		\
+  .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+  /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n)  mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+  /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n)  mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+#endif
--- a/arch/arm/lib/memcpy-arm64.S
+++ b/arch/arm/lib/memcpy-arm64.S
@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_lw	w10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	x14
+#define E_h	x15
+#define F_l	x16
+#define F_h	x17
+#define G_l	count
+#define G_h	dst
+#define H_l	src
+#define H_h	srcend
+#define tmp1	x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (memmove)
+ENTRY (memcpy)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldp	A_l, A_h, [src]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes.  */
+L(copy16):
+	tbz	count, 3, L(copy8)
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+
+	.p2align 3
+	/* Copy 4-7 bytes.  */
+L(copy8):
+	tbz	count, 2, L(copy4)
+	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes.  */
+L(copy128):
+	ldp	E_l, E_h, [src, 32]
+	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	L(copy96)
+	ldp	G_l, G_h, [srcend, -64]
+	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+L(copy96):
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	/* Use backwards copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cbz	tmp1, L(copy0)
+	cmp	tmp1, count
+	b.lo	L(copy_long_backwards)
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+	ldp	D_l, D_h, [src]
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+	ldp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	L(copy64_from_start)
+
+L(loop64_backwards):
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	L(loop64_backwards)
+
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+	ret
+
+END (memcpy)
--- a/arch/arm/lib/memset-arm64.S
+++ b/arch/arm/lib/memset-arm64.S
@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include <asm/macro.h>
+#include "asmdefs.h"
+
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define zva_val	x5
+
+ENTRY (memset)
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+	/*
+	 * The optimized memset uses the dc opcode, which causes problems
+	 * when the cache is disabled. Let's check if the cache is disabled
+	 * and use a very simple memset implementation in this case. Otherwise
+	 * jump to the optimized version.
+	 */
+	switch_el x6, 3f, 2f, 1f
+3:	mrs	x6, sctlr_el3
+	b	0f
+2:	mrs	x6, sctlr_el2
+	b	0f
+1:	mrs	x6, sctlr_el1
+0:
+	tst	x6, #CR_C
+	bne	9f
+
+	/*
+	 * A very "simple" memset implementation without the use of the
+	 * dc opcode. Can be run with caches disabled.
+	 */
+	mov	x3, #0x0
+	cmp	count, x3	/* check for zero length */
+	beq	8f
+4:	strb	valw, [dstin, x3]
+	add	x3, x3, #0x1
+	cmp	count, x3
+	bne	4b
+8:	ret
+9:
+
+	/* Here the optimized memset version starts */
+	dup	v0.16B, valw
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	L(set_long)
+	cmp	count, 16
+	b.hs	L(set_medium)
+	mov	val, v0.D[0]
+
+	/* Set 0..15 bytes.  */
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+	.p2align 4
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+	ret
+2:	cbz	count, 3f
+	strb	valw, [dstin]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+3:	ret
+
+	/* Set 17..96 bytes.  */
+L(set_medium):
+	str	q0, [dstin]
+	tbnz	count, 6, L(set96)
+	str	q0, [dstend, -16]
+	tbz	count, 5, 1f
+	str	q0, [dstin, 16]
+	str	q0, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	str	q0, [dstin, 16]
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+L(set_long):
+	and	valw, valw, 255
+	bic	dst, dstin, 15
+	str	q0, [dstin]
+	cmp	count, 160
+	ccmp	valw, 0, 0, hs
+	b.ne	L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
+#endif
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
+	add	dst, dst, 64
+	dc	zva, dst
+	subs	count, count, 64
+	b.hi	L(zva_loop)
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+L(no_zva_loop):
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+END (memset)