riscv: assembler versions of memcpy, memmove, memset

Provide optimized versions of memcpy(), memmove(), memset() copied from
the Linux kernel.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
Reviewed-by: Leo Yu-Chi Liang <ycliang@andestech.com>
commit 8f0dc4cfd1 (parent f709a0b6f9)
Author:    Heinrich Schuchardt, 2021-03-27 12:37:04 +01:00
Committer: Leo Yu-Chi Liang
6 changed files with 383 additions and 22 deletions

arch/riscv/Kconfig

@@ -271,4 +271,82 @@ config STACK_SIZE_SHIFT

config OF_BOARD_FIXUP
	default y if OF_SEPARATE && RISCV_SMODE

config USE_ARCH_MEMCPY
	bool "Use an assembly optimized implementation of memcpy"
	default y
	help
	  Enable the generation of an optimized version of memcpy.
	  Such an implementation may be faster under some conditions
	  but may increase the binary size.

config SPL_USE_ARCH_MEMCPY
	bool "Use an assembly optimized implementation of memcpy for SPL"
	default y if USE_ARCH_MEMCPY
	depends on SPL
	help
	  Enable the generation of an optimized version of memcpy.
	  Such an implementation may be faster under some conditions
	  but may increase the binary size.

config TPL_USE_ARCH_MEMCPY
	bool "Use an assembly optimized implementation of memcpy for TPL"
	default y if USE_ARCH_MEMCPY
	depends on TPL
	help
	  Enable the generation of an optimized version of memcpy.
	  Such an implementation may be faster under some conditions
	  but may increase the binary size.

config USE_ARCH_MEMMOVE
	bool "Use an assembly optimized implementation of memmove"
	default y
	help
	  Enable the generation of an optimized version of memmove.
	  Such an implementation may be faster under some conditions
	  but may increase the binary size.

config SPL_USE_ARCH_MEMMOVE
	bool "Use an assembly optimized implementation of memmove for SPL"
	default y if USE_ARCH_MEMMOVE
	depends on SPL
	help
	  Enable the generation of an optimized version of memmove.
	  Such an implementation may be faster under some conditions
	  but may increase the binary size.

config TPL_USE_ARCH_MEMMOVE
	bool "Use an assembly optimized implementation of memmove for TPL"
	default y if USE_ARCH_MEMMOVE
	depends on TPL
	help
	  Enable the generation of an optimized version of memmove.
	  Such an implementation may be faster under some conditions
	  but may increase the binary size.

config USE_ARCH_MEMSET
	bool "Use an assembly optimized implementation of memset"
	default y
	help
	  Enable the generation of an optimized version of memset.
	  Such an implementation may be faster under some conditions
	  but may increase the binary size.

config SPL_USE_ARCH_MEMSET
	bool "Use an assembly optimized implementation of memset for SPL"
	default y if USE_ARCH_MEMSET
	depends on SPL
	help
	  Enable the generation of an optimized version of memset.
	  Such an implementation may be faster under some conditions
	  but may increase the binary size.

config TPL_USE_ARCH_MEMSET
	bool "Use an assembly optimized implementation of memset for TPL"
	default y if USE_ARCH_MEMSET
	depends on TPL
	help
	  Enable the generation of an optimized version of memset.
	  Such an implementation may be faster under some conditions
	  but may increase the binary size.

endmenu
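
Note: the three variants per function follow U-Boot's usual proper/SPL/TPL split,
and the header below consumes them via CONFIG_IS_ENABLED(). A rough
preprocessor-level sketch of that selection, assuming the standard kconfig.h
semantics (not code from this patch):

/*
 * CONFIG_IS_ENABLED(USE_ARCH_MEMCPY) resolves per build stage to:
 *   TPL build:   CONFIG_TPL_USE_ARCH_MEMCPY
 *   SPL build:   CONFIG_SPL_USE_ARCH_MEMCPY
 *   main U-Boot: CONFIG_USE_ARCH_MEMCPY
 */
#if CONFIG_IS_ENABLED(USE_ARCH_MEMCPY)
/* the assembly memcpy from arch/riscv/lib/memcpy.S is linked in */
#else
/* the generic C memcpy from lib/string.c is used instead */
#endif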

arch/riscv/include/asm/string.h

@@ -19,31 +19,25 @@
#undef __HAVE_ARCH_STRRCHR
#undef __HAVE_ARCH_STRCHR
#undef __HAVE_ARCH_MEMCPY
#undef __HAVE_ARCH_MEMMOVE
#undef __HAVE_ARCH_MEMCHR
#undef __HAVE_ARCH_MEMZERO
#undef __HAVE_ARCH_MEMSET
#ifdef CONFIG_MARCO_MEMSET
#define memset(_p, _v, _n) \
(typeof(_p) (p) = (_p); \
typeof(_v) (v) = (_v); \
typeof(_n) (n) = (_n); \
{ \
if ((n) != 0) { \
if (__builtin_constant_p((v)) && (v) == 0) \
__memzero((p), (n)); \
else \
memset((p), (v), (n)); \
} \
(p); \
})
#define memzero(_p, _n) \
(typeof(_p) (p) = (_p); \
typeof(_n) (n) = (_n); \
{ if ((n) != 0) __memzero((p), (n)); (p); })
#undef __HAVE_ARCH_MEMCPY
#if CONFIG_IS_ENABLED(USE_ARCH_MEMCPY)
#define __HAVE_ARCH_MEMCPY
#endif
extern void *memcpy(void *, const void *, __kernel_size_t);
#undef __HAVE_ARCH_MEMMOVE
#if CONFIG_IS_ENABLED(USE_ARCH_MEMMOVE)
#define __HAVE_ARCH_MEMMOVE
#endif
extern void *memmove(void *, const void *, __kernel_size_t);
#undef __HAVE_ARCH_MEMZERO
#if CONFIG_IS_ENABLED(USE_ARCH_MEMSET)
#define __HAVE_ARCH_MEMSET
#endif
extern void *memset(void *, int, __kernel_size_t);
#endif /* __ASM_RISCV_STRING_H */
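
The __HAVE_ARCH_* defines matter because U-Boot's generic lib/string.c only
compiles its C fallbacks when no architecture-specific routine was announced.
A simplified sketch of that consumer side (not the exact lib/string.c source):

#include <stddef.h>

#ifndef __HAVE_ARCH_MEMCPY
/* generic byte-wise fallback, built only when no assembly memcpy is provided */
void *memcpy(void *dest, const void *src, size_t count)
{
	char *d = dest;
	const char *s = src;

	while (count--)
		*d++ = *s++;
	return dest;
}
#endif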

arch/riscv/lib/Makefile

@@ -36,3 +36,7 @@ CFLAGS_REMOVE_$(EFI_RELOC) := $(CFLAGS_NON_EFI)
extra-$(CONFIG_CMD_BOOTEFI_HELLO_COMPILE) += $(EFI_CRT0) $(EFI_RELOC)
extra-$(CONFIG_CMD_BOOTEFI_SELFTEST) += $(EFI_CRT0) $(EFI_RELOC)
extra-$(CONFIG_EFI) += $(EFI_CRT0) $(EFI_RELOC)
obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset.o
obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMMOVE) += memmove.o
obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy.o
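
These objects provide memcpy(), memmove() and memset() for the stage being
built. The .S files below export each routine with ENTRY()/WEAK(), which at
the C level corresponds roughly to a strong "__"-prefixed symbol plus a weak
standard name aliased to it, so another definition could still override the
standard name. A sketch of that idea (illustrative only, not code from this
patch):

#include <stddef.h>

/* stand-in for the assembly implementation (byte loop only in this sketch) */
void *__memcpy(void *dest, const void *src, size_t count)
{
	char *d = dest;
	const char *s = src;

	while (count--)
		*d++ = *s++;
	return dest;
}

/* weak alias on the same code: a stronger memcpy() elsewhere wins at link time */
void *memcpy(void *dest, const void *src, size_t count)
	__attribute__((weak, alias("__memcpy")));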

arch/riscv/lib/memcpy.S (new file)

@@ -0,0 +1,108 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2013 Regents of the University of California
*/
#include <linux/linkage.h>
#include <asm/asm.h>
/* void *memcpy(void *, const void *, size_t) */
ENTRY(__memcpy)
WEAK(memcpy)
move t6, a0 /* Preserve return value */
/* Defer to byte-oriented copy for small sizes */
sltiu a3, a2, 128
bnez a3, 4f
/* Use word-oriented copy only if low-order bits match */
andi a3, t6, SZREG-1
andi a4, a1, SZREG-1
bne a3, a4, 4f
beqz a3, 2f /* Skip if already aligned */
/*
* Round to nearest double word-aligned address
* greater than or equal to start address
*/
andi a3, a1, ~(SZREG-1)
addi a3, a3, SZREG
/* Handle initial misalignment */
sub a4, a3, a1
1:
lb a5, 0(a1)
addi a1, a1, 1
sb a5, 0(t6)
addi t6, t6, 1
bltu a1, a3, 1b
sub a2, a2, a4 /* Update count */
2:
andi a4, a2, ~((16*SZREG)-1)
beqz a4, 4f
add a3, a1, a4
3:
REG_L a4, 0(a1)
REG_L a5, SZREG(a1)
REG_L a6, 2*SZREG(a1)
REG_L a7, 3*SZREG(a1)
REG_L t0, 4*SZREG(a1)
REG_L t1, 5*SZREG(a1)
REG_L t2, 6*SZREG(a1)
REG_L t3, 7*SZREG(a1)
REG_L t4, 8*SZREG(a1)
REG_L t5, 9*SZREG(a1)
REG_S a4, 0(t6)
REG_S a5, SZREG(t6)
REG_S a6, 2*SZREG(t6)
REG_S a7, 3*SZREG(t6)
REG_S t0, 4*SZREG(t6)
REG_S t1, 5*SZREG(t6)
REG_S t2, 6*SZREG(t6)
REG_S t3, 7*SZREG(t6)
REG_S t4, 8*SZREG(t6)
REG_S t5, 9*SZREG(t6)
REG_L a4, 10*SZREG(a1)
REG_L a5, 11*SZREG(a1)
REG_L a6, 12*SZREG(a1)
REG_L a7, 13*SZREG(a1)
REG_L t0, 14*SZREG(a1)
REG_L t1, 15*SZREG(a1)
addi a1, a1, 16*SZREG
REG_S a4, 10*SZREG(t6)
REG_S a5, 11*SZREG(t6)
REG_S a6, 12*SZREG(t6)
REG_S a7, 13*SZREG(t6)
REG_S t0, 14*SZREG(t6)
REG_S t1, 15*SZREG(t6)
addi t6, t6, 16*SZREG
bltu a1, a3, 3b
andi a2, a2, (16*SZREG)-1 /* Update count */
4:
/* Handle trailing misalignment */
beqz a2, 6f
add a3, a1, a2
/* Use word-oriented copy if co-aligned to word boundary */
or a5, a1, t6
or a5, a5, a3
andi a5, a5, 3
bnez a5, 5f
7:
lw a4, 0(a1)
addi a1, a1, 4
sw a4, 0(t6)
addi t6, t6, 4
bltu a1, a3, 7b
ret
5:
lb a4, 0(a1)
addi a1, a1, 1
sb a4, 0(t6)
addi t6, t6, 1
bltu a1, a3, 5b
6:
ret
END(__memcpy)
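
For orientation, the copy strategy above in plain C (a sketch under an
illustrative name, not the code that gets assembled; SZREG is 4 bytes on RV32
and 8 on RV64, spelled sizeof(long) here, and the assembly additionally copies
a 32-bit co-aligned tail with lw/sw before falling back to bytes):

#include <stddef.h>
#include <stdint.h>

void *memcpy_sketch(void *dest, const void *src, size_t count)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	/* large copies with equally misaligned endpoints take the fast path */
	if (count >= 128 &&
	    ((uintptr_t)d & (sizeof(long) - 1)) ==
	    ((uintptr_t)s & (sizeof(long) - 1))) {
		/* byte-copy the head up to a register boundary */
		while ((uintptr_t)s & (sizeof(long) - 1)) {
			*d++ = *s++;
			count--;
		}
		/* bulk copy, 16 register-sized words per iteration */
		while (count >= 16 * sizeof(long)) {
			for (int i = 0; i < 16; i++) {
				*(long *)d = *(const long *)s;
				d += sizeof(long);
				s += sizeof(long);
			}
			count -= 16 * sizeof(long);
		}
	}
	/* tail (and the whole buffer on the slow path) byte by byte */
	while (count--)
		*d++ = *s++;
	return dest;
}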

arch/riscv/lib/memmove.S (new file)

@@ -0,0 +1,64 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
ENTRY(__memmove)
WEAK(memmove)
move t0, a0
move t1, a1
beq a0, a1, exit_memcpy
beqz a2, exit_memcpy
srli t2, a2, 0x2
slt t3, a0, a1
beqz t3, do_reverse
andi a2, a2, 0x3
li t4, 1
beqz t2, byte_copy
word_copy:
lw t3, 0(a1)
addi t2, t2, -1
addi a1, a1, 4
sw t3, 0(a0)
addi a0, a0, 4
bnez t2, word_copy
beqz a2, exit_memcpy
j byte_copy
do_reverse:
add a0, a0, a2
add a1, a1, a2
andi a2, a2, 0x3
li t4, -1
beqz t2, reverse_byte_copy
reverse_word_copy:
addi a1, a1, -4
addi t2, t2, -1
lw t3, 0(a1)
addi a0, a0, -4
sw t3, 0(a0)
bnez t2, reverse_word_copy
beqz a2, exit_memcpy
reverse_byte_copy:
addi a0, a0, -1
addi a1, a1, -1
byte_copy:
lb t3, 0(a1)
addi a2, a2, -1
sb t3, 0(a0)
add a1, a1, t4
add a0, a0, t4
bnez a2, byte_copy
exit_memcpy:
move a0, t0
move a1, t1
ret
END(__memmove)
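
The direction handling above, in C for reference (again only a sketch under an
illustrative name; the assembly moves 32-bit words for the bulk and single
bytes for the remainder):

#include <stddef.h>

void *memmove_sketch(void *dest, const void *src, size_t count)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	if (d == s || count == 0)
		return dest;

	if (d < s) {
		/* destination below source: copy forward */
		while (count--)
			*d++ = *s++;
	} else {
		/* destination above source: copy backward to survive overlap */
		d += count;
		s += count;
		while (count--)
			*--d = *--s;
	}
	return dest;
}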

arch/riscv/lib/memset.S (new file)

@@ -0,0 +1,113 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2013 Regents of the University of California
*/
#include <linux/linkage.h>
#include <asm/asm.h>
/* void *memset(void *, int, size_t) */
ENTRY(__memset)
WEAK(memset)
move t0, a0 /* Preserve return value */
/* Defer to byte-oriented fill for small sizes */
sltiu a3, a2, 16
bnez a3, 4f
/*
* Round to nearest XLEN-aligned address
* greater than or equal to start address
*/
addi a3, t0, SZREG-1
andi a3, a3, ~(SZREG-1)
beq a3, t0, 2f /* Skip if already aligned */
/* Handle initial misalignment */
sub a4, a3, t0
1:
sb a1, 0(t0)
addi t0, t0, 1
bltu t0, a3, 1b
sub a2, a2, a4 /* Update count */
2: /* Duff's device with 32 XLEN stores per iteration */
/* Broadcast value into all bytes */
andi a1, a1, 0xff
slli a3, a1, 8
or a1, a3, a1
slli a3, a1, 16
or a1, a3, a1
#ifdef CONFIG_64BIT
slli a3, a1, 32
or a1, a3, a1
#endif
/* Calculate end address */
andi a4, a2, ~(SZREG-1)
add a3, t0, a4
andi a4, a4, 31*SZREG /* Calculate remainder */
beqz a4, 3f /* Shortcut if no remainder */
neg a4, a4
addi a4, a4, 32*SZREG /* Calculate initial offset */
/* Adjust start address with offset */
sub t0, t0, a4
/* Jump into loop body */
/* Assumes 32-bit instruction lengths */
la a5, 3f
#ifdef CONFIG_64BIT
srli a4, a4, 1
#endif
add a5, a5, a4
jr a5
3:
REG_S a1, 0(t0)
REG_S a1, SZREG(t0)
REG_S a1, 2*SZREG(t0)
REG_S a1, 3*SZREG(t0)
REG_S a1, 4*SZREG(t0)
REG_S a1, 5*SZREG(t0)
REG_S a1, 6*SZREG(t0)
REG_S a1, 7*SZREG(t0)
REG_S a1, 8*SZREG(t0)
REG_S a1, 9*SZREG(t0)
REG_S a1, 10*SZREG(t0)
REG_S a1, 11*SZREG(t0)
REG_S a1, 12*SZREG(t0)
REG_S a1, 13*SZREG(t0)
REG_S a1, 14*SZREG(t0)
REG_S a1, 15*SZREG(t0)
REG_S a1, 16*SZREG(t0)
REG_S a1, 17*SZREG(t0)
REG_S a1, 18*SZREG(t0)
REG_S a1, 19*SZREG(t0)
REG_S a1, 20*SZREG(t0)
REG_S a1, 21*SZREG(t0)
REG_S a1, 22*SZREG(t0)
REG_S a1, 23*SZREG(t0)
REG_S a1, 24*SZREG(t0)
REG_S a1, 25*SZREG(t0)
REG_S a1, 26*SZREG(t0)
REG_S a1, 27*SZREG(t0)
REG_S a1, 28*SZREG(t0)
REG_S a1, 29*SZREG(t0)
REG_S a1, 30*SZREG(t0)
REG_S a1, 31*SZREG(t0)
addi t0, t0, 32*SZREG
bltu t0, a3, 3b
andi a2, a2, SZREG-1 /* Update count */
4:
/* Handle trailing misalignment */
beqz a2, 6f
add a3, t0, a2
5:
sb a1, 0(t0)
addi t0, t0, 1
bltu t0, a3, 5b
6:
ret
END(__memset)
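
And the fill strategy of memset in C (a sketch under an illustrative name; the
assembly replaces the plain bulk loop below with a computed jump into the
middle of its 32-store block, Duff's-device style, so short remainders skip
the stores they do not need):

#include <stddef.h>
#include <stdint.h>

void *memset_sketch(void *dest, int c, size_t count)
{
	unsigned char *d = dest;
	unsigned long pattern = (unsigned char)c;

	/* broadcast the byte value into a register-wide pattern */
	pattern |= pattern << 8;
	pattern |= pattern << 16;
	if (sizeof(pattern) > 4)
		pattern |= pattern << 16 << 16;	/* upper half on 64-bit */

	if (count >= 16) {
		/* byte-fill the head up to a register boundary */
		while ((uintptr_t)d & (sizeof(long) - 1)) {
			*d++ = (unsigned char)c;
			count--;
		}
		/* bulk register-sized stores */
		while (count >= sizeof(long)) {
			*(unsigned long *)d = pattern;
			d += sizeof(long);
			count -= sizeof(long);
		}
	}
	/* trailing bytes */
	while (count--)
		*d++ = (unsigned char)c;
	return dest;
}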