u-boot/arch/arc/lib/strcmp.S

/*
 * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
 *
 * SPDX-License-Identifier:	GPL-2.0+
 */

/*
 * This is optimized primarily for the ARC700.
 * It would be possible to speed up the loops by one cycle / word
 * (respectively one cycle / byte) by forcing double source 1 alignment, unrolling
 * by a factor of two, and speculatively loading the second word / byte of
 * source 1; however, that would increase the overhead for loop setup / finish,
 * and strcmp might often terminate early.
 */

/*
 * strcmp - compare the strings addressed by r0 and r1.
 *
 * In:    r0, r1 = addresses of the two strings (used as load bases).
 * Out:   r0 = 0 if no difference was found up to the terminating zero
 *        byte, a value with bit 31 set if the first string's differing
 *        byte is lower (unsigned), otherwise a positive value.
 * Uses:  r2, r3, r4, r5, r12 and the status flags; r0/r1 are advanced
 *        while scanning.  Returns through blink.
 *
 * When both addresses are 4-byte aligned the strings are compared a word
 * at a time; otherwise the byte loop at .Lcharloop is used.
 */
.global strcmp
.align 4
strcmp:
	/* Is any of the low two address bits set in either pointer? */
	or	%r2, %r0, %r1
	bmsk_s	%r2, %r2, 1
	brne	%r2, 0, .Lcharloop	/* not both word aligned: go bytewise */
	mov_s	%r12, 0x01010101	/* 0x01 in every byte */
	ror	%r5, %r12		/* rotated right by one: 0x80808080 */
.Lwordloop:
	/* Fetch the next word of each string, post-incrementing by 4. */
	ld.ab	%r2, [%r0, 4]
	ld.ab	%r3, [%r1, 4]
	/* NOTE(review): purpose of this nop is not stated; presumably scheduling/alignment padding - confirm. */
	nop_s
	/*
	 * r4 = (r2 - 0x01010101) & ~r2 & 0x80808080, which is non-zero when
	 * r2 contains a zero byte (a 0x01 byte above a zero byte may be
	 * flagged as well).
	 */
	sub	%r4, %r2, %r12
	bic	%r4, %r4, %r2
	and	%r4, %r4, %r5
	brne	%r4, 0, .Lfound0	/* terminator somewhere in this word */
	breq	%r2 ,%r3, .Lwordloop	/* words identical: keep scanning */
	/* From here on: the words differ and r2 holds no zero byte. */
#ifdef	__LITTLE_ENDIAN__
	/*
	 * The earliest string byte is the least significant one here, so
	 * reduce both words to the lowest byte in which they differ (from
	 * its lowest differing bit upwards) before the unsigned compare.
	 */
	xor	%r0, %r2, %r3	/* mask for difference */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
#endif /* __LITTLE_ENDIAN__ */
	cmp_s	%r2, %r3
	mov_s	%r0, 1		/* result when r2 is not lower than r3 */
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31	/* delay slot: r2 lower -> set the sign bit */

/*
 * .Lfound0 - reached from the word loop when r4 flags a zero byte in r2.
 * On entry: r2/r3 = current words of the two strings, r4 = zero-byte
 * indicator (bit 7 of each flagged byte), r5 = 0x80808080.
 * Produces the final result in r0 and returns through blink.
 */
	.balign	4
#ifdef __LITTLE_ENDIAN__
.Lfound0:
	/*
	 * Combine the difference bits with the zero indicator and keep only
	 * the lowest set bit, so that whichever comes first in the string
	 * (a differing byte or the terminator) selects the byte to compare.
	 */
	xor	%r0, %r2, %r3	/* mask for difference */
	or	%r0, %r0, %r4	/* or in zero indicator */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
	sub.f	%r0, %r2, %r3	/* r0 = 0 when the masked bytes are equal */
	mov.hi	%r0, 1		/* r2 higher: positive result */
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31	/* delay slot: r2 lower -> set the sign bit */
#else /* __BIG_ENDIAN__ */
	/*
	 * The zero-detection above can mis-detect 0x01 bytes as zeroes
	 * because of carry propagation from a less significant zero byte.
	 * We can compensate for this by checking that bit0 is zero.
	 * This compensation is not necessary in the step where we
	 * get a low estimate for r2, because in any affected bytes
	 * we already have 0x00 or 0x01, which will remain unchanged
	 * when bit 7 is cleared.
	 */
	.balign	4
.Lfound0:
	lsr	%r0, %r4, 8	/* move each zero flag into the next lower byte */
	lsr_s	%r1, %r2	/* bit0 of each r2 byte -> bit7 of the byte below */
	bic_s	%r2, %r2, %r0	/* get low estimate for r2 and get ... */
	bic_s	%r0, %r0, %r1	/* <this is the adjusted mask for zeros> */
	or_s	%r3, %r3, %r0	/* ... high estimate r3 so that r2 > r3 will */
	cmp_s	%r3, %r2	/* ... be independent of trailing garbage */
	or_s	%r2, %r2, %r0	/* likewise for r3 > r2 */
	bic_s	%r3, %r3, %r0
	rlc	%r0, 0		/* r0 := r2 > r3 ? 1 : 0 */
	cmp_s	%r2, %r3
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31	/* delay slot: r2 lower -> set the sign bit */
#endif /* __LITTLE_ENDIAN__ */

/*
 * .Lcharloop - byte-at-a-time comparison, entered when the two addresses
 * are not both 4-byte aligned.  Returns r0 = (byte from the string at r0)
 * minus (byte from the string at r1) for the first position where the
 * bytes differ or the first string's byte is zero.
 */
	.balign	4
.Lcharloop:
	/* Load one byte from each string, post-incrementing by 1. */
	ldb.ab	%r2,[%r0,1]
	ldb.ab	%r3,[%r1,1]
	/* NOTE(review): purpose of this nop is not stated; presumably scheduling/alignment padding - confirm. */
	nop_s
	breq	%r2, 0, .Lcmpend	/* end of first string reached */
	breq	%r2, %r3, .Lcharloop	/* bytes equal: continue */
.Lcmpend:
	j_s.d	[%blink]
	sub	%r0, %r2, %r3	/* delay slot: result = byte difference */
arc: add library functions These are library functions used by ARC700 architecture. Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com> Cc: Vineet Gupta <vgupta@synopsys.com> Cc: Francois Bedard <fbedard@synopsys.com> Cc: Wolfgang Denk <wd@denx.de> Cc: Heiko Schocher <hs@denx.de> 2014-02-04 08:56:15 +00:00			`/*`
			`* Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.`
			`*`
			`* SPDX-License-Identifier: GPL-2.0+`
			`*/`

			`/*`
			`* This is optimized primarily for the ARC700.`
			`* It would be possible to speed up the loops by one cycle / word`
			`* respective one cycle / byte by forcing double source 1 alignment, unrolling`
			`* by a factor of two, and speculatively loading the second word / byte of`
			`* source 1; however, that would increase the overhead for loop setup / finish,`
			`* and strcmp might often terminate early.`
			`*/`

			`.global strcmp`
			`.align 4`
			`strcmp:`
			`or %r2, %r0, %r1`
			`bmsk_s %r2, %r2, 1`
			`brne %r2, 0, .Lcharloop`
			`mov_s %r12, 0x01010101`
			`ror %r5, %r12`
			`.Lwordloop:`
			`ld.ab %r2, [%r0, 4]`
			`ld.ab %r3, [%r1, 4]`
			`nop_s`
			`sub %r4, %r2, %r12`
			`bic %r4, %r4, %r2`
			`and %r4, %r4, %r5`
			`brne %r4, 0, .Lfound0`
			`breq %r2 ,%r3, .Lwordloop`
			`#ifdef __LITTLE_ENDIAN__`
			`xor %r0, %r2, %r3 /* mask for difference */`
			`sub_s %r1, %r0, 1`
			`bic_s %r0, %r0, %r1 /* mask for least significant difference bit */`
			`sub %r1, %r5, %r0`
			`xor %r0, %r5, %r1 /* mask for least significant difference byte */`
			`and_s %r2, %r2, %r0`
			`and_s %r3, %r3, %r0`
			`#endif /* _ENDIAN__ */`
			`cmp_s %r2, %r3`
			`mov_s %r0, 1`
			`j_s.d [%blink]`
			`bset.lo %r0, %r0, 31`

			`.balign 4`
			`#ifdef __LITTLE_ENDIAN__`
			`.Lfound0:`
			`xor %r0, %r2, %r3 /* mask for difference */`
			`or %r0, %r0, %r4 /* or in zero indicator */`
			`sub_s %r1, %r0, 1`
			`bic_s %r0, %r0, %r1 /* mask for least significant difference bit */`
			`sub %r1, %r5, %r0`
			`xor %r0, %r5, %r1 /* mask for least significant difference byte */`
			`and_s %r2, %r2, %r0`
			`and_s %r3, %r3, %r0`
			`sub.f %r0, %r2, %r3`
			`mov.hi %r0, 1`
			`j_s.d [%blink]`
			`bset.lo %r0, %r0, 31`
			`#else /* __BIG_ENDIAN__ */`
			`/*`
			`* The zero-detection above can mis-detect 0x01 bytes as zeroes`
			`* because of carry-propagateion from a lower significant zero byte.`
			`* We can compensate for this by checking that bit0 is zero.`
			`* This compensation is not necessary in the step where we`
			`* get a low estimate for r2, because in any affected bytes`
			`* we already have 0x00 or 0x01, which will remain unchanged`
			`* when bit 7 is cleared.`
			`*/`
			`.balign 4`
			`.Lfound0:`
			`lsr %r0, %r4, 8`
			`lsr_s %r1, %r2`
			`bic_s %r2, %r2, %r0 /* get low estimate for r2 and get ... */`
			`bic_s %r0, %r0, %r1 /* <this is the adjusted mask for zeros> */`
			`or_s %r3, %r3, %r0 /* ... high estimate r3 so that r2 > r3 will */`
			`cmp_s %r3, %r2 /* ... be independent of trailing garbage */`
			`or_s %r2, %r2, %r0 /* likewise for r3 > r2 */`
			`bic_s %r3, %r3, %r0`
			`rlc %r0, 0 /* r0 := r2 > r3 ? 1 : 0 */`
			`cmp_s %r2, %r3`
			`j_s.d [%blink]`
			`bset.lo %r0, %r0, 31`
			`#endif /* _ENDIAN__ */`

			`.balign 4`
			`.Lcharloop:`
			`ldb.ab %r2,[%r0,1]`
			`ldb.ab %r3,[%r1,1]`
			`nop_s`
			`breq %r2, 0, .Lcmpend`
			`breq %r2, %r3, .Lcharloop`
			`.Lcmpend:`
			`j_s.d [%blink]`
			`sub %r0, %r2, %r3`