armv8m/strcpy: add arch-optimized version

wangyongrong 2023-09-20 16:27:27 +08:00 committed by Xiang Xiao
parent abfb7da553
commit 2d817deecb
3 changed files with 319 additions and 0 deletions


@ -14,6 +14,7 @@ config ARMV8M_STRING_FUNCTION
select ARMV8M_MEMSET
select ARMV8M_MEMMOVE
select ARMV8M_STRCMP
select ARMV8M_STRCPY
select ARMV8M_STRLEN

config ARMV8M_MEMCHR
@ -56,6 +57,14 @@ config ARMV8M_STRCMP
---help---
Enable optimized ARMv8-M specific strcmp() library function

config ARMV8M_STRCPY
bool "Enable optimized strcpy() for ARMv8-M"
default n
select LIBC_ARCH_STRCPY
depends on ARCH_TOOLCHAIN_GNU
---help---
Enable optimized ARMv8-M specific strcpy() library function

config ARMV8M_STRLEN
bool "Enable optimized strlen() for ARMv8-M"
default n
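
With the options above, a board defconfig can enable the optimized
routine either through the umbrella option or individually (a minimal
sketch; the option names are exactly those added above, and
ARMV8M_STRCPY additionally requires ARCH_TOOLCHAIN_GNU):

CONFIG_ARMV8M_STRING_FUNCTION=y   # selects ARMV8M_STRCPY among others

or

CONFIG_ARMV8M_STRCPY=y            # selects LIBC_ARCH_STRCPY automatically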


@ -42,6 +42,10 @@ ifeq ($(CONFIG_ARMV8M_STRCMP),y)
ASRCS += arch_strcmp.S
endif

ifeq ($(CONFIG_ARMV8M_STRCPY),y)
ASRCS += arch_strcpy.S
endif

ifeq ($(CONFIG_ARMV8M_STRLEN),y)
ASRCS += arch_strlen.S
endif


@ -0,0 +1,306 @@
/***************************************************************************
* libs/libc/machine/arm/armv8-m/gnu/arch_strcpy.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
***************************************************************************/
/* This strcpy borrows some ideas from arch_strcmp.S. */
/* Parameters and result. */
#define dst r0
#define src r1
#define result r0
/* Internal variables, or callee saved registers */
#define tmp1 r4
#define tmp2 r5
#define tmp3 r6
#define src_offset r7
#ifdef __ARM_BIG_ENDIAN
# define MASK_0 0xff000000
# define MASK_1 0xff0000
# define MASK_2 0xff00
# define MASK_3 0xff
# define BYTE_0_SHIFT 24
# define BYTE_1_SHIFT 16
# define BYTE_2_SHIFT 8
# define BYTE_3_SHIFT 0
#else
# define MASK_0 0xff
# define MASK_1 0xff00
# define MASK_2 0xff0000
# define MASK_3 0xff000000
# define BYTE_0_SHIFT 0
# define BYTE_1_SHIFT 8
# define BYTE_2_SHIFT 16
# define BYTE_3_SHIFT 24
#endif
.syntax unified
.text
.align 2
.global strcpy
.thumb
.type strcpy, %function
strcpy:
push {result, tmp1, tmp2, tmp3, src_offset}
eor tmp1, dst, src
tst tmp1, #3
/* If dst and src are not at the same byte offset from a word boundary */
bne .Lstrs_diff_offset
/* dst and src share the same byte offset; get it */
ands tmp1, src, #3
beq .Ldst_src_aligned
/* get the number of bytes until src is aligned */
rsb tmp1, #4
.Lbyte_copy_until_dst_src_aligned:
ldrb tmp2, [src], #1
cmp tmp2, #0
beq .Lcopy_done
strb tmp2, [dst], #1
subs tmp1, #1
bne .Lbyte_copy_until_dst_src_aligned
.Ldst_src_aligned:
/* Now dst and src are aligned */
ldr tmp1, [src], #4
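/* Zero-byte detection: (x - 0x01010101) & ~x & 0x80808080 is
 * non-zero iff x contains a 0x00 byte; the subtraction borrows
 * through a zero byte and leaves its 0x80 bit set after masking.
 */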
sub tmp2, tmp1, #0x01010101
bic tmp2, tmp1
tst tmp2, #0x80808080
/* An all-zero result means no zero byte was detected */
it eq
streq tmp1, [dst], #4
beq .Ldst_src_aligned
/* There is a zero in the word, copy until zero */
sub src, #4
.Lbyte_copy_until_zero:
ldrb tmp2, [src], #1
cmp tmp2, #0
beq .Lcopy_done
strb tmp2, [dst], #1
b .Lbyte_copy_until_zero
/* Make dst aligned, so we won't write anything before dst.
* If we attempt to write before dst, atomic read-write must
* be ensured. Atomic operation complicates things.
* So the solution here is byte by byte copy until dst aligned.
*/
.Lstrs_diff_offset:
ands tmp1, dst, #3
beq .Ldiff_offset_loop_begin
/* get the number of bytes until dst is aligned */
rsb tmp1, #4
.Lbyte_copy_until_dst_aligned:
ldrb tmp2, [src], #1
cmp tmp2, #0
beq .Lcopy_done
strb tmp2, [dst], #1
subs tmp1, #1
bne .Lbyte_copy_until_dst_aligned
.Ldiff_offset_loop_begin:
/* src_offset cannot be 0 here: dst is now aligned and the
 * dst/src offsets differ
 */
and src_offset, src, #3
lsls src_offset, #3
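/* src_offset is now the byte offset expressed in bits: 8, 16 or 24 */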
bic src, #3
/* First word logic:
 * prepend 0xff bytes to make the algorithm simpler;
 * only the first word needs to be prepended.
 */
ldr tmp1, [src], #4
mov tmp2, #0xffffffff
rsb tmp3, src_offset, #32
#ifdef __ARM_BIG_ENDIAN
lsls tmp2, tmp3
#else
lsrs tmp2, tmp3
#endif
orr tmp1, tmp1, tmp2
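/* e.g. little-endian, src_offset == 8: tmp2 = 0xffffffff >> 24
 * = 0x000000ff, so the byte that precedes the actual string is
 * forced to 0xff and cannot be mistaken for a terminator.
 */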
/* Test if the first word contains zero */
sub tmp3, tmp1, #0x01010101
bic tmp3, tmp1
tst tmp3, #0x80808080
/* A non-zero result means a zero byte was detected */
bne .Ltail_copy
/* before loop, set tmp2=tmp1 to simplify the logic in the loop */
mov tmp2, tmp1
.Ldiff_offset_loop:
mov tmp1, tmp2
ldr tmp2, [src], #4
/* Test if contains zero */
sub tmp3, tmp2, #0x01010101
bic tmp3, tmp2
tst tmp3, #0x80808080
/* A non-zero result means a zero byte was detected */
bne .Ltail_copy
/* Now let's fill dst */
#ifdef __ARM_BIG_ENDIAN
lsls tmp1, src_offset
rsb tmp3, src_offset, #32
lsrs tmp3, tmp2, tmp3
orr tmp1, tmp1, tmp3
#else
lsrs tmp1, src_offset
rsb tmp3, src_offset, #32
lsls tmp3, tmp2, tmp3
orr tmp1, tmp1, tmp3
#endif
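/* e.g. little-endian, src_offset == 8: tmp1 >> 8 drops the byte
 * already written (or the 0xff pad), tmp2 << 24 supplies the next
 * byte, and the orr yields one aligned word of string data.
 */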
str tmp1, [dst], #4
b .Ldiff_offset_loop
.Ltail_copy:
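/* tmp1 holds the word whose trailing bytes have not been stored
 * yet; tmp2 holds the following word. The cases below fall
 * through, copying bytes one by one until the zero byte.
 */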
cmp src_offset, #24
beq .Loffset_3
cmp src_offset, #16
beq .Loffset_2
/* src_offset == 8 here */
ands tmp3, tmp1, MASK_1
beq .Lcopy_done
lsrs tmp3, BYTE_1_SHIFT
strb tmp3, [dst], #1
.Loffset_2:
ands tmp3, tmp1, MASK_2
beq .Lcopy_done
lsrs tmp3, BYTE_2_SHIFT
strb tmp3, [dst], #1
.Loffset_3:
ands tmp3, tmp1, MASK_3
beq .Lcopy_done
lsrs tmp3, BYTE_3_SHIFT
strb tmp3, [dst], #1
ands tmp3, tmp2, MASK_0
beq .Lcopy_done
lsrs tmp3, BYTE_0_SHIFT
strb tmp3, [dst], #1
ands tmp3, tmp2, MASK_1
beq .Lcopy_done
lsrs tmp3, BYTE_1_SHIFT
strb tmp3, [dst], #1
ands tmp3, tmp2, MASK_2
beq .Lcopy_done
lsrs tmp3, BYTE_2_SHIFT
strb tmp3, [dst], #1
.Lcopy_done:
mov tmp3, #0
strb tmp3, [dst]
pop {result, tmp1, tmp2, tmp3, src_offset}
bx lr
#if 0
/* Pseudo code of strcpy when dst and src are not at the same byte offset */
/* Make dst aligned, so we won't write anything before dst.
* If we attempt to write before dst, atomic read-write must
* be ensured. Atomic operation complicates things.
* So the solution here is byte by byte copy until dst aligned.
*/
if ((dst & 3) == 0)
goto .diff_offset_loop_begin;
ByteCopyUntilDstAligned();
.diff_offset_loop_begin:
/* src_offset mustn't be 0 here */
src_offset = src & 3;
src_offset = src_offset * 8; /* byte offset in bits: 8, 16 or 24 */
src = src & 0xfffffffc;
tmp1 = *src;
src += 4;
/* first word logic
* prepend 0xff to make the algorithm simpler
* only the first word needs to be prepended
*/
if (src_offset != 0)
{
tmp2 = 0xffffffff;
#if big endian
tmp2 = tmp2 << (32 - src_offset);
#else
tmp2 = tmp2 >> (32 - src_offset);
#endif
tmp1 |= tmp2;
}
if (HasZeroByte(tmp1))
{
goto .tail_copy;
}
/* before loop, set tmp2=tmp1 to simplify the logic in the loop */
tmp2 = tmp1;
.diff_offset_loop:
tmp1 = tmp2;
tmp2 = *src;
src += 4;
/* double word tail means we have to copy from tmp1 and tmp2 to dst */
if (HasZeroByte(tmp2))
{
goto .tail_copy;
}
/* Now let's fill dst */
#if big endian
tmp1 = tmp1 << (src_offset);
tmp1 |= tmp2 >> (32 - src_offset);
*dst = tmp1;
#else
tmp1 = tmp1 >> (src_offset);
tmp1 |= tmp2 << (32 - src_offset);
*dst = tmp1;
#endif
dst += 4;
goto .diff_offset_loop;
/* byte by byte copy at the tail */
.tail_copy:
if (src_offset == 24)
goto .offset_3;
if (src_offset == 16)
goto .offset_2;
/* src_offset mustn't be 0 here */
/* default: src_offset == 8 */
if ((tmp1 & MASK_1) == 0)
goto .cpy_done;
*dst++ = (tmp1 & MASK_1) >> BYTE_1_SHIFT;
.offset_2:
if ((tmp1 & MASK_2) == 0)
goto .cpy_done;
*dst++ = (tmp1 & MASK_2) >> BYTE_2_SHIFT;
.offset_3:
if ((tmp1 & MASK_3) == 0)
goto .cpy_done;
*dst++ = (tmp1 & MASK_3) >> BYTE_3_SHIFT;
if ((tmp2 & MASK_0) == 0)
goto .cpy_done;
*dst++ = (tmp2 & MASK_0) >> BYTE_0_SHIFT;
if ((tmp2 & MASK_1) == 0)
goto .cpy_done;
*dst++ = (tmp2 & MASK_1) >> BYTE_1_SHIFT;
if ((tmp2 & MASK_2) == 0)
goto .cpy_done;
*dst++ = (tmp2 & MASK_2) >> BYTE_2_SHIFT;
/* tmp2 byte 3 must be zero here */
.cpy_done:
*dst = 0;
#endif /* Pseudo code end */
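
A minimal host-side sketch for sanity-checking the copy logic
(illustrative only: it assumes the routine above is linked in as
strcpy() and that a hosted C environment is available). It walks every
src/dst byte-offset pair, exercising the aligned, same-offset and
different-offset paths:

#include <assert.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
  static const char ref[] = "abcdefghijklmnopqrstuvwxyz";
  char src[64];
  char dst[64];

  for (int s = 0; s < 4; s++)
    {
      for (int d = 0; d < 4; d++)
        {
          for (size_t len = 0; len <= strlen(ref); len++)
            {
              /* Place a string of the given length at offset s */

              memset(src, 'x', sizeof(src));
              memcpy(src + s, ref, len);
              src[s + len] = '\0';

              /* Copy to offset d, then verify return value and content */

              memset(dst, 'y', sizeof(dst));
              assert(strcpy(dst + d, src + s) == dst + d);
              assert(strcmp(dst + d, src + s) == 0);
            }
        }
    }

  puts("strcpy offset tests passed");
  return 0;
}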