armv8m/strcpy: add arch optimize version

parent abfb7da553
commit 2d817deecb
@@ -14,6 +14,7 @@ config ARMV8M_STRING_FUNCTION
	select ARMV8M_MEMSET
	select ARMV8M_MEMMOVE
	select ARMV8M_STRCMP
	select ARMV8M_STRCPY
	select ARMV8M_STRLEN

config ARMV8M_MEMCHR
@@ -56,6 +57,14 @@ config ARMV8M_STRCMP
	---help---
		Enable optimized ARMv8-M specific strcmp() library function

config ARMV8M_STRCPY
	bool "Enable optimized strcpy() for ARMv8-M"
	default n
	select LIBC_ARCH_STRCPY
	depends on ARCH_TOOLCHAIN_GNU
	---help---
		Enable optimized ARMv8-M specific strcpy() library function

config ARMV8M_STRLEN
	bool "Enable optimized strlen() for ARMv8-M"
	default n
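
For reference, a board configuration can enable the new routine either directly or through the umbrella option above. A minimal defconfig fragment (using only option names that appear in this diff, and assuming ARCH_TOOLCHAIN_GNU is satisfied by the build) would be:

CONFIG_ARMV8M_STRCPY=y

or, to pull in the whole optimized string set at once:

CONFIG_ARMV8M_STRING_FUNCTION=y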

@@ -42,6 +42,10 @@ ifeq ($(CONFIG_ARMV8M_STRCMP),y)
ASRCS += arch_strcmp.S
endif

ifeq ($(CONFIG_ARMV8M_STRCPY),y)
ASRCS += arch_strcpy.S
endif

ifeq ($(CONFIG_ARMV8M_STRLEN),y)
ASRCS += arch_strlen.S
endif

@@ -0,0 +1,306 @@
/***************************************************************************
 * libs/libc/machine/arm/armv8-m/gnu/arch_strcpy.S
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.  The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ***************************************************************************/

/* This strcpy implementation borrows some ideas from arch_strcmp.S. */

/* Parameters and result. */

#define dst r0
#define src r1
#define result r0

/* Internal variables (callee-saved registers) */

#define tmp1 r4
#define tmp2 r5
#define tmp3 r6
#define src_offset r7

#ifdef __ARM_BIG_ENDIAN
# define MASK_0       0xff000000
# define MASK_1       0xff0000
# define MASK_2       0xff00
# define MASK_3       0xff
# define BYTE_0_SHIFT 24
# define BYTE_1_SHIFT 16
# define BYTE_2_SHIFT 8
# define BYTE_3_SHIFT 0
#else
# define MASK_0       0xff
# define MASK_1       0xff00
# define MASK_2       0xff0000
# define MASK_3       0xff000000
# define BYTE_0_SHIFT 0
# define BYTE_1_SHIFT 8
# define BYTE_2_SHIFT 16
# define BYTE_3_SHIFT 24
#endif
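
/* MASK_n and BYTE_n_SHIFT select the n-th byte of a loaded word in
 * string (memory) order, independent of endianness: the first byte in
 * memory is bits 0-7 on little-endian (MASK_0 = 0xff, shift 0) but
 * bits 24-31 on big-endian (MASK_0 = 0xff000000, shift 24).
 */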

.syntax unified
.text
.align 2
.global strcpy
.thumb
.type strcpy, %function

strcpy:
	/* Save the original dst (returned as result) and the callee-saved
	 * registers used as temporaries.
	 */
	push {result, tmp1, tmp2, tmp3, src_offset}
	eor tmp1, dst, src
	tst tmp1, #3
	/* If dst and src are not at the same byte offset from a word boundary */
	bne .Lstrs_diff_offset
	/* dst and src share the same offset; get that offset */
	ands tmp1, src, #3
	beq .Ldst_src_aligned
	/* Get the number of bytes until src is word aligned */
	rsb tmp1, #4

.Lbyte_copy_until_dst_src_aligned:
	ldrb tmp2, [src], #1
	cmp tmp2, #0
	beq .Lcopy_done
	strb tmp2, [dst], #1
	subs tmp1, #1
	bne .Lbyte_copy_until_dst_src_aligned

.Ldst_src_aligned:
	/* Now dst and src are both word aligned */
	ldr tmp1, [src], #4
	/* Word-at-a-time zero-byte test */
	sub tmp2, tmp1, #0x01010101
	bic tmp2, tmp1
	tst tmp2, #0x80808080
	/* All zero means no zero byte was detected */
	it eq
	streq tmp1, [dst], #4
	beq .Ldst_src_aligned
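
	/* The zero-byte test above computes
	 * (x - 0x01010101) & ~x & 0x80808080, which is non-zero iff some
	 * byte of x is 0x00: subtracting 1 from a 0x00 byte borrows and
	 * sets its 0x80 bit, while ~x keeps that bit only where the
	 * original byte had its top bit clear, so non-zero bytes can
	 * never fire.  For example, x = 0x61006263 yields 0x00800000,
	 * flagging the 0x00 in byte lane 2.
	 */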

	/* There is a zero byte in the word; step back and copy byte by byte */
	sub src, #4
.Lbyte_copy_until_zero:
	ldrb tmp2, [src], #1
	cmp tmp2, #0
	beq .Lcopy_done
	strb tmp2, [dst], #1
	b .Lbyte_copy_until_zero

	/* Make dst aligned first, so that we never write before dst.
	 * Writing before dst would require an atomic read-modify-write
	 * to be safe, and atomic operations complicate things, so the
	 * solution here is to copy byte by byte until dst is aligned.
	 */
.Lstrs_diff_offset:
	ands tmp1, dst, #3
	beq .Ldiff_offset_loop_begin
	/* Get the number of bytes until dst is word aligned */
	rsb tmp1, #4

.Lbyte_copy_until_dst_aligned:
	ldrb tmp2, [src], #1
	cmp tmp2, #0
	beq .Lcopy_done
	strb tmp2, [dst], #1
	subs tmp1, #1
	bne .Lbyte_copy_until_dst_aligned

.Ldiff_offset_loop_begin:
	/* src_offset cannot be 0 here, since dst is aligned and the
	 * offsets differ; it is kept in bits (byte offset * 8).
	 */
	and src_offset, src, #3
	lsls src_offset, #3
	bic src, #3
	/* First word logic: prepend 0xff bytes to make the algorithm
	 * simpler.  Only the first word needs to be prepended.
	 */
	ldr tmp1, [src], #4
	mov tmp2, #0xffffffff
	rsb tmp3, src_offset, #32
#ifdef __ARM_BIG_ENDIAN
	lsls tmp2, tmp3
#else
	lsrs tmp2, tmp3
#endif
	orr tmp1, tmp1, tmp2
	/* Test if the first word contains a zero byte */
	sub tmp3, tmp1, #0x01010101
	bic tmp3, tmp1
	tst tmp3, #0x80808080
	/* Non-zero means a zero byte was detected */
	bne .Ltail_copy
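
	/* Example (little-endian, src_offset = 8): the string starts at
	 * byte lane 1 of the loaded word, and tmp2 = 0xffffffff >> 24 =
	 * 0xff forces lane 0, which lies before the start of the string,
	 * to 0xff, so a stale 0x00 there can never trigger a false
	 * end-of-string in the test above.
	 */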

	/* Before the loop, set tmp2 = tmp1 to simplify the logic in the loop */
	mov tmp2, tmp1
.Ldiff_offset_loop:
	mov tmp1, tmp2
	ldr tmp2, [src], #4
	/* Test if the new word contains a zero byte */
	sub tmp3, tmp2, #0x01010101
	bic tmp3, tmp2
	tst tmp3, #0x80808080
	/* Non-zero means a zero byte was detected */
	bne .Ltail_copy
	/* Now let's fill dst with the tail of tmp1 and the head of tmp2 */
#ifdef __ARM_BIG_ENDIAN
	lsls tmp1, src_offset
	rsb tmp3, src_offset, #32
	lsrs tmp3, tmp2, tmp3
	orr tmp1, tmp1, tmp3
#else
	lsrs tmp1, src_offset
	rsb tmp3, src_offset, #32
	lsls tmp3, tmp2, tmp3
	orr tmp1, tmp1, tmp3
#endif
	str tmp1, [dst], #4
	b .Ldiff_offset_loop
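
	/* Example (little-endian, src_offset = 8): tmp1 holds string bytes
	 * in lanes 1-3 (lane 0 was already stored or precedes the string)
	 * and tmp2 holds the next four bytes, so
	 * (tmp1 >> 8) | (tmp2 << 24) packs three bytes of tmp1 and one
	 * byte of tmp2 into a single aligned store.
	 */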

.Ltail_copy:
	/* Copy the remaining bytes, stopping at the zero byte.  Bytes of
	 * tmp1 below lane src_offset/8 either precede the string or were
	 * already stored by the loop, so copying starts at that lane.
	 */
	cmp src_offset, #24
	beq .Loffset_3
	cmp src_offset, #16
	beq .Loffset_2
	/* src_offset == 8 here */
	ands tmp3, tmp1, MASK_1
	beq .Lcopy_done
	lsrs tmp3, BYTE_1_SHIFT
	strb tmp3, [dst], #1
.Loffset_2:
	ands tmp3, tmp1, MASK_2
	beq .Lcopy_done
	lsrs tmp3, BYTE_2_SHIFT
	strb tmp3, [dst], #1
.Loffset_3:
	ands tmp3, tmp1, MASK_3
	beq .Lcopy_done
	lsrs tmp3, BYTE_3_SHIFT
	strb tmp3, [dst], #1
	ands tmp3, tmp2, MASK_0
	beq .Lcopy_done
	lsrs tmp3, BYTE_0_SHIFT
	strb tmp3, [dst], #1
	ands tmp3, tmp2, MASK_1
	beq .Lcopy_done
	lsrs tmp3, BYTE_1_SHIFT
	strb tmp3, [dst], #1
	ands tmp3, tmp2, MASK_2
	beq .Lcopy_done
	lsrs tmp3, BYTE_2_SHIFT
	strb tmp3, [dst], #1
	/* Byte 3 of tmp2 must be zero here */

.Lcopy_done:
	/* Store the terminating zero and return the original dst */
	mov tmp3, #0
	strb tmp3, [dst]
	pop {result, tmp1, tmp2, tmp3, src_offset}
	bx lr

#if 0
/* Pseudo code for strcpy when dst/src are not at the same byte offset */

/* Make dst aligned first, so that we never write before dst.
 * Writing before dst would require an atomic read-modify-write
 * to be safe, and atomic operations complicate things, so the
 * solution here is to copy byte by byte until dst is aligned.
 */
if ((dst & 3) == 0)
  goto diff_offset_loop_begin;
ByteCopyUntilDstAligned();

diff_offset_loop_begin:
/* src_offset cannot be 0 here; it is kept in bits (byte offset * 8) */
src_offset = src & 3;
src_offset = src_offset * 8;
src = src & 0xfffffffc;
tmp1 = *src;
src += 4;
/* First word logic: prepend 0xff bytes to make the algorithm
 * simpler.  Only the first word needs to be prepended.
 */
if (src_offset != 0)
  {
    tmp2 = 0xffffffff;
#if big endian
    tmp2 = tmp2 << (32 - src_offset);
#else
    tmp2 = tmp2 >> (32 - src_offset);
#endif
    tmp1 |= tmp2;
  }

if (HasZeroByte(tmp1))
  {
    goto tail_copy;
  }

/* Before the loop, set tmp2 = tmp1 to simplify the logic in the loop */
tmp2 = tmp1;
diff_offset_loop:
tmp1 = tmp2;
tmp2 = *src;
src += 4;

/* A zero byte in tmp2 means the tail must be copied from both tmp1
 * and tmp2 to dst
 */
if (HasZeroByte(tmp2))
  {
    goto tail_copy;
  }
/* Now let's fill dst */
#if big endian
tmp1 = tmp1 << src_offset;
tmp1 |= tmp2 >> (32 - src_offset);
*dst = tmp1;
#else
tmp1 = tmp1 >> src_offset;
tmp1 |= tmp2 << (32 - src_offset);
*dst = tmp1;
#endif
dst += 4;
goto diff_offset_loop;

/* Byte by byte copy at the tail.  src_offset is in bits, so the
 * possible values are 8, 16 and 24.
 */
tail_copy:
if (src_offset == 24)
  goto offset_3;
if (src_offset == 16)
  goto offset_2;

/* src_offset cannot be 0 here, so src_offset == 8 */
if ((tmp1 & MASK_1) == 0)
  goto cpy_done;
*dst++ = (tmp1 & MASK_1) >> BYTE_1_SHIFT;
offset_2:
if ((tmp1 & MASK_2) == 0)
  goto cpy_done;
*dst++ = (tmp1 & MASK_2) >> BYTE_2_SHIFT;
offset_3:
if ((tmp1 & MASK_3) == 0)
  goto cpy_done;
*dst++ = (tmp1 & MASK_3) >> BYTE_3_SHIFT;
if ((tmp2 & MASK_0) == 0)
  goto cpy_done;
*dst++ = (tmp2 & MASK_0) >> BYTE_0_SHIFT;
if ((tmp2 & MASK_1) == 0)
  goto cpy_done;
*dst++ = (tmp2 & MASK_1) >> BYTE_1_SHIFT;
if ((tmp2 & MASK_2) == 0)
  goto cpy_done;
*dst++ = (tmp2 & MASK_2) >> BYTE_2_SHIFT;
/* Byte 3 of tmp2 must be zero here */

cpy_done:
*dst++ = 0;
#endif /* Pseudo code end */
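
The word-at-a-time scan at the heart of this routine is easy to exercise from C. The following is a minimal, portable sketch of the aligned fast path only; the function name strcpy_aligned_sketch and the has_zero_byte helper are illustrative and not part of this commit, and the misaligned cases are handled as shown in the assembly above.

#include <stdint.h>
#include <string.h>

/* Non-zero iff some byte of x is 0x00 (exact, no false positives) */
static inline uint32_t has_zero_byte(uint32_t x)
{
  return (x - 0x01010101u) & ~x & 0x80808080u;
}

/* Illustrative strcpy fast path: word-at-a-time while both pointers
 * are word aligned, then byte-at-a-time for the tail.
 */
char *strcpy_aligned_sketch(char *dst, const char *src)
{
  char *d = dst;

  while (((uintptr_t)src & 3) == 0 && ((uintptr_t)d & 3) == 0)
    {
      uint32_t w;
      memcpy(&w, src, 4);          /* aligned 32-bit load */
      if (has_zero_byte(w))
        {
          break;                   /* finish byte by byte below */
        }

      memcpy(d, &w, 4);            /* aligned 32-bit store */
      src += 4;
      d   += 4;
    }

  while ((*d++ = *src++) != '\0')  /* byte tail; also copies the NUL */
    {
    }

  return dst;                      /* like strcpy(), return original dst */
}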