armv8m/strcpy: add arch-optimized version

wangyongrong 2023-09-20 16:27:27 +08:00 committed by Xiang Xiao
parent abfb7da553
commit 2d817deecb
3 changed files with 319 additions and 0 deletions


@ -14,6 +14,7 @@ config ARMV8M_STRING_FUNCTION
select ARMV8M_MEMSET
select ARMV8M_MEMMOVE
select ARMV8M_STRCMP
select ARMV8M_STRCPY
select ARMV8M_STRLEN

config ARMV8M_MEMCHR
@ -56,6 +57,14 @@ config ARMV8M_STRCMP
---help---
Enable optimized ARMv8-M specific strcmp() library function

config ARMV8M_STRCPY
bool "Enable optimized strcpy() for ARMv8-M"
default n
select LIBC_ARCH_STRCPY
depends on ARCH_TOOLCHAIN_GNU
---help---
Enable optimized ARMv8-M specific strcpy() library function

config ARMV8M_STRLEN
bool "Enable optimized strlen() for ARMv8-M"
default n
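
With the options above, a board defconfig can enable the optimized
routine either through the umbrella option or individually (a minimal
sketch; the option names are exactly those added above, and
ARMV8M_STRCPY additionally requires ARCH_TOOLCHAIN_GNU):

CONFIG_ARMV8M_STRING_FUNCTION=y   # selects ARMV8M_STRCPY among others

or

CONFIG_ARMV8M_STRCPY=y            # selects LIBC_ARCH_STRCPY automatically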


@ -42,6 +42,10 @@ ifeq ($(CONFIG_ARMV8M_STRCMP),y)
ASRCS += arch_strcmp.S
endif

ifeq ($(CONFIG_ARMV8M_STRCPY),y)
ASRCS += arch_strcpy.S
endif

ifeq ($(CONFIG_ARMV8M_STRLEN),y)
ASRCS += arch_strlen.S
endif


@ -0,0 +1,306 @@
/***************************************************************************
* libs/libc/machine/arm/armv8-m/gnu/arch_strcpy.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
***************************************************************************/
/* This strcpy borrows some ideas from arch_strcmp.S. */
/* Parameters and result. */
#define dst r0
#define src r1
#define result r0
/* Internal variables, or callee saved registers */
#define tmp1 r4
#define tmp2 r5
#define tmp3 r6
#define src_offset r7
#ifdef __ARM_BIG_ENDIAN
# define MASK_0 0xff000000
# define MASK_1 0xff0000
# define MASK_2 0xff00
# define MASK_3 0xff
# define BYTE_0_SHIFT 24
# define BYTE_1_SHIFT 16
# define BYTE_2_SHIFT 8
# define BYTE_3_SHIFT 0
#else
# define MASK_0 0xff
# define MASK_1 0xff00
# define MASK_2 0xff0000
# define MASK_3 0xff000000
# define BYTE_0_SHIFT 0
# define BYTE_1_SHIFT 8
# define BYTE_2_SHIFT 16
# define BYTE_3_SHIFT 24
#endif
.syntax unified
.text
.align 2
.global strcpy
.thumb
.type strcpy, %function
strcpy:
push {result, tmp1, tmp2, tmp3, src_offset}
eor tmp1, dst, src
tst tmp1, #3
/* If dst and src are not at the same byte offset from a word boundary */
bne .Lstrs_diff_offset
/* dst and src share the same byte offset; get it */
ands tmp1, src, #3
beq .Ldst_src_aligned
/* get the number of bytes until src is aligned */
rsb tmp1, #4
.Lbyte_copy_until_dst_src_aligned:
ldrb tmp2, [src], #1
cmp tmp2, #0
beq .Lcopy_done
strb tmp2, [dst], #1
subs tmp1, #1
bne .Lbyte_copy_until_dst_src_aligned
.Ldst_src_aligned:
/* Now dst and src are aligned */
ldr tmp1, [src], #4
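/* Zero-byte detection: (x - 0x01010101) & ~x & 0x80808080 is
 * non-zero iff x contains a 0x00 byte; the subtraction borrows
 * through a zero byte and leaves its 0x80 bit set after masking.
 */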
sub tmp2, tmp1, #0x01010101
bic tmp2, tmp1
tst tmp2, #0x80808080
/* An all-zero result means no zero byte was detected */
it eq
streq tmp1, [dst], #4
beq .Ldst_src_aligned
/* There is a zero in the word, copy until zero */
sub src, #4
.Lbyte_copy_until_zero:
ldrb tmp2, [src], #1
cmp tmp2, #0
beq .Lcopy_done
strb tmp2, [dst], #1
b .Lbyte_copy_until_zero
/* Make dst aligned, so we won't write anything before dst.
* If we attempt to write before dst, atomic read-write must
* be ensured. Atomic operation complicates things.
* So the solution here is byte by byte copy until dst aligned.
*/
.Lstrs_diff_offset:
ands tmp1, dst, #3
beq .Ldiff_offset_loop_begin
/* get the number of bytes until dst is aligned */
rsb tmp1, #4
.Lbyte_copy_until_dst_aligned:
ldrb tmp2, [src], #1
cmp tmp2, #0
beq .Lcopy_done
strb tmp2, [dst], #1
subs tmp1, #1
bne .Lbyte_copy_until_dst_aligned
.Ldiff_offset_loop_begin:
/* src_offset cannot be 0 here: dst is now aligned and the
 * dst/src offsets differ
 */
and src_offset, src, #3
lsls src_offset, #3
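/* src_offset is now the byte offset expressed in bits: 8, 16 or 24 */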
bic src, #3
/* First word logic:
 * prepend 0xff bytes to make the algorithm simpler;
 * only the first word needs to be prepended.
 */
ldr tmp1, [src], #4
mov tmp2, #0xffffffff
rsb tmp3, src_offset, #32
#ifdef __ARM_BIG_ENDIAN
lsls tmp2, tmp3
#else
lsrs tmp2, tmp3
#endif
orr tmp1, tmp1, tmp2
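/* e.g. little-endian, src_offset == 8: tmp2 = 0xffffffff >> 24
 * = 0x000000ff, so the byte that precedes the actual string is
 * forced to 0xff and cannot be mistaken for a terminator.
 */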
/* Test if the first word contains zero */
sub tmp3, tmp1, #0x01010101
bic tmp3, tmp1
tst tmp3, #0x80808080
/* A non-zero result means a zero byte was detected */
bne .Ltail_copy
/* before loop, set tmp2=tmp1 to simplify the logic in the loop */
mov tmp2, tmp1
.Ldiff_offset_loop:
mov tmp1, tmp2
ldr tmp2, [src], #4
/* Test if contains zero */
sub tmp3, tmp2, #0x01010101
bic tmp3, tmp2
tst tmp3, #0x80808080
/* A non-zero result means a zero byte was detected */
bne .Ltail_copy
/* Now let's fill dst */
#ifdef __ARM_BIG_ENDIAN
lsls tmp1, src_offset
rsb tmp3, src_offset, #32
lsrs tmp3, tmp2, tmp3
orr tmp1, tmp1, tmp3
#else
lsrs tmp1, src_offset
rsb tmp3, src_offset, #32
lsls tmp3, tmp2, tmp3
orr tmp1, tmp1, tmp3
#endif
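/* e.g. little-endian, src_offset == 8: tmp1 >> 8 drops the byte
 * already written (or the 0xff pad), tmp2 << 24 supplies the next
 * byte, and the orr yields one aligned word of string data.
 */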
str tmp1, [dst], #4
b .Ldiff_offset_loop
.Ltail_copy:
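/* tmp1 holds the word whose trailing bytes have not been stored
 * yet; tmp2 holds the following word. The cases below fall
 * through, copying bytes one by one until the zero byte.
 */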
cmp src_offset, #24
beq .Loffset_3
cmp src_offset, #16
beq .Loffset_2
/* src_offset == 8 here */
ands tmp3, tmp1, MASK_1
beq .Lcopy_done
lsrs tmp3, BYTE_1_SHIFT
strb tmp3, [dst], #1
.Loffset_2:
ands tmp3, tmp1, MASK_2
beq .Lcopy_done
lsrs tmp3, BYTE_2_SHIFT
strb tmp3, [dst], #1
.Loffset_3:
ands tmp3, tmp1, MASK_3
beq .Lcopy_done
lsrs tmp3, BYTE_3_SHIFT
strb tmp3, [dst], #1
ands tmp3, tmp2, MASK_0
beq .Lcopy_done
lsrs tmp3, BYTE_0_SHIFT
strb tmp3, [dst], #1
ands tmp3, tmp2, MASK_1
beq .Lcopy_done
lsrs tmp3, BYTE_1_SHIFT
strb tmp3, [dst], #1
ands tmp3, tmp2, MASK_2
beq .Lcopy_done
lsrs tmp3, BYTE_2_SHIFT
strb tmp3, [dst], #1
.Lcopy_done:
mov tmp3, #0
strb tmp3, [dst]
pop {result, tmp1, tmp2, tmp3, src_offset}
bx lr
#if 0
/* Pseudo code of strcpy when dst and src are not at the same byte offset */
/* Make dst aligned, so we won't write anything before dst.
* If we attempt to write before dst, atomic read-write must
* be ensured. Atomic operation complicates things.
* So the solution here is byte by byte copy until dst aligned.
*/
if ((dst & 3) == 0)
goto .diff_offset_loop_begin;
ByteCopyUntilDstAligned();
.diff_offset_loop_begin:
/* src_offset mustn't be 0 here */
src_offset = src & 3;
src_offset = src_offset * 8; /* byte offset in bits: 8, 16 or 24 */
src = src & 0xfffffffc;
tmp1 = *src;
src += 4;
/* first word logic
* prepend 0xff to make the algorithm simpler
* only the first word needs to be prepended
*/
if (src_offset != 0)
{
tmp2 = 0xffffffff;
#if big endian
tmp2 = tmp2 << (32 - src_offset);
#else
tmp2 = tmp2 >> (32 - src_offset);
#endif
tmp1 |= tmp2;
}
if (HasZeroByte(tmp1))
{
goto .tail_copy;
}
/* before loop, set tmp2=tmp1 to simplify the logic in the loop */
tmp2 = tmp1;
.diff_offset_loop:
tmp1 = tmp2;
tmp2 = *src;
src += 4;
/* double word tail means we have to copy from tmp1 and tmp2 to dst */
if (HasZeroByte(tmp2))
{
goto .tail_copy;
}
/* Now let's fill dst */
#if big endian
tmp1 = tmp1 << (src_offset);
tmp1 |= tmp2 >> (32 - src_offset);
*dst = tmp1;
#else
tmp1 = tmp1 >> (src_offset);
tmp1 |= tmp2 << (32 - src_offset);
*dst = tmp1;
#endif
dst += 4;
goto .diff_offset_loop;
/* byte by byte copy at the tail */
.tail_copy:
if (src_offset == 24)
goto .offset_3;
if (src_offset == 16)
goto .offset_2;
/* src_offset mustn't be 0 here */
/* default: src_offset == 8 */
if ((tmp1 & MASK_1) == 0)
goto .cpy_done;
*dst++ = (tmp1 & MASK_1) >> BYTE_1_SHIFT;
.offset_2:
if ((tmp1 & MASK_2) == 0)
goto .cpy_done;
*dst++ = (tmp1 & MASK_2) >> BYTE_2_SHIFT;
.offset_3:
if ((tmp1 & MASK_3) == 0)
goto .cpy_done;
*dst++ = (tmp1 & MASK_3) >> BYTE_3_SHIFT;
if ((tmp2 & MASK_0) == 0)
goto .cpy_done;
*dst++ = (tmp2 & MASK_0) >> BYTE_0_SHIFT;
if ((tmp2 & MASK_1) == 0)
goto .cpy_done;
*dst++ = (tmp2 & MASK_1) >> BYTE_1_SHIFT;
if ((tmp2 & MASK_2) == 0)
goto .cpy_done;
*dst++ = (tmp2 & MASK_2) >> BYTE_2_SHIFT;
/* tmp2 byte 3 must be zero here */
.cpy_done:
*dst = 0;
#endif /* Pseudo code end */
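
A minimal host-side sketch for sanity-checking the copy logic
(illustrative only: it assumes the routine above is linked in as
strcpy() and that a hosted C environment is available). It walks every
src/dst byte-offset pair, exercising the aligned, same-offset and
different-offset paths:

#include <assert.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
  static const char ref[] = "abcdefghijklmnopqrstuvwxyz";
  char src[64];
  char dst[64];

  for (int s = 0; s < 4; s++)
    {
      for (int d = 0; d < 4; d++)
        {
          for (size_t len = 0; len <= strlen(ref); len++)
            {
              /* Place a string of the given length at offset s */

              memset(src, 'x', sizeof(src));
              memcpy(src + s, ref, len);
              src[s + len] = '\0';

              /* Copy to offset d, then verify return value and content */

              memset(dst, 'y', sizeof(dst));
              assert(strcpy(dst + d, src + s) == dst + d);
              assert(strcmp(dst + d, src + s) == 0);
            }
        }
    }

  puts("strcpy offset tests passed");
  return 0;
}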