x86/mem: Move memmove to out of line assembler

author Nick Desaulniers <ndesaulniers@google.com>

Tue, 18 Oct 2022 17:21:55 +0000 (10:21 -0700)

committer Dave Hansen <dave.hansen@linux.intel.com>

Tue, 1 Nov 2022 22:44:07 +0000 (15:44 -0700)
author Nick Desaulniers <ndesaulniers@google.com>
Tue, 18 Oct 2022 17:21:55 +0000 (10:21 -0700)
committer Dave Hansen <dave.hansen@linux.intel.com>
Tue, 1 Nov 2022 22:44:07 +0000 (15:44 -0700)
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile

index 7ba5f61d7273538ca65dd6f8b904fc417d6478ef..4f1a40a86534329100d0344bb9388ddede6dc54e 100644 (file)
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -60,6 +60,7 @@ ifeq ($(CONFIG_X86_32),y)
          lib-y += checksum_32.o
          lib-y += strstr_32.o
          lib-y += string_32.o
+        lib-y += memmove_32.o
  ifneq ($(CONFIG_X86_CMPXCHG64),y)
          lib-y += cmpxchg8b_emu.o atomic64_386_32.o
  endif
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c

index ef3af7ff2c8a5df893be44086468284d59634f5d..a29b64befb937472e3cb09c9f77cc3c97ede4bcc 100644 (file)
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -17,190 +17,3 @@ __visible void *memset(void *s, int c, size_t count)
         return __memset(s, c, count);
  }
  EXPORT_SYMBOL(memset);
-
-__visible void *memmove(void *dest, const void *src, size_t n)
-{
-       int d0,d1,d2,d3,d4,d5;
-       char *ret = dest;
-
-       __asm__ __volatile__(
-               /* Handle more 16 bytes in loop */
-               "cmp $0x10, %0\n\t"
-               "jb     1f\n\t"
-
-               /* Decide forward/backward copy mode */
-               "cmp %2, %1\n\t"
-               "jb     2f\n\t"
-
-               /*
-                * movs instruction have many startup latency
-                * so we handle small size by general register.
-                */
-               "cmp  $680, %0\n\t"
-               "jb 3f\n\t"
-               /*
-                * movs instruction is only good for aligned case.
-                */
-               "mov %1, %3\n\t"
-               "xor %2, %3\n\t"
-               "and $0xff, %3\n\t"
-               "jz 4f\n\t"
-               "3:\n\t"
-               "sub $0x10, %0\n\t"
-
-               /*
-                * We gobble 16 bytes forward in each loop.
-                */
-               "3:\n\t"
-               "sub $0x10, %0\n\t"
-               "mov 0*4(%1), %3\n\t"
-               "mov 1*4(%1), %4\n\t"
-               "mov  %3, 0*4(%2)\n\t"
-               "mov  %4, 1*4(%2)\n\t"
-               "mov 2*4(%1), %3\n\t"
-               "mov 3*4(%1), %4\n\t"
-               "mov  %3, 2*4(%2)\n\t"
-               "mov  %4, 3*4(%2)\n\t"
-               "lea  0x10(%1), %1\n\t"
-               "lea  0x10(%2), %2\n\t"
-               "jae 3b\n\t"
-               "add $0x10, %0\n\t"
-               "jmp 1f\n\t"
-
-               /*
-                * Handle data forward by movs.
-                */
-               ".p2align 4\n\t"
-               "4:\n\t"
-               "mov -4(%1, %0), %3\n\t"
-               "lea -4(%2, %0), %4\n\t"
-               "shr $2, %0\n\t"
-               "rep movsl\n\t"
-               "mov %3, (%4)\n\t"
-               "jmp 11f\n\t"
-               /*
-                * Handle data backward by movs.
-                */
-               ".p2align 4\n\t"
-               "6:\n\t"
-               "mov (%1), %3\n\t"
-               "mov %2, %4\n\t"
-               "lea -4(%1, %0), %1\n\t"
-               "lea -4(%2, %0), %2\n\t"
-               "shr $2, %0\n\t"
-               "std\n\t"
-               "rep movsl\n\t"
-               "mov %3,(%4)\n\t"
-               "cld\n\t"
-               "jmp 11f\n\t"
-
-               /*
-                * Start to prepare for backward copy.
-                */
-               ".p2align 4\n\t"
-               "2:\n\t"
-               "cmp  $680, %0\n\t"
-               "jb 5f\n\t"
-               "mov %1, %3\n\t"
-               "xor %2, %3\n\t"
-               "and $0xff, %3\n\t"
-               "jz 6b\n\t"
-
-               /*
-                * Calculate copy position to tail.
-                */
-               "5:\n\t"
-               "add %0, %1\n\t"
-               "add %0, %2\n\t"
-               "sub $0x10, %0\n\t"
-
-               /*
-                * We gobble 16 bytes backward in each loop.
-                */
-               "7:\n\t"
-               "sub $0x10, %0\n\t"
-
-               "mov -1*4(%1), %3\n\t"
-               "mov -2*4(%1), %4\n\t"
-               "mov  %3, -1*4(%2)\n\t"
-               "mov  %4, -2*4(%2)\n\t"
-               "mov -3*4(%1), %3\n\t"
-               "mov -4*4(%1), %4\n\t"
-               "mov  %3, -3*4(%2)\n\t"
-               "mov  %4, -4*4(%2)\n\t"
-               "lea  -0x10(%1), %1\n\t"
-               "lea  -0x10(%2), %2\n\t"
-               "jae 7b\n\t"
-               /*
-                * Calculate copy position to head.
-                */
-               "add $0x10, %0\n\t"
-               "sub %0, %1\n\t"
-               "sub %0, %2\n\t"
-
-               /*
-                * Move data from 8 bytes to 15 bytes.
-                */
-               ".p2align 4\n\t"
-               "1:\n\t"
-               "cmp $8, %0\n\t"
-               "jb 8f\n\t"
-               "mov 0*4(%1), %3\n\t"
-               "mov 1*4(%1), %4\n\t"
-               "mov -2*4(%1, %0), %5\n\t"
-               "mov -1*4(%1, %0), %1\n\t"
-
-               "mov  %3, 0*4(%2)\n\t"
-               "mov  %4, 1*4(%2)\n\t"
-               "mov  %5, -2*4(%2, %0)\n\t"
-               "mov  %1, -1*4(%2, %0)\n\t"
-               "jmp 11f\n\t"
-
-               /*
-                * Move data from 4 bytes to 7 bytes.
-                */
-               ".p2align 4\n\t"
-               "8:\n\t"
-               "cmp $4, %0\n\t"
-               "jb 9f\n\t"
-               "mov 0*4(%1), %3\n\t"
-               "mov -1*4(%1, %0), %4\n\t"
-               "mov  %3, 0*4(%2)\n\t"
-               "mov  %4, -1*4(%2, %0)\n\t"
-               "jmp 11f\n\t"
-
-               /*
-                * Move data from 2 bytes to 3 bytes.
-                */
-               ".p2align 4\n\t"
-               "9:\n\t"
-               "cmp $2, %0\n\t"
-               "jb 10f\n\t"
-               "movw 0*2(%1), %%dx\n\t"
-               "movw -1*2(%1, %0), %%bx\n\t"
-               "movw %%dx, 0*2(%2)\n\t"
-               "movw %%bx, -1*2(%2, %0)\n\t"
-               "jmp 11f\n\t"
-
-               /*
-                * Move data for 1 byte.
-                */
-               ".p2align 4\n\t"
-               "10:\n\t"
-               "cmp $1, %0\n\t"
-               "jb 11f\n\t"
-               "movb (%1), %%cl\n\t"
-               "movb %%cl, (%2)\n\t"
-               ".p2align 4\n\t"
-               "11:"
-               : "=&c" (d0), "=&S" (d1), "=&D" (d2),
-                 "=r" (d3),"=r" (d4), "=r"(d5)
-               :"0" (n),
-                "1" (src),
-                "2" (dest)
-               :"memory");
-
-       return ret;
-
-}
-EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memmove_32.S b/arch/x86/lib/memmove_32.S

new file mode 100644 (file)

index 0000000..0588b2c
--- /dev/null
+++ b/arch/x86/lib/memmove_32.S
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+
+SYM_FUNC_START(memmove)
+/*
+ * void *memmove(void *dest_in, const void *src_in, size_t n)
+ * -mregparm=3 passes these in registers:
+ * dest_in: %eax
+ * src_in: %edx
+ * n: %ecx
+ * See also: arch/x86/entry/calling.h for description of the calling convention.
+ *
+ * n can remain in %ecx, but for `rep movsl`, we'll need dest in %edi and src
+ * in %esi.
+ */
+.set dest_in, %eax
+.set dest, %edi
+.set src_in, %edx
+.set src, %esi
+.set n, %ecx
+.set tmp0, %edx
+.set tmp0w, %dx
+.set tmp1, %ebx
+.set tmp1w, %bx
+.set tmp2, %eax
+.set tmp3b, %cl
+
+/*
+ * Save all callee-saved registers, because this function is going to clobber
+ * all of them:
+ */
+       pushl   %ebp
+       movl    %esp, %ebp      // set standard frame pointer
+
+       pushl   %ebx
+       pushl   %edi
+       pushl   %esi
+       pushl   %eax            // save 'dest_in' parameter [eax] as the return value
+
+       movl src_in, src
+       movl dest_in, dest
+
+       /* Handle more 16 bytes in loop */
+       cmpl    $0x10, n
+       jb      .Lmove_16B
+
+       /* Decide forward/backward copy mode */
+       cmpl    dest, src
+       jb      .Lbackwards_header
+
+       /*
+        * movs instruction have many startup latency
+        * so we handle small size by general register.
+        */
+       cmpl    $680, n
+       jb      .Ltoo_small_forwards
+       /* movs instruction is only good for aligned case. */
+       movl    src, tmp0
+       xorl    dest, tmp0
+       andl    $0xff, tmp0
+       jz      .Lforward_movs
+.Ltoo_small_forwards:
+       subl    $0x10, n
+
+       /* We gobble 16 bytes forward in each loop. */
+.Lmove_16B_forwards_loop:
+       subl    $0x10, n
+       movl    0*4(src), tmp0
+       movl    1*4(src), tmp1
+       movl    tmp0, 0*4(dest)
+       movl    tmp1, 1*4(dest)
+       movl    2*4(src), tmp0
+       movl    3*4(src), tmp1
+       movl    tmp0, 2*4(dest)
+       movl    tmp1, 3*4(dest)
+       leal    0x10(src), src
+       leal    0x10(dest), dest
+       jae     .Lmove_16B_forwards_loop
+       addl    $0x10, n
+       jmp     .Lmove_16B
+
+       /* Handle data forward by movs. */
+.p2align 4
+.Lforward_movs:
+       movl    -4(src, n), tmp0
+       leal    -4(dest, n), tmp1
+       shrl    $2, n
+       rep     movsl
+       movl    tmp0, (tmp1)
+       jmp     .Ldone
+
+       /* Handle data backward by movs. */
+.p2align 4
+.Lbackwards_movs:
+       movl    (src), tmp0
+       movl    dest, tmp1
+       leal    -4(src, n), src
+       leal    -4(dest, n), dest
+       shrl    $2, n
+       std
+       rep     movsl
+       movl    tmp0,(tmp1)
+       cld
+       jmp     .Ldone
+
+       /* Start to prepare for backward copy. */
+.p2align 4
+.Lbackwards_header:
+       cmpl    $680, n
+       jb      .Ltoo_small_backwards
+       movl    src, tmp0
+       xorl    dest, tmp0
+       andl    $0xff, tmp0
+       jz      .Lbackwards_movs
+
+       /* Calculate copy position to tail. */
+.Ltoo_small_backwards:
+       addl    n, src
+       addl    n, dest
+       subl    $0x10, n
+
+       /* We gobble 16 bytes backward in each loop. */
+.Lmove_16B_backwards_loop:
+       subl    $0x10, n
+
+       movl    -1*4(src), tmp0
+       movl    -2*4(src), tmp1
+       movl    tmp0, -1*4(dest)
+       movl    tmp1, -2*4(dest)
+       movl    -3*4(src), tmp0
+       movl    -4*4(src), tmp1
+       movl    tmp0, -3*4(dest)
+       movl    tmp1, -4*4(dest)
+       leal    -0x10(src), src
+       leal    -0x10(dest), dest
+       jae     .Lmove_16B_backwards_loop
+       /* Calculate copy position to head. */
+       addl    $0x10, n
+       subl    n, src
+       subl    n, dest
+
+       /* Move data from 8 bytes to 15 bytes. */
+.p2align 4
+.Lmove_16B:
+       cmpl    $8, n
+       jb      .Lmove_8B
+       movl    0*4(src), tmp0
+       movl    1*4(src), tmp1
+       movl    -2*4(src, n), tmp2
+       movl    -1*4(src, n), src
+
+       movl    tmp0, 0*4(dest)
+       movl    tmp1, 1*4(dest)
+       movl    tmp2, -2*4(dest, n)
+       movl    src, -1*4(dest, n)
+       jmp     .Ldone
+
+       /* Move data from 4 bytes to 7 bytes. */
+.p2align 4
+.Lmove_8B:
+       cmpl    $4, n
+       jb      .Lmove_4B
+       movl    0*4(src), tmp0
+       movl    -1*4(src, n), tmp1
+       movl    tmp0, 0*4(dest)
+       movl    tmp1, -1*4(dest, n)
+       jmp     .Ldone
+
+       /* Move data from 2 bytes to 3 bytes. */
+.p2align 4
+.Lmove_4B:
+       cmpl    $2, n
+       jb      .Lmove_1B
+       movw    0*2(src), tmp0w
+       movw    -1*2(src, n), tmp1w
+       movw    tmp0w, 0*2(dest)
+       movw    tmp1w, -1*2(dest, n)
+       jmp     .Ldone
+
+       /* Move data for 1 byte. */
+.p2align 4
+.Lmove_1B:
+       cmpl    $1, n
+       jb      .Ldone
+       movb    (src), tmp3b
+       movb    tmp3b, (dest)
+.p2align 4
+.Ldone:
+       popl    dest_in // restore 'dest_in' [eax] as the return value
+       /* Restore all callee-saved registers: */
+       popl    %esi
+       popl    %edi
+       popl    %ebx
+       popl    %ebp
+
+       RET
+SYM_FUNC_END(memmove)
+EXPORT_SYMBOL(memmove)
diff --git a/lib/memcpy_kunit.c b/lib/memcpy_kunit.c

index 2b5cc70ac53fcfb794d37cf69fc5ead928b2873f..7513e6d5dc9042c531a5cad6f4666e4d1fe50ee6 100644 (file)
--- a/lib/memcpy_kunit.c
+++ b/lib/memcpy_kunit.c
@@ -105,6 +105,8 @@ static void memcpy_test(struct kunit *test)
  #undef TEST_OP
  }
  
+static unsigned char larger_array [2048];
+
  static void memmove_test(struct kunit *test)
  {
  #define TEST_OP "memmove"
@@ -179,6 +181,26 @@ static void memmove_test(struct kunit *test)
         ptr = &overlap.data[2];
         memmove(ptr, overlap.data, 5);
         compare("overlapping write", overlap, overlap_expected);
+
+       /* Verify larger overlapping moves. */
+       larger_array[256] = 0xAAu;
+       /*
+        * Test a backwards overlapping memmove first. 256 and 1024 are
+        * important for i386 to use rep movsl.
+        */
+       memmove(larger_array, larger_array + 256, 1024);
+       KUNIT_ASSERT_EQ(test, larger_array[0], 0xAAu);
+       KUNIT_ASSERT_EQ(test, larger_array[256], 0x00);
+       KUNIT_ASSERT_NULL(test,
+               memchr(larger_array + 1, 0xaa, ARRAY_SIZE(larger_array) - 1));
+       /* Test a forwards overlapping memmove. */
+       larger_array[0] = 0xBBu;
+       memmove(larger_array + 256, larger_array, 1024);
+       KUNIT_ASSERT_EQ(test, larger_array[0], 0xBBu);
+       KUNIT_ASSERT_EQ(test, larger_array[256], 0xBBu);
+       KUNIT_ASSERT_NULL(test, memchr(larger_array + 1, 0xBBu, 256 - 1));
+       KUNIT_ASSERT_NULL(test,
+               memchr(larger_array + 257, 0xBBu, ARRAY_SIZE(larger_array) - 257));
  #undef TEST_OP
  }
author	Nick Desaulniers <ndesaulniers@google.com>
	Tue, 18 Oct 2022 17:21:55 +0000 (10:21 -0700)
committer	Dave Hansen <dave.hansen@linux.intel.com>
	Tue, 1 Nov 2022 22:44:07 +0000 (15:44 -0700)
arch/x86/lib/Makefile		patch \| blob \| history
arch/x86/lib/memcpy_32.c		patch \| blob \| history
arch/x86/lib/memmove_32.S	[new file with mode: 0644]	patch \| blob
lib/memcpy_kunit.c		patch \| blob \| history