summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArthur Heymans <arthur@aheymans.xyz>2022-11-04 20:20:14 +0100
committerMartin L Roth <gaumless@gmail.com>2022-11-12 23:22:17 +0000
commit9df0fee8fab3dc7a79632122257845e8d5720095 (patch)
tree5d55bddc36495917dd233d8c53513c80de9a1a95
parentbf89aaecfa66c04de6f091ba366d776c357e972a (diff)
arch/x86/memmove: Add 64bit version
The 64bit handles 64bit input variables properly. TESTED: Both qemu and real hardware can use LZ4 properly which use this code. Change-Id: Ib43ec19df97194d6b1c18bfacb5fe8211ba0ffe5 Signed-off-by: Arthur Heymans <arthur@aheymans.xyz> Reviewed-on: https://review.coreboot.org/c/coreboot/+/69231 Tested-by: build bot (Jenkins) <no-reply@coreboot.org> Reviewed-by: Angel Pons <th3fanbus@gmail.com>
-rw-r--r--src/arch/x86/Makefile.inc22
-rw-r--r--src/arch/x86/memmove_32.c (renamed from src/arch/x86/memmove.c)0
-rw-r--r--src/arch/x86/memmove_64.S197
3 files changed, 211 insertions, 8 deletions
diff --git a/src/arch/x86/Makefile.inc b/src/arch/x86/Makefile.inc
index 50c344c7d3..d281037515 100644
--- a/src/arch/x86/Makefile.inc
+++ b/src/arch/x86/Makefile.inc
@@ -85,7 +85,8 @@ bootblock-$(CONFIG_IDT_IN_EVERY_STAGE) += exception.c
bootblock-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S
bootblock-y += memcpy.c
bootblock-y += memset.c
-bootblock-y += memmove.c
+bootblock-$(CONFIG_ARCH_BOOTBLOCK_X86_32) += memmove_32.c
+bootblock-$(CONFIG_ARCH_BOOTBLOCK_X86_64) += memmove_64.S
bootblock-$(CONFIG_COLLECT_TIMESTAMPS_TSC) += timestamp.c
bootblock-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
bootblock-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c
@@ -134,7 +135,8 @@ verstage-$(CONFIG_HAVE_CF9_RESET) += cf9_reset.c
verstage-y += cpu_common.c
verstage-y += memset.c
verstage-y += memcpy.c
-verstage-y += memmove.c
+verstage-$(CONFIG_ARCH_VERSTAGE_X86_32) += memmove_32.c
+verstage-$(CONFIG_ARCH_VERSTAGE_X86_64) += memmove_64.S
verstage-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
verstage-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c
# If verstage is a separate stage it means there's no need
@@ -172,7 +174,8 @@ romstage-y += cpu_common.c
romstage-$(CONFIG_IDT_IN_EVERY_STAGE) += exception.c
romstage-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S
romstage-y += memcpy.c
-romstage-y += memmove.c
+romstage-$(CONFIG_ARCH_ROMSTAGE_X86_32) += memmove_32.c
+romstage-$(CONFIG_ARCH_ROMSTAGE_X86_64) += memmove_64.S
romstage-y += memset.c
romstage-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
romstage-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c
@@ -217,7 +220,8 @@ postcar-$(CONFIG_IDT_IN_EVERY_STAGE) += exception.c
postcar-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S
postcar-y += exit_car.S
postcar-y += memcpy.c
-postcar-y += memmove.c
+postcar-$(CONFIG_ARCH_POSTCAR_X86_32) += memmove_32.c
+postcar-$(CONFIG_ARCH_POSTCAR_X86_64) += memmove_64.S
postcar-y += memset.c
postcar-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
postcar-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c
@@ -261,7 +265,8 @@ ramstage-y += exception.c
ramstage-y += idt.S
ramstage-$(CONFIG_IOAPIC) += ioapic.c
ramstage-y += memcpy.c
-ramstage-y += memmove.c
+ramstage-$(CONFIG_ARCH_RAMSTAGE_X86_32) += memmove_32.c
+ramstage-$(CONFIG_ARCH_RAMSTAGE_X86_64) += memmove_64.S
ramstage-y += memset.c
ramstage-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
ramstage-$(CONFIG_GENERATE_MP_TABLE) += mpspec.c
@@ -278,11 +283,11 @@ ramstage-$(CONFIG_HAVE_ACPI_RESUME) += wakeup.S
ramstage-$(CONFIG_HAVE_CF9_RESET) += cf9_reset.c
rmodules_x86_32-y += memcpy.c
-rmodules_x86_32-y += memmove.c
+rmodules_x86_32-y += memmove_32.c
rmodules_x86_32-y += memset.c
rmodules_x86_64-y += memcpy.c
-rmodules_x86_64-y += memmove.c
+rmodules_x86_64-y += memmove_64.S
rmodules_x86_64-y += memset.c
ifeq ($(CONFIG_ARCH_RAMSTAGE_X86_32),y)
@@ -324,7 +329,8 @@ smm-$(CONFIG_DEBUG_HW_BREAKPOINTS_IN_ALL_STAGES) += breakpoint.c
smm-$(CONFIG_IDT_IN_EVERY_STAGE) += exception.c
smm-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S
smm-y += memcpy.c
-smm-y += memmove.c
+smm-$(CONFIG_ARCH_RAMSTAGE_X86_32) += memmove_32.c
+smm-$(CONFIG_ARCH_RAMSTAGE_X86_64) += memmove_64.S
smm-y += memset.c
smm-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
smm-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c
diff --git a/src/arch/x86/memmove.c b/src/arch/x86/memmove_32.c
index 3ec50b26ae..3ec50b26ae 100644
--- a/src/arch/x86/memmove.c
+++ b/src/arch/x86/memmove_32.c
diff --git a/src/arch/x86/memmove_64.S b/src/arch/x86/memmove_64.S
new file mode 100644
index 0000000000..ebec8ee3e7
--- /dev/null
+++ b/src/arch/x86/memmove_64.S
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/* This code originates from Linux 5.19 */
+
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
+ */
+.global memmove
+memmove:
+
+ mov %rdi, %rax
+
+ /* Decide forward/backward copy mode */
+ cmp %rdi, %rsi
+ jge .Lmemmove_begin_forward
+ mov %rsi, %r8
+ add %rdx, %r8
+ cmp %rdi, %r8
+ jg 2f
+
+ /* Don't optimize for FSRM and ERMS like Linux */
+.Lmemmove_begin_forward:
+ cmp $0x20, %rdx
+ jb 1f
+
+ /*
+ * movsq instruction have many startup latency
+ * so we handle small size by general register.
+ */
+ cmp $680, %rdx
+ jb 3f
+ /*
+ * movsq instruction is only good for aligned case.
+ */
+
+ cmpb %dil, %sil
+ je 4f
+3:
+ sub $0x20, %rdx
+ /*
+ * We gobble 32 bytes forward in each loop.
+ */
+5:
+ sub $0x20, %rdx
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r10
+ movq 2*8(%rsi), %r9
+ movq 3*8(%rsi), %r8
+ leaq 4*8(%rsi), %rsi
+
+ movq %r11, 0*8(%rdi)
+ movq %r10, 1*8(%rdi)
+ movq %r9, 2*8(%rdi)
+ movq %r8, 3*8(%rdi)
+ leaq 4*8(%rdi), %rdi
+ jae 5b
+ addq $0x20, %rdx
+ jmp 1f
+ /*
+ * Handle data forward by movsq.
+ */
+ .p2align 4
+4:
+ movq %rdx, %rcx
+ movq -8(%rsi, %rdx), %r11
+ lea -8(%rdi, %rdx), %r10
+ shrq $3, %rcx
+ rep movsq
+ movq %r11, (%r10)
+ jmp 13f
+.Lmemmove_end_forward:
+
+ /*
+ * Handle data backward by movsq.
+ */
+ .p2align 4
+7:
+ movq %rdx, %rcx
+ movq (%rsi), %r11
+ movq %rdi, %r10
+ leaq -8(%rsi, %rdx), %rsi
+ leaq -8(%rdi, %rdx), %rdi
+ shrq $3, %rcx
+ std
+ rep movsq
+ cld
+ movq %r11, (%r10)
+ jmp 13f
+
+ /*
+ * Start to prepare for backward copy.
+ */
+ .p2align 4
+2:
+ cmp $0x20, %rdx
+ jb 1f
+ cmp $680, %rdx
+ jb 6f
+ cmp %dil, %sil
+ je 7b
+6:
+ /*
+ * Calculate copy position to tail.
+ */
+ addq %rdx, %rsi
+ addq %rdx, %rdi
+ subq $0x20, %rdx
+ /*
+ * We gobble 32 bytes backward in each loop.
+ */
+8:
+ subq $0x20, %rdx
+ movq -1*8(%rsi), %r11
+ movq -2*8(%rsi), %r10
+ movq -3*8(%rsi), %r9
+ movq -4*8(%rsi), %r8
+ leaq -4*8(%rsi), %rsi
+
+ movq %r11, -1*8(%rdi)
+ movq %r10, -2*8(%rdi)
+ movq %r9, -3*8(%rdi)
+ movq %r8, -4*8(%rdi)
+ leaq -4*8(%rdi), %rdi
+ jae 8b
+ /*
+ * Calculate copy position to head.
+ */
+ addq $0x20, %rdx
+ subq %rdx, %rsi
+ subq %rdx, %rdi
+1:
+ cmpq $16, %rdx
+ jb 9f
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r10
+ movq -2*8(%rsi, %rdx), %r9
+ movq -1*8(%rsi, %rdx), %r8
+ movq %r11, 0*8(%rdi)
+ movq %r10, 1*8(%rdi)
+ movq %r9, -2*8(%rdi, %rdx)
+ movq %r8, -1*8(%rdi, %rdx)
+ jmp 13f
+ .p2align 4
+9:
+ cmpq $8, %rdx
+ jb 10f
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ movq 0*8(%rsi), %r11
+ movq -1*8(%rsi, %rdx), %r10
+ movq %r11, 0*8(%rdi)
+ movq %r10, -1*8(%rdi, %rdx)
+ jmp 13f
+10:
+ cmpq $4, %rdx
+ jb 11f
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ movl (%rsi), %r11d
+ movl -4(%rsi, %rdx), %r10d
+ movl %r11d, (%rdi)
+ movl %r10d, -4(%rdi, %rdx)
+ jmp 13f
+11:
+ cmp $2, %rdx
+ jb 12f
+ /*
+ * Move data from 2 bytes to 3 bytes.
+ */
+ movw (%rsi), %r11w
+ movw -2(%rsi, %rdx), %r10w
+ movw %r11w, (%rdi)
+ movw %r10w, -2(%rdi, %rdx)
+ jmp 13f
+12:
+ cmp $1, %rdx
+ jb 13f
+ /*
+ * Move data for 1 byte.
+ */
+ movb (%rsi), %r11b
+ movb %r11b, (%rdi)
+13:
+ RET