author    David Hendricks <dhendrix@chromium.org>    2013-03-06 20:43:55 -0800
committer Ronald G. Minnich <rminnich@gmail.com>     2013-03-08 23:14:26 +0100
commit    ae0e8d3613ad9cb6872c58cd95fc9774b3b17f5b (patch)
tree      452b6c60e2f9bf9f5d0c373a27eed092675e13e6
parent    31c5e07a04e90c03822d216d2dc92454b42e21ce (diff)
Eliminate do_div().
This eliminates the use of do_div() in favor of using libgcc functions.

This was tested by building and booting on Google Snow (ARMv7) and
Qemu (x86). printk()s which use division in vtxprintf() look good.

Change-Id: Icad001d84a3c05bfbf77098f3d644816280b4a4d
Signed-off-by: Gabe Black <gabeblack@chromium.org>
Signed-off-by: David Hendricks <dhendrix@chromium.org>
Reviewed-on: http://review.coreboot.org/2606
Tested-by: build bot (Jenkins)
Reviewed-by: Paul Menzel <paulepanter@users.sourceforge.net>
Reviewed-by: Ronald G. Minnich <rminnich@gmail.com>
-rw-r--r--  src/arch/armv7/Makefile.inc     |   2
-rw-r--r--  src/arch/armv7/include/div64.h  | 233
-rw-r--r--  src/arch/armv7/lib/Makefile.inc |   2
-rw-r--r--  src/arch/armv7/lib/div64.S      | 208
-rw-r--r--  src/arch/x86/Makefile.inc       |   8
-rw-r--r--  src/arch/x86/include/div64.h    |  30
-rw-r--r--  src/console/vtxprintf.c         |  17
-rw-r--r--  src/cpu/x86/smm/Makefile.inc    |   4
-rw-r--r--  src/lib/Makefile.inc            |   2
9 files changed, 11 insertions(+), 495 deletions(-)
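The heart of the change: do_div() took a 64-bit dividend, modified it in place, and returned the 32-bit remainder, whereas the new code uses plain C '/' and '%' on 64-bit values and lets the compiler lower them to libgcc helpers (__udivdi3, __umoddi3 and friends) on 32-bit targets. A minimal sketch of the two styles, not part of the patch (div64_example is a hypothetical name):

#include <stdint.h>

/*
 * Old style (see the removed div64.h below):
 *
 *     uint64_t n = 12345;
 *     uint32_t rem = do_div(n, 10);   // n becomes 1234, rem becomes 5
 *
 * New style: ordinary 64-bit division; on ARMv7 and x86-32 the
 * compiler emits calls to libgcc's __udivdi3/__umoddi3 for this.
 */
static inline uint32_t div64_example(uint64_t *n, uint32_t base)
{
	uint32_t rem = *n % base;	/* __umoddi3 on 32-bit targets */
	*n /= base;			/* __udivdi3 on 32-bit targets */
	return rem;
}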
diff --git a/src/arch/armv7/Makefile.inc b/src/arch/armv7/Makefile.inc
index e708f6d818..39c115ef8f 100644
--- a/src/arch/armv7/Makefile.inc
+++ b/src/arch/armv7/Makefile.inc
@@ -124,7 +124,7 @@ endif
$(objgenerated)/coreboot_ram.o: $(stages_o) $$(ramstage-objs) $(LIBGCC_FILE_NAME)
@printf " CC $(subst $(obj)/,,$(@))\n"
ifeq ($(CONFIG_COMPILER_LLVM_CLANG),y)
- $(LD) -m -m armelf_linux_eabi -r -o $@ --wrap __divdi3 --wrap __udivdi3 --wrap __moddi3 --wrap __umoddi3 --wrap __uidiv --wrap __do_div64 --start-group $(ramstage-objs) $(LIBGCC_FILE_NAME) --end-group
+ $(LD) -m -m armelf_linux_eabi -r -o $@ --wrap __divdi3 --wrap __udivdi3 --wrap __moddi3 --wrap __umoddi3 --wrap __uidiv --start-group $(ramstage-objs) $(LIBGCC_FILE_NAME) --end-group
else
$(CC) $(CFLAGS) -nostdlib -r -o $@ -Wl,--start-group $(stages_o) $(ramstage-objs) $(LIBGCC_FILE_NAME) -Wl,--end-group
endif
diff --git a/src/arch/armv7/include/div64.h b/src/arch/armv7/include/div64.h
deleted file mode 100644
index 5baed2b294..0000000000
--- a/src/arch/armv7/include/div64.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* taken from linux 2.6.31.14 */
-
-#ifndef __ASM_ARM_DIV64
-#define __ASM_ARM_DIV64
-
-//#include <asm/system.h>
-//#include <linux/types.h>
-// FIXME
-
-#define __asmeq(x, y) ".ifnc " x "," y " ; .err ; .endif\n\t"
-#define __LINUX_ARM_ARCH__ 7
-
-/*
- * The semantics of do_div() are:
- *
- * uint32_t do_div(uint64_t *n, uint32_t base)
- * {
- * uint32_t remainder = *n % base;
- * *n = *n / base;
- * return remainder;
- * }
- *
- * In other words, a 64-bit dividend with a 32-bit divisor producing
- * a 64-bit result and a 32-bit remainder. To accomplish this optimally
- * we call a special __do_div64 helper with completely non standard
- * calling convention for arguments and results (beware).
- */
-
-#ifdef __ARMEB__
-#define __xh "r0"
-#define __xl "r1"
-#else
-#define __xl "r0"
-#define __xh "r1"
-#endif
-
-#define __do_div_asm(n, base) \
-({ \
- register unsigned int __base asm("r4") = base; \
- register unsigned long long __n asm("r0") = n; \
- register unsigned long long __res asm("r2"); \
- register unsigned int __rem asm(__xh); \
- asm( __asmeq("%0", __xh) \
- __asmeq("%1", "r2") \
- __asmeq("%2", "r0") \
- __asmeq("%3", "r4") \
- "bl __do_div64" \
- : "=r" (__rem), "=r" (__res) \
- : "r" (__n), "r" (__base) \
- : "ip", "lr", "cc"); \
- n = __res; \
- __rem; \
-})
-
-#if __GNUC__ < 4
-
-/*
- * gcc versions earlier than 4.0 are simply too problematic for the
- * optimized implementation below. First there is gcc PR 15089 that
- * tend to trig on more complex constructs, spurious .global __udivsi3
- * are inserted even if none of those symbols are referenced in the
- * generated code, and those gcc versions are not able to do constant
- * propagation on long long values anyway.
- */
-#define do_div(n, base) __do_div_asm(n, base)
-
-#elif __GNUC__ >= 4
-
-//#include <asm/bug.h>
-
-/*
- * If the divisor happens to be constant, we determine the appropriate
- * inverse at compile time to turn the division into a few inline
- * multiplications instead which is much faster. And yet only if compiling
- * for ARMv4 or higher (we need umull/umlal) and if the gcc version is
- * sufficiently recent to perform proper long long constant propagation.
- * (It is unfortunate that gcc doesn't perform all this internally.)
- */
-#define do_div(n, base) \
-({ \
- unsigned int __r, __b = (base); \
- if (!__builtin_constant_p(__b) || __b == 0 || \
- (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) { \
- /* non-constant divisor (or zero): slow path */ \
- __r = __do_div_asm(n, __b); \
- } else if ((__b & (__b - 1)) == 0) { \
- /* Trivial: __b is constant and a power of 2 */ \
- /* gcc does the right thing with this code. */ \
- __r = n; \
- __r &= (__b - 1); \
- n /= __b; \
- } else { \
- /* Multiply by inverse of __b: n/b = n*(p/b)/p */ \
- /* We rely on the fact that most of this code gets */ \
- /* optimized away at compile time due to constant */ \
- /* propagation and only a couple inline assembly */ \
- /* instructions should remain. Better avoid any */ \
- /* code construct that might prevent that. */ \
- unsigned long long __res, __x, __t, __m, __n = n; \
- unsigned int __c, __p, __z = 0; \
- /* preserve low part of n for reminder computation */ \
- __r = __n; \
- /* determine number of bits to represent __b */ \
- __p = 1 << __div64_fls(__b); \
- /* compute __m = ((__p << 64) + __b - 1) / __b */ \
- __m = (~0ULL / __b) * __p; \
- __m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \
- /* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \
- __x = ~0ULL / __b * __b - 1; \
- __res = (__m & 0xffffffff) * (__x & 0xffffffff); \
- __res >>= 32; \
- __res += (__m & 0xffffffff) * (__x >> 32); \
- __t = __res; \
- __res += (__x & 0xffffffff) * (__m >> 32); \
- __t = (__res < __t) ? (1ULL << 32) : 0; \
- __res = (__res >> 32) + __t; \
- __res += (__m >> 32) * (__x >> 32); \
- __res /= __p; \
- /* Now sanitize and optimize what we've got. */ \
- if (~0ULL % (__b / (__b & -__b)) == 0) { \
- /* those cases can be simplified with: */ \
- __n /= (__b & -__b); \
- __m = ~0ULL / (__b / (__b & -__b)); \
- __p = 1; \
- __c = 1; \
- } else if (__res != __x / __b) { \
- /* We can't get away without a correction */ \
- /* to compensate for bit truncation errors. */ \
- /* To avoid it we'd need an additional bit */ \
- /* to represent __m which would overflow it. */ \
- /* Instead we do m=p/b and n/b=(n*m+m)/p. */ \
- __c = 1; \
- /* Compute __m = (__p << 64) / __b */ \
- __m = (~0ULL / __b) * __p; \
- __m += ((~0ULL % __b + 1) * __p) / __b; \
- } else { \
- /* Reduce __m/__p, and try to clear bit 31 */ \
- /* of __m when possible otherwise that'll */ \
- /* need extra overflow handling later. */ \
- unsigned int __bits = -(__m & -__m); \
- __bits |= __m >> 32; \
- __bits = (~__bits) << 1; \
- /* If __bits == 0 then setting bit 31 is */ \
- /* unavoidable. Simply apply the maximum */ \
- /* possible reduction in that case. */ \
- /* Otherwise the MSB of __bits indicates the */ \
- /* best reduction we should apply. */ \
- if (!__bits) { \
- __p /= (__m & -__m); \
- __m /= (__m & -__m); \
- } else { \
- __p >>= __div64_fls(__bits); \
- __m >>= __div64_fls(__bits); \
- } \
- /* No correction needed. */ \
- __c = 0; \
- } \
- /* Now we have a combination of 2 conditions: */ \
- /* 1) whether or not we need a correction (__c), and */ \
- /* 2) whether or not there might be an overflow in */ \
- /* the cross product (__m & ((1<<63) | (1<<31))) */ \
- /* Select the best insn combination to perform the */ \
- /* actual __m * __n / (__p << 64) operation. */ \
- if (!__c) { \
- asm ( "umull %Q0, %R0, %1, %Q2\n\t" \
- "mov %Q0, #0" \
- : "=&r" (__res) \
- : "r" (__m), "r" (__n) \
- : "cc" ); \
- } else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
- __res = __m; \
- asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" \
- "mov %Q0, #0" \
- : "+&r" (__res) \
- : "r" (__m), "r" (__n) \
- : "cc" ); \
- } else { \
- asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
- "cmn %Q0, %Q1\n\t" \
- "adcs %R0, %R0, %R1\n\t" \
- "adc %Q0, %3, #0" \
- : "=&r" (__res) \
- : "r" (__m), "r" (__n), "r" (__z) \
- : "cc" ); \
- } \
- if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
- asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" \
- "umlal %R0, %Q0, %Q1, %R2\n\t" \
- "mov %R0, #0\n\t" \
- "umlal %Q0, %R0, %R1, %R2" \
- : "+&r" (__res) \
- : "r" (__m), "r" (__n) \
- : "cc" ); \
- } else { \
- asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" \
- "umlal %R0, %1, %Q2, %R3\n\t" \
- "mov %R0, #0\n\t" \
- "adds %Q0, %1, %Q0\n\t" \
- "adc %R0, %R0, #0\n\t" \
- "umlal %Q0, %R0, %R2, %R3" \
- : "+&r" (__res), "+&r" (__z) \
- : "r" (__m), "r" (__n) \
- : "cc" ); \
- } \
- __res /= __p; \
- /* The reminder can be computed with 32-bit regs */ \
- /* only, and gcc is good at that. */ \
- { \
- unsigned int __res0 = __res; \
- unsigned int __b0 = __b; \
- __r -= __res0 * __b0; \
- } \
- /* BUG_ON(__r >= __b || __res * __b + __r != n); */ \
- n = __res; \
- } \
- __r; \
-})
-
-/* our own fls implementation to make sure constant propagation is fine */
-#define __div64_fls(bits) \
-({ \
- unsigned int __left = (bits), __nr = 0; \
- if (__left & 0xffff0000) __nr += 16, __left >>= 16; \
- if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \
- if (__left & 0x000000f0) __nr += 4, __left >>= 4; \
- if (__left & 0x0000000c) __nr += 2, __left >>= 2; \
- if (__left & 0x00000002) __nr += 1; \
- __nr; \
-})
-
-#endif
-
-#endif
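Most of the removed header is the classic constant-divisor optimization: when the divisor is known at compile time, the division is turned into a multiply by a precomputed reciprocal plus a shift. A rough illustration of the underlying trick for a 32-bit dividend (div10 is a hypothetical helper, not taken from the removed code, which generalizes this to 64-bit dividends):

#include <stdint.h>

/*
 * Reciprocal multiplication: 0xCCCCCCCD == ceil(2^35 / 10), so
 * ((uint64_t)n * 0xCCCCCCCD) >> 35 equals n / 10 for every 32-bit n.
 * The deleted do_div() macros computed such constants automatically
 * and emitted umull/umlal sequences for the 64-bit-dividend case.
 */
static inline uint32_t div10(uint32_t n)
{
	return (uint32_t)(((uint64_t)n * 0xCCCCCCCDu) >> 35);
}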
diff --git a/src/arch/armv7/lib/Makefile.inc b/src/arch/armv7/lib/Makefile.inc
index 388864aa28..508f7766f7 100644
--- a/src/arch/armv7/lib/Makefile.inc
+++ b/src/arch/armv7/lib/Makefile.inc
@@ -6,12 +6,10 @@ bootblock-y += cache-cp15.c
romstage-y += cache_v7.c
romstage-y += cache-cp15.c
romstage-y += div0.c
-romstage-y += div64.S
romstage-y += syslib.c
romstage-$(CONFIG_EARLY_CONSOLE) += early_console.c
ramstage-y += div0.c
-ramstage-y += div64.S
#ramstage-y += interrupts.c
#ramstage-y += memcpy.S
#ramstage-y += memset.S
diff --git a/src/arch/armv7/lib/div64.S b/src/arch/armv7/lib/div64.S
deleted file mode 100644
index 44edf480ca..0000000000
--- a/src/arch/armv7/lib/div64.S
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * linux/arch/arm/lib/div64.S
- *
- * Optimized computation of 64-bit dividend / 32-bit divisor
- *
- * Author: Nicolas Pitre
- * Created: Oct 5, 2003
- * Copyright: Monta Vista Software, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-// FIXME
-//#include <linux/linkage.h>
-#define __LINUX_ARM_ARCH__ 7
-
-#ifdef __ARMEB__
-#define xh r0
-#define xl r1
-#define yh r2
-#define yl r3
-#else
-#define xl r0
-#define xh r1
-#define yl r2
-#define yh r3
-#endif
-
-/*
- * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
- *
- * Note: Calling convention is totally non standard for optimal code.
- * This is meant to be used by do_div() from include/asm/div64.h only.
- *
- * Input parameters:
- * xh-xl = dividend (clobbered)
- * r4 = divisor (preserved)
- *
- * Output values:
- * yh-yl = result
- * xh = remainder
- *
- * Clobbered regs: xl, ip
- */
-
-
-.globl __do_div64;
-.align 4,0x90
-__do_div64:
-
- @ Test for easy paths first.
- subs ip, r4, #1
- bls 9f @ divisor is 0 or 1
- tst ip, r4
- beq 8f @ divisor is power of 2
-
- @ See if we need to handle upper 32-bit result.
- cmp xh, r4
- mov yh, #0
- blo 3f
-
- @ Align divisor with upper part of dividend.
- @ The aligned divisor is stored in yl preserving the original.
- @ The bit position is stored in ip.
-
-#if __LINUX_ARM_ARCH__ >= 5
-
- clz yl, r4
- clz ip, xh
- sub yl, yl, ip
- mov ip, #1
- mov ip, ip, lsl yl
- mov yl, r4, lsl yl
-
-#else
-
- mov yl, r4
- mov ip, #1
-1: cmp yl, #0x80000000
- cmpcc yl, xh
- movcc yl, yl, lsl #1
- movcc ip, ip, lsl #1
- bcc 1b
-
-#endif
-
- @ The division loop for needed upper bit positions.
- @ Break out early if dividend reaches 0.
-2: cmp xh, yl
- orrcs yh, yh, ip
- subcss xh, xh, yl
- movnes ip, ip, lsr #1
- mov yl, yl, lsr #1
- bne 2b
-
- @ See if we need to handle lower 32-bit result.
-3: cmp xh, #0
- mov yl, #0
- cmpeq xl, r4
- movlo xh, xl
- movlo pc, lr
-
- @ The division loop for lower bit positions.
- @ Here we shift remainer bits leftwards rather than moving the
- @ divisor for comparisons, considering the carry-out bit as well.
- mov ip, #0x80000000
-4: movs xl, xl, lsl #1
- adcs xh, xh, xh
- beq 6f
- cmpcc xh, r4
-5: orrcs yl, yl, ip
- subcs xh, xh, r4
- movs ip, ip, lsr #1
- bne 4b
- mov pc, lr
-
- @ The top part of remainder became zero. If carry is set
- @ (the 33th bit) this is a false positive so resume the loop.
- @ Otherwise, if lower part is also null then we are done.
-6: bcs 5b
- cmp xl, #0
- moveq pc, lr
-
- @ We still have remainer bits in the low part. Bring them up.
-
-#if __LINUX_ARM_ARCH__ >= 5
-
- clz xh, xl @ we know xh is zero here so...
- add xh, xh, #1
- mov xl, xl, lsl xh
- mov ip, ip, lsr xh
-
-#else
-
-7: movs xl, xl, lsl #1
- mov ip, ip, lsr #1
- bcc 7b
-
-#endif
-
- @ Current remainder is now 1. It is worthless to compare with
- @ divisor at this point since divisor can not be smaller than 3 here.
- @ If possible, branch for another shift in the division loop.
- @ If no bit position left then we are done.
- movs ip, ip, lsr #1
- mov xh, #1
- bne 4b
- mov pc, lr
-
-8: @ Division by a power of 2: determine what that divisor order is
- @ then simply shift values around
-
-#if __LINUX_ARM_ARCH__ >= 5
-
- clz ip, r4
- rsb ip, ip, #31
-
-#else
-
- mov yl, r4
- cmp r4, #(1 << 16)
- mov ip, #0
- movhs yl, yl, lsr #16
- movhs ip, #16
-
- cmp yl, #(1 << 8)
- movhs yl, yl, lsr #8
- addhs ip, ip, #8
-
- cmp yl, #(1 << 4)
- movhs yl, yl, lsr #4
- addhs ip, ip, #4
-
- cmp yl, #(1 << 2)
- addhi ip, ip, #3
- addls ip, ip, yl, lsr #1
-
-#endif
-
- mov yh, xh, lsr ip
- mov yl, xl, lsr ip
- rsb ip, ip, #32
- orr yl, yl, xh, lsl ip
- mov xh, xl, lsl ip
- mov xh, xh, lsr ip
- mov pc, lr
-
- @ eq -> division by 1: obvious enough...
-9: moveq yl, xl
- moveq yh, xh
- moveq xh, #0
- moveq pc, lr
-
- @ Division by 0:
- str lr, [sp, #-8]!
- bl __div0
-
- @ as wrong as it could be...
- mov yl, #0
- mov yh, #0
- mov xh, #0
- ldr pc, [sp], #8
-
-@.type __do_div64, @function;
-@.size __do_div64, .-__do_div64,
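The deleted div64.S provided __do_div64, a 64-by-32 division with a deliberately non-standard register calling convention (dividend in r0/r1, divisor in r4, quotient in r2/r3, remainder in r0 or r1). Algorithmically it is shift-and-subtract long division, with shortcuts for power-of-two and small divisors; a plain C equivalent of the general case might look like the sketch below (udiv64_32 is a hypothetical name; the actual replacement is simply libgcc's __udivdi3/__umoddi3):

#include <stdint.h>

/*
 * Schoolbook binary long division: 64-bit dividend, 32-bit divisor,
 * 64-bit quotient plus 32-bit remainder, i.e. the same contract
 * __do_div64 implemented in registers.  base must be non-zero
 * (the assembly branched to __div0 in that case).
 */
static uint64_t udiv64_32(uint64_t n, uint32_t base, uint32_t *rem)
{
	uint64_t quot = 0, r = 0;
	int i;

	for (i = 63; i >= 0; i--) {
		r = (r << 1) | ((n >> i) & 1);
		if (r >= base) {
			r -= base;
			quot |= 1ULL << i;
		}
	}
	*rem = (uint32_t)r;
	return quot;
}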
-
diff --git a/src/arch/x86/Makefile.inc b/src/arch/x86/Makefile.inc
index 98723f3f1a..0892efd37b 100644
--- a/src/arch/x86/Makefile.inc
+++ b/src/arch/x86/Makefile.inc
@@ -354,9 +354,9 @@ endif
$(objcbfs)/romstage_null.debug: $$(romstage-objs) $(objgenerated)/romstage_null.ld
@printf " LINK $(subst $(obj)/,,$(@))\n"
ifeq ($(CONFIG_COMPILER_LLVM_CLANG),y)
- $(LD) -nostdlib -nostartfiles -static -o $@ -L$(obj) $(romstage-objs) -T $(objgenerated)/romstage_null.ld
+ $(LD) -nostdlib -nostartfiles -static -o $@ -L$(obj) --wrap __divdi3 --wrap __udivdi3 --wrap __moddi3 --wrap __umoddi3 --start-group $(romstage-objs) $(LIBGCC_FILE_NAME) --end-group -T $(objgenerated)/romstage_null.ld
else
- $(CC) -nostdlib -nostartfiles -static -o $@ -L$(obj) -T $(objgenerated)/romstage_null.ld $(romstage-objs)
+ $(CC) -nostdlib -nostartfiles -static -o $@ -L$(obj) -T $(objgenerated)/romstage_null.ld -Wl,--wrap,__divdi3 -Wl,--wrap,__udivdi3 -Wl,--wrap,__moddi3 -Wl,--wrap,__umoddi3 -Wl,--start-group $(romstage-objs) $(LIBGCC_FILE_NAME) -Wl,--end-group
endif
$(NM) $@ | grep -q " [DdBb] "; if [ $$? -eq 0 ]; then \
echo "Forbidden global variables in romstage:"; \
@@ -366,9 +366,9 @@ endif
$(objcbfs)/romstage_xip.debug: $$(romstage-objs) $(objgenerated)/romstage_xip.ld
@printf " LINK $(subst $(obj)/,,$(@))\n"
ifeq ($(CONFIG_COMPILER_LLVM_CLANG),y)
- $(LD) -nostdlib -nostartfiles -static -o $@ -L$(obj) $(romstage-objs) -T $(objgenerated)/romstage_xip.ld
+ $(LD) -nostdlib -nostartfiles -static -o $@ -L$(obj) --wrap __divdi3 --wrap __udivdi3 --wrap __moddi3 --wrap __umoddi3 --start-group $(romstage-objs) $(LIBGCC_FILE_NAME) --end-group -T $(objgenerated)/romstage_xip.ld
else
- $(CC) -nostdlib -nostartfiles -static -o $@ -L$(obj) -T $(objgenerated)/romstage_xip.ld $(romstage-objs)
+ $(CC) -nostdlib -nostartfiles -static -o $@ -L$(obj) -T $(objgenerated)/romstage_xip.ld -Wl,--wrap,__divdi3 -Wl,--wrap,__udivdi3 -Wl,--wrap,__moddi3 -Wl,--wrap,__umoddi3 -Wl,--start-group $(romstage-objs) $(LIBGCC_FILE_NAME) -Wl,--end-group
endif
$(objgenerated)/romstage_null.ld: $$(ldscripts) $(obj)/ldoptions
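The new --wrap options change symbol resolution: with --wrap __udivdi3, every undefined reference to __udivdi3 is resolved to __wrap___udivdi3, while __real___udivdi3 still resolves to the original libgcc routine. Presumably coreboot supplies the __wrap_* definitions itself (see the gcc.c additions at the end of this patch); the basic shape of such a wrapper, shown here as an illustrative sketch only:

/*
 * Built with: gcc ... -Wl,--wrap,__udivdi3
 * The linker rewrites calls to __udivdi3 so that they land here,
 * while __real___udivdi3 still names libgcc's implementation.
 */
unsigned long long __real___udivdi3(unsigned long long a, unsigned long long b);

unsigned long long __wrap___udivdi3(unsigned long long a, unsigned long long b)
{
	/* Forward to the real libgcc routine. */
	return __real___udivdi3(a, b);
}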
diff --git a/src/arch/x86/include/div64.h b/src/arch/x86/include/div64.h
deleted file mode 100644
index bbc9921d29..0000000000
--- a/src/arch/x86/include/div64.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef __I386_DIV64
-#define __I386_DIV64
-
-/*
- * do_div() is NOT a C function. It wants to return
- * two values (the quotient and the remainder), but
- * since that doesn't work very well in C, what it
- * does is:
- *
- * - modifies the 64-bit dividend _in_place_
- * - returns the 32-bit remainder
- *
- * This ends up being the most efficient "calling
- * convention" on x86.
- */
-#define do_div(n,base) ({ \
- unsigned long __upper, __low, __high, __mod, __base; \
- __base = (base); \
- asm("":"=a" (__low), "=d" (__high):"A" (n)); \
- __upper = __high; \
- if (__high) { \
- __upper = __high % (__base); \
- __high = __high / (__base); \
- } \
- asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
- asm("":"=A" (n):"a" (__low),"d" (__high)); \
- __mod; \
-})
-
-#endif
diff --git a/src/console/vtxprintf.c b/src/console/vtxprintf.c
index 9de25845bd..4fc6f35bd1 100644
--- a/src/console/vtxprintf.c
+++ b/src/console/vtxprintf.c
@@ -5,7 +5,6 @@
*/
#include <string.h>
-#include <div64.h>
#include <console/console.h>
#include <console/vtxprintf.h>
@@ -70,20 +69,8 @@ static int number(void (*tx_byte)(unsigned char byte),
if (num == 0)
tmp[i++]='0';
else while (num != 0){
- /* there are some nice optimizations in the
- * Macros-From-Hell that form the div64 code
- * *IF* you call it with a constant.
- * We're firmware, we only do bases
- * 8, 10, and 16. Let's be smart.
- * This greatly helps ARM, reduces the
- * code footprint at compile time, and does not hurt x86.
- */
- if (base == 10)
- tmp[i++] = digits[do_div(num,10)];
- else if (base == 8)
- tmp[i++] = digits[do_div(num,8)];
- else /* sorry, you're out of choices */
- tmp[i++] = digits[do_div(num,16)];
+ tmp[i++] = digits[num % base];
+ num /= base;
}
if (i > precision)
precision = i;
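With do_div() gone, number() extracts digits with a straight 64-bit modulo and divide on the run-time base; the special-casing of bases 8, 10 and 16 existed only to hand do_div() a constant divisor, and is no longer needed. Pulled out into a standalone helper (hypothetical name, illustrative only), the new per-digit loop amounts to:

#include <stdint.h>

/*
 * The core of number() after this patch.  tmp[] is filled
 * least-significant digit first, as in vtxprintf.c.  On ARMv7 and
 * x86-32 the '%' and '/' below become calls to __umoddi3 and
 * __udivdi3 from libgcc.
 */
static int u64_to_digits(unsigned long long num, unsigned int base, char *tmp)
{
	static const char digits[] = "0123456789abcdef";
	int i = 0;

	if (num == 0)
		tmp[i++] = '0';
	else while (num != 0) {
		tmp[i++] = digits[num % base];
		num /= base;
	}
	return i;
}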
diff --git a/src/cpu/x86/smm/Makefile.inc b/src/cpu/x86/smm/Makefile.inc
index 5f3f663233..405cf891ad 100644
--- a/src/cpu/x86/smm/Makefile.inc
+++ b/src/cpu/x86/smm/Makefile.inc
@@ -36,8 +36,8 @@ endif
smm-y += smihandler.c
smm-y += smiutil.c
-$(obj)/cpu/x86/smm/smm.o: $$(smm-objs)
- $(CC) $(LDFLAGS) -nostdlib -r -o $@ $^
+$(obj)/cpu/x86/smm/smm.o: $$(smm-objs) $(LIBGCC_FILE_NAME)
+ $(CC) $(LDFLAGS) -nostdlib -r -o $@ -Wl,--wrap,__divdi3 -Wl,--wrap,__udivdi3 -Wl,--wrap,__moddi3 -Wl,--wrap,__umoddi3 -Wl,--start-group $(smm-objs) $(LIBGCC_FILE_NAME) -Wl,--end-group
$(obj)/cpu/x86/smm/smm_wrap: $(obj)/cpu/x86/smm/smm.o $(src)/cpu/x86/smm/$(SMM_LDSCRIPT) $(obj)/ldoptions
$(CC) $(SMM_LDFLAGS) -nostdlib -nostartfiles -static -o $(obj)/cpu/x86/smm/smm.elf -T $(src)/cpu/x86/smm/$(SMM_LDSCRIPT) $(obj)/cpu/x86/smm/smm.o
diff --git a/src/lib/Makefile.inc b/src/lib/Makefile.inc
index 879ad7e190..be57f29411 100644
--- a/src/lib/Makefile.inc
+++ b/src/lib/Makefile.inc
@@ -48,6 +48,7 @@ romstage-$(CONFIG_USBDEBUG) += usbdebug.c
romstage-$(CONFIG_COLLECT_TIMESTAMPS) += timestamp.c cbmem.c
romstage-y += compute_ip_checksum.c
romstage-y += memmove.c
+romstage-$(CONFIG_ARCH_X86) += gcc.c
ramstage-y += hardwaremain.c
ramstage-y += selfboot.c
@@ -94,6 +95,7 @@ smm-y += cbfs.c memcmp.c
smm-$(CONFIG_CONSOLE_SERIAL8250) += uart8250.c
smm-$(CONFIG_CONSOLE_SERIAL8250MEM) += uart8250mem.c
smm-$(CONFIG_USBDEBUG) += usbdebug.c
+smm-y += gcc.c
$(obj)/lib/version.ramstage.o : $(obj)/build.h
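The gcc.c file added to romstage and SMM here is presumably what provides the __wrap_* symbols referenced by the new link rules, so each stage can pull the 64-bit helpers from libgcc on demand. A sketch of how a single file could cover all four wrapped helpers (the macro name and exact contents are assumptions, not quoted from coreboot's gcc.c):

/*
 * Hypothetical sketch: generate a forwarding wrapper for each libgcc
 * symbol that the Makefiles wrap with -Wl,--wrap,<sym>.
 */
#define WRAP_LIBGCC_CALL(type, name)				\
	type __real_##name(type a, type b);			\
	type __wrap_##name(type a, type b)			\
	{							\
		return __real_##name(a, b);			\
	}

WRAP_LIBGCC_CALL(long long, __divdi3)
WRAP_LIBGCC_CALL(unsigned long long, __udivdi3)
WRAP_LIBGCC_CALL(long long, __moddi3)
WRAP_LIBGCC_CALL(unsigned long long, __umoddi3)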