src/cpu/x86/sipi_vector.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249

/* SPDX-License-Identifier: GPL-2.0-only */

#include <cpu/x86/cpu_info.S.inc>
#include <cpu/x86/cr.h>
#include <cpu/amd/mtrr.h>
#include <cpu/x86/msr.h>
#include <arch/ram_segs.h>

#define __RAMSTAGE__

/* The SIPI vector is responsible for initializing the APs in the system. It
 * loads microcode, sets up MSRs, and enables caching before calling into
 * C code. */

.section ".module_parameters", "aw", @progbits
ap_start_params:
gdtaddr:
.word 0 /* limit */
.long 0 /* table */
.word 0 /* unused */
idt_ptr:
.long 0
per_cpu_segment_descriptors:
.long 0
per_cpu_segment_selector:
.long 0
stack_top:
.long 0
stack_size:
.long 0
microcode_lock:
.long 0
microcode_ptr:
.long 0
msr_table_ptr:
.long 0
msr_count:
.long 0
c_handler:
.long 0
ap_count:
.long 0

#define CR0_CLEAR_FLAGS_CACHE_ENABLE (CR0_CD | CR0_NW)
#define CR0_SET_FLAGS (CR0_CLEAR_FLAGS_CACHE_ENABLE | CR0_PE)
#define CR0_CLEAR_FLAGS \
	(CR0_PG | CR0_AM | CR0_WP | CR0_NE | CR0_TS | CR0_EM | CR0_MP)

.text
.code16
.global _start
_start:
	cli
	xorl	%eax, %eax
	movl	%eax, %cr3    /* Invalidate TLB*/

	/* On hyper threaded cpus, invalidating the cache here is
	 * very very bad.  Don't.
	 */

	/* setup the data segment */
	movw	%cs, %ax
	movw	%ax, %ds

	/* The gdtaddr needs to be relative to the data segment in order
	 * to properly dereference it. The .text section comes first in an
	 * rmodule so _start can be used as a proxy for the load address. */
	movl	$(gdtaddr), %ebx
	sub	$(_start), %ebx

	lgdtl	(%ebx)

	movl	%cr0, %eax
	andl	$~CR0_CLEAR_FLAGS, %eax
	orl	$CR0_SET_FLAGS, %eax
	movl	%eax, %cr0

	ljmpl	$RAM_CODE_SEG, $1f
1:
	.code32
	movw	$RAM_DATA_SEG, %ax
	movw	%ax, %ds
	movw	%ax, %es
	movw	%ax, %ss
	xor	%ax, %ax /* zero out the gs and fs segment index */
	movw	%ax, %fs
	movw	%ax, %gs /* Will be used for cpu_info */

	/* Load the Interrupt descriptor table */
	mov	idt_ptr, %ebx
	lidt	(%ebx)

1:
	/* Obtain CPU number. */
	movl	ap_count, %ecx
	inc	%ecx
	lock cmpxchg %ecx, ap_count
	jnz	1b

	/* Setup stacks for each CPU. */
	movl	stack_size, %eax
	mul	%ecx
	movl	stack_top, %edx
	subl	%eax, %edx
	mov	%edx, %esp

	push_cpu_info index=%ecx
	push_per_cpu_segment_data

	/*
	 * Update the AP's per_cpu_segment_descriptor to point to the
	 * per_cpu_segment_data that was allocated on the stack.
	 */
	set_segment_descriptor_base per_cpu_segment_descriptors, %esp, %ecx

	mov	%ecx, %eax
	shl	$3, %eax /* The index is << 3 in the segment selector */
	add	per_cpu_segment_selector, %eax
	mov	%eax, %gs

	/*
	 * The following code only needs to run on Intel platforms and thus the caller
	 * doesn't provide a microcode_ptr if not on Intel.
	 * On Intel platforms which update microcode using FIT the version check will
	 * also skip the microcode update.
	 */

	/* Determine if one should check microcode versions. */
	mov	microcode_ptr, %edi
	test	%edi, %edi
	jz	microcode_done /* Bypass if no microde exists. */

	/* Get the Microcode version. */
	xorl	%eax, %eax
	xorl	%edx, %edx
	movl	$IA32_BIOS_SIGN_ID, %ecx
	wrmsr
	mov	$1, %eax
	cpuid
	mov	$IA32_BIOS_SIGN_ID, %ecx
	rdmsr
	/* If something already loaded skip loading again. */
	test	%edx, %edx
	jnz	microcode_done

	/*
	 * Intel SDM and various BWGs specify to use a semaphore to update microcode
	 * on one thread per core on Hyper-Threading enabled CPUs. Due to this complex
	 * code would be necessary to determine the core #ID, initializing and picking
	 * the right semaphore out of CONFIG_MAX_CPUS / 2.
	 * Instead of the per core approachm, as recommended, use one global spinlock.
	 * Assuming that only pre-FIT platforms with Hyper-Threading enabled and at
	 * most 8 threads will ever run into this condition, the boot delay is negligible.
	 */

	/* Determine if parallel microcode loading is allowed. */
	cmpl	$0xffffffff, microcode_lock
	je	load_microcode

	/* Protect microcode loading. */
lock_microcode:
	lock btsl $0, microcode_lock
	jc	lock_microcode

load_microcode:
	/* Load new microcode. */
	mov	$IA32_BIOS_UPDT_TRIG, %ecx
	xor	%edx, %edx
	mov	%edi, %eax
	/* The microcode pointer is passed in pointing to the header. Adjust
	 * pointer to reflect the payload (header size is 48 bytes). */
	add	$48, %eax
	pusha
	wrmsr
	popa

	/* Unconditionally unlock microcode loading. */
	cmpl	$0xffffffff, microcode_lock
	je	microcode_done

	xor	%eax, %eax
	mov	%eax, microcode_lock

microcode_done:
	/*
	 * Load MSRs. Each entry in the table consists of:
	 * 0: index,
	 * 4: value[31:0]
	 * 8: value[63:32]
	 */
	mov	msr_table_ptr, %edi
	mov	msr_count, %ebx
	test	%ebx, %ebx
	jz	1f

#if CONFIG(X86_AMD_FIXED_MTRRS)
	/* Allow modification of RdDram and WrDram bits */
	mov	$SYSCFG_MSR, %ecx
	rdmsr
	or	$SYSCFG_MSR_MtrrFixDramModEn, %eax
	wrmsr
#endif

load_msr:
	mov	(%edi), %ecx
	mov	4(%edi), %eax
	mov	8(%edi), %edx
	wrmsr
	add	$12, %edi
	dec	%ebx
	jnz	load_msr

#if CONFIG(X86_AMD_FIXED_MTRRS)
	mov	$SYSCFG_MSR, %ecx
	rdmsr
	and	$~SYSCFG_MSR_MtrrFixDramModEn, %eax
	wrmsr
#endif

1:
	/* Enable caching. */
	mov	%cr0, %eax
	and	$~(CR0_CLEAR_FLAGS_CACHE_ENABLE), %eax
	mov	%eax, %cr0

#if CONFIG(SSE)
	/* Enable sse instructions. */
	mov	%cr4, %eax
	orl	$(CR4_OSFXSR | CR4_OSXMMEXCPT), %eax
	mov	%eax, %cr4
#endif

	andl	$0xfffffff0, %esp /* ensure stack alignment */

#if ENV_X86_64
	/* entry64.inc preserves ebx. */
#include <cpu/x86/64bit/entry64.inc>

	movabs	c_handler, %eax
	call	*%rax
#else
	mov	c_handler, %eax
	call	*%eax
#endif


halt_jump:
	hlt
	jmp	halt_jump