1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
|
/*
* This file is part of the coreboot project.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* Optimized assembly for low-level CPU operations on ARM64 processors.
*/
#include <arch/asm.h>
#include <arch/cache.h>
/*
 * dcache_apply_all crm
 *
 * Expands to a complete routine body (it ends in ret) that applies one
 * "dc <crm>" set/way operation to every line of every data or unified cache
 * level below the Level of Coherence (LoC) reported by CLIDR_EL1.
 *
 * crm: name of the DC set/way operation to issue on each line. This file
 *      instantiates the macro with isw, csw and cisw.
 *
 * The expansion works purely in registers and performs no loads or stores.
 * It takes no register inputs and returns nothing.
 *
 * Clobbers: x0-x5, x7-x11, x16, x17 and the condition flags.
 * Selects a cache level through CSSELR_EL1 and leaves it pointing at the
 * last level visited.
 */
.macro dcache_apply_all crm
dsb sy
mrs x0, clidr_el1 // read CLIDR
and w3, w0, #0x07000000 // narrow to LoC
lsr w3, w3, #23 // w3 = 2 * LoC (same scaling as w10 below)
cbz w3, 5f //done: LoC == 0, nothing to maintain
mov w10, #0 // w10 = 2 * cache level
mov w8, #1 // w8 = constant 0b1
1: //next_level
add w2, w10, w10, lsr #1 // calculate 3 * cache level
lsr w1, w0, w2 // extract 3-bit cache type for this level
and w1, w1, #0x7 // w1 = cache type
cmp w1, #2 // is it data or i&d?
b.lt 4f //skip: type < 2 means no data cache at this level
// w10 already has the level in bits [3:1] and bit 0 clear, which is the
// value written to CSSELR_EL1 to select this level.
msr csselr_el1, x10 // select current cache level
isb // sync change of csselr
mrs x1, ccsidr_el1 // w1 = read ccsidr
and w2, w1, #7 // w2 = log2(linelen_bytes) - 4
add w2, w2, #4 // w2 = log2(linelen_bytes)
ubfx w4, w1, #3, #10 // w4 = associativity - 1 (also
// max way number)
clz w5, w4 // w5 = 32 - log2(ways)
// (bit position of way in DC)
lsl w9, w4, w5 // w9 = max way number
// (aligned for DC)
lsl w16, w8, w5 // w16 = amount to decrement (way
// number per iteration)
2: //next_way
// The set counter is rebuilt from CCSIDR for every way, because the inner
// loop counts w7 down past zero.
ubfx w7, w1, #13, #15 // w7 = max set #, right aligned
lsl w7, w7, w2 // w7 = max set #, DC aligned
lsl w17, w8, w2 // w17 = amount to decrement (set
// number per iteration)
3: //next_set
orr w11, w10, w9 // w11 = combine way # & cache #
orr w11, w11, w7 // ... and set #
dc \crm, x11 // clean and/or invalidate line
subs w7, w7, w17 // decrement set number
b.ge 3b //next_set
// The way field sits in the top bits of w9, so a 32-bit signed compare
// would misread large way numbers as negative. w9 and w16 were written
// through their w views (upper halves zero), so the 64-bit subtraction
// only goes negative after way 0 has been processed.
subs x9, x9, x16 // decrement way number
b.ge 2b //next_way
4: //skip
add w10, w10, #2 // increment 2 *cache level
cmp w3, w10 // Went beyond LoC?
b.gt 1b //next_level
5: //done
dsb sy
isb
ret
.endm
/*
 * dcache_invalidate_all
 *
 * Instantiates dcache_apply_all with the "isw" operation, i.e. issues
 * "dc isw" on every set/way the macro visits. The macro supplies the whole
 * body, including the final ret.
 *
 * Inputs: none. Clobbers: whatever dcache_apply_all clobbers.
 */
ENTRY(dcache_invalidate_all)
dcache_apply_all isw
ENDPROC(dcache_invalidate_all)
/*
 * dcache_clean_all
 *
 * Instantiates dcache_apply_all with the "csw" operation, i.e. issues
 * "dc csw" on every set/way the macro visits. The macro supplies the whole
 * body, including the final ret.
 *
 * Inputs: none. Clobbers: whatever dcache_apply_all clobbers.
 */
ENTRY(dcache_clean_all)
dcache_apply_all csw
ENDPROC(dcache_clean_all)
/*
 * dcache_clean_invalidate_all
 *
 * Instantiates dcache_apply_all with the "cisw" operation, i.e. issues
 * "dc cisw" on every set/way the macro visits. The macro supplies the whole
 * body, including the final ret.
 *
 * Inputs: none. Clobbers: whatever dcache_apply_all clobbers.
 */
ENTRY(dcache_clean_invalidate_all)
dcache_apply_all cisw
ENDPROC(dcache_clean_invalidate_all)
/*
 * mmu_disable
 *
 * Clears the SCTLR_C and SCTLR_M bits in SCTLR_EL3 and then calls
 * dcache_clean_invalidate_all.
 *
 * This must be implemented in assembly to ensure there are no accesses to
 * memory (e.g. the stack) in between disabling and flushing the cache: the
 * only store happens before SCTLR_EL3 is written, and the only load happens
 * after the flush routine has returned.
 *
 * The return address is kept in the 8 bytes directly below sp; sp itself is
 * not moved.
 * NOTE(review): this relies on nothing else writing below sp while the
 * routine runs - confirm for the calling environment.
 *
 * Inputs: none. Clobbers: x0, x1 and everything clobbered by
 * dcache_clean_invalidate_all. x30 is restored before returning.
 */
ENTRY(mmu_disable)
str x30, [sp, #-0x8]
mrs x0, sctlr_el3
mov x1, #~(SCTLR_C | SCTLR_M)
and x0, x0, x1
msr sctlr_el3, x0
isb
/* bl overwrites x30, which is why it was saved below sp on entry. */
bl dcache_clean_invalidate_all
ldr x30, [sp, #-0x8]
ret
ENDPROC(mmu_disable)
/*
 * arm64_init_cpu
 *
 * Bring an ARMv8 processor we just gained control of (e.g. from IROM) into a
 * known state regarding caches/SCTLR/PSTATE. In order, this routine:
 *   - selects SP_EL0 and unmasks all exceptions,
 *   - enables the icache (SCTLR_EL3 bit 12),
 *   - invalidates the dcache via dcache_invalidate_all,
 *   - clears the MMU (0), alignment check (1), dcache (2), endianness (25)
 *     and write-xor-execute (19) bits and sets the stack alignment check
 *     bit (3) in SCTLR_EL3,
 *   - invalidates the icache and the EL3 TLB,
 *   - fills the region from _stack to _estack with 0xdeadbeefdeadbeef,
 *   - sets sp to _estack - 16 and returns to the caller.
 *
 * Inputs: x30 = return address.
 * Clobbers: x0-x2, x22, x23, sp, the condition flags, and everything
 * clobbered by dcache_invalidate_all. x22 and x23 are deliberately used
 * across the call because that routine does not touch them.
 */
ENTRY(arm64_init_cpu)
/* Initialize PSTATE (unmask all exceptions, select SP_EL0). */
msr SPSel, #0
msr DAIFClr, #0xf
/* TODO: This is where we'd put non-boot CPUs into WFI if needed. */
/* x22: SCTLR, return address: x23 (callee-saved by subroutine) */
mov x23, x30
/* TODO: Assert that we always start running at EL3 */
mrs x22, sctlr_el3
/* Activate ICache (12) already for speed during cache flush below. */
orr x22, x22, #(1 << 12)
msr sctlr_el3, x22
isb
/* Invalidate dcache */
bl dcache_invalidate_all
/* Deactivate MMU (0), Alignment Check (1) and DCache (2) */
and x22, x22, # ~(1 << 0) & ~(1 << 1) & ~(1 << 2)
/* Activate Stack Alignment (3) because why not */
orr x22, x22, #(1 << 3)
/* Set to little-endian (25) */
and x22, x22, # ~(1 << 25)
/* Deactivate write-xor-execute enforcement (19) */
and x22, x22, # ~(1 << 19)
msr sctlr_el3, x22
/* Invalidate icache and TLB for good measure */
ic iallu
tlbi alle3
dsb sy
isb
/* Initialize stack with sentinel value to later check overflow. */
ldr x2, =0xdeadbeefdeadbeef
ldr x0, =_stack
ldr x1, =_estack
/*
 * Store-then-test loop writing 16 bytes per iteration. It exits only when
 * x0 lands exactly on _estack, so it assumes _estack - _stack is a non-zero
 * multiple of 16.
 * NOTE(review): confirm the linker script guarantees this.
 */
1:
stp x2, x2, [x0], #16
cmp x0, x1
bne 1b
/* Leave a line of beef dead for easier visibility in stack dumps. */
/* x0 == _estack here, so sp ends up at _estack - 16. */
sub sp, x0, #16
/* Return through x23: x30 was overwritten by the bl above. */
ret x23
ENDPROC(arm64_init_cpu)
|