1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
|
/*
* Early initialization code for riscv virtual memory
*
* Copyright 2015 Google Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; version 2 of
* the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <arch/barrier.h>
#include <arch/encoding.h>
#include <arch/sbi.h>
#include <atomic.h>
#include <console/console.h>
#include <stdint.h>
#include <vm.h>
#include <symbols.h>
/*
 * Bitmask of trap causes handed to the payload; mstatus_init() applies it
 * to medeleg. One bit is set per CAUSE_* number listed below.
 *
 * The variable is deliberately writable: to temporarily disable some or all
 * delegation, change it from a debugger (e.g. 0 keeps every trap in M-mode).
 * In practice this has been a lifesaver.
 *
 * It is still not settled which delegations might be disallowed by the spec,
 * so for now every cause of interest is enumerated and set explicitly.
 */
static int delegate =
	(1 << CAUSE_MISALIGNED_FETCH) |
	(1 << CAUSE_FAULT_FETCH) |
	(1 << CAUSE_ILLEGAL_INSTRUCTION) |
	(1 << CAUSE_BREAKPOINT) |
	(1 << CAUSE_FAULT_LOAD) |
	(1 << CAUSE_FAULT_STORE) |
	(1 << CAUSE_USER_ECALL);
/*
 * Root page table built by init_vm(). init_vm() stores the pointer here just
 * before loading the same table's page number into the sptbr CSR.
 */
pte_t* root_page_table;
/*
 * Emit the indentation prefix used by the page table dump: the literal below
 * is printed once per nesting level (nothing is printed for level <= 0).
 *
 * NOTE(review): this helper was described as indenting by 2*level spaces,
 * but the literal as it stands is a single space -- confirm which is meant.
 */
static void indent(int level)
{
	int remaining = level;

	while (remaining > 0) {
		printk(BIOS_DEBUG, " ");
		remaining--;
	}
}
/*
 * Convert a page table index at a given page table level to a virtual address
 * offset.
 *
 * index: entry number inside the table; at most RISCV_PGLEVEL_BITS bits wide
 *        (not considering the leading zeroes).
 * level: depth of the table, with 0 being the root.
 *
 * Returns index moved to bit position
 * VA_BITS - (level + 1) * RISCV_PGLEVEL_BITS. For level 0 the top bit of the
 * index is additionally replicated into every bit above VA_BITS, so entries
 * in the upper half of the root table yield "negative" addresses.
 */
static uintptr_t index_to_virt_addr(int index, int level)
{
	/*
	 * First push the index towards the top of a 64-bit word. For level 0
	 * its most significant bit lands on bit 63; for deeper levels it stays
	 * below that, so no sign extension takes place later.
	 *
	 * NOTE: the constant 64 assumes a 64-bit uintptr_t.
	 */
	const int up = 64 - RISCV_PGLEVEL_BITS - level * RISCV_PGLEVEL_BITS;
	/* Then bring it back down so that bit 63 ends up at bit VA_BITS - 1. */
	const int down = 64 - VA_BITS;

	/*
	 * The left shift is done on an unsigned value: shifting a signed
	 * value into the sign bit is undefined behavior in C. Only the right
	 * shift is performed on the signed type, which the compiler implements
	 * as an arithmetic (sign-extending) shift.
	 */
	const uintptr_t raised = (uintptr_t)(intptr_t)index << up;

	return (intptr_t)raised >> down;
}
/*
 * Recursive worker for print_page_table().
 *
 * pt:        page table to dump
 * virt_addr: virtual address offset accumulated from the parent tables
 * level:     depth of this table (0 = root); also drives the indentation
 *
 * Prints a header for the table, then one line per entry that has PTE_V set.
 * Entries for which PTE_TABLE() holds are followed into the next level;
 * every other valid entry is printed with its u/r/w/x flags.
 */
static void print_page_table_at(pte_t *pt, intptr_t virt_addr, int level)
{
	int idx;

	indent(level);
	printk(BIOS_DEBUG, "Level %d page table at 0x%p\n", level, pt);

	for (idx = 0; idx < RISCV_PGSIZE / sizeof(pte_t); idx++) {
		const pte_t entry = pt[idx];

		/* Invalid entries are not reported at all. */
		if (!(entry & PTE_V))
			continue;

		/* One character per permission bit, '-' when it is clear. */
		const char perms[] = {
			(entry & PTE_U) ? 'u' : '-',
			(entry & PTE_R) ? 'r' : '-',
			(entry & PTE_W) ? 'w' : '-',
			(entry & PTE_X) ? 'x' : '-',
			'\0',
		};

		const intptr_t entry_va =
			virt_addr + index_to_virt_addr(idx, level);
		/*
		 * Drop the low 10 bits of the entry and scale the rest to a
		 * byte address (10 is presumably PTE_PPN_SHIFT -- confirm).
		 */
		const uintptr_t target =
			((uintptr_t)entry >> 10) << RISCV_PGSHIFT;

		indent(level + 1);
		printk(BIOS_DEBUG, "Valid PTE at index %d (0x%016zx -> 0x%zx), ",
		       idx, (size_t)entry_va, (size_t)target);

		if (!PTE_TABLE(entry)) {
			printk(BIOS_DEBUG, "protections %s\n", perms);
			continue;
		}

		/* Pointer to a next-level table: report it, then descend. */
		printk(BIOS_DEBUG, "page table\n");
		print_page_table_at((pte_t *)target, entry_va, level + 1);
	}
}
/* Print the page table structures to the console */
void print_page_table(void) {
print_page_table_at((void *)(read_csr(sptbr) << RISCV_PGSHIFT), 0, 0);
}
/*
 * Execute a single sfence.vm instruction. Going by the function name this is
 * the TLB flush; initVirtualMemory() calls it right after init_vm() has
 * installed the new page tables.
 */
void flush_tlb(void)
{
asm volatile("sfence.vm");
}
/*
 * Return the part of a page table entry that sits above PTE_PPN_SHIFT
 * (the page number field, going by the macro name).
 */
size_t pte_ppn(pte_t pte)
{
	const pte_t ppn = pte >> PTE_PPN_SHIFT;

	return ppn;
}
/*
 * Build an entry that carries only the page number and the valid bit, with
 * no R/W/X permissions. init_vm() uses such entries to point at next-level
 * page tables.
 *
 * ppn: page number to place at PTE_PPN_SHIFT
 */
pte_t ptd_create(uintptr_t ppn)
{
	pte_t ptd = ppn << PTE_PPN_SHIFT;

	ptd |= PTE_V;
	return ptd;
}
/*
 * Build a valid, readable page table entry for the given page number.
 *
 * ppn:  page number to place at PTE_PPN_SHIFT
 * prot: only PTE_W and PTE_X are looked at; PTE_R and PTE_V are always set
 *       and any other bit in prot (including PTE_U) is ignored
 * user: non-zero to also set PTE_U
 */
pte_t pte_create(uintptr_t ppn, int prot, int user)
{
	pte_t entry = (ppn << PTE_PPN_SHIFT) | PTE_R | PTE_V;

	entry |= (prot & PTE_W) ? PTE_W : 0;
	entry |= (prot & PTE_X) ? PTE_X : 0;
	entry |= user ? PTE_U : 0;

	return entry;
}
// The current RISCV *physical* address space is this:
// * 0 - 2 GiB: miscellaneous IO devices
// * 2 GiB - 4 GiB DRAM
// * top 2048 bytes of memory: SBI (which we round out to a 4K page)
// We have determined, also, that if code references a physical address
// not backed by a device, we'll take a fault. In other words, we don't
// need to finely map the memory-mapped devices as we would on an x86.
// We can use GiB mappings for the IO space and we will take a trap
// if we reference hardware that does not exist.
//
// The intent of the RISCV designers is that pages be set up in M mode
// for lower privilege software. They have also told me that they
// expect, unlike other platforms, that next level software use these
// page tables. Some kernels (Linux) prefer the old fashioned model,
// where kernel starts with an identity (ID) map and sets up page tables as
// it sees fit. Other kernels (harvey) are fine with using whatever
// firmware sets up. We need to accommodate both. So, we set up the
// identity map for Linux, but also set up the map for kernels that
// are more willing to conform to the RISCV model. The map is as
// follows:
//
// ID map: map IO space and all of DRAM 1:1 using 1 GiB PTEs
// I.e. we use 1 GiB PTEs for 4 GiB.
// Linux/BSD uses this mapping just enough to replace it.
//
// The SBI page is the last page in the 64 bit address space.
// map that using the middle_pts shown below.
//
// Top 2G map, including SBI page: map the 2 GiB - 4 GiB of physical
// address space to 0xffffffff_80000000. This will be needed until the
// GNU toolchain can compile code to run at 0xffffffc000000000,
// i.e. the start of Sv39.
//
// Only Harvey/Plan 9 uses this mapping, and only temporarily. It can
// never be fully removed as we need the 4KiB mapping for the SBI page.
//
// standard RISCV map long term: Map IO space, and all of DRAM, to the *lowest*
// possible negative address for this implementation,
// e.g. 0xffffffc000000000 for Sv39 CPUs. For now we can use GiB PTEs.
//
// RISCV map for now: map IO space, and all of DRAM, starting at
// 0xffff_ffc0_0000_0000, i.e. just as for Sv39.
//
// It is our intent on Harvey (and eventually Akaros) that we use
// this map, once the toolchain can correctly support it.
// We have tested this arrangement and it lets us boot harvey to user mode.
// Build the page tables and load the root table into sptbr.
//
// virtMemStart: virtual address at which the DRAM window starts
// physMemStart: physical address that virtMemStart is mapped to
// sbi_pt:       start of the memory used for all page tables. Four
//               consecutive pages of RISCV_PGSIZE bytes are written:
//               the SBI page table, num_middle_pts (2) middle page tables
//               and finally the root page table.
//               (presumably this must be page aligned -- confirm with caller)
//
// Side effects: zeroes and fills those four pages, sets root_page_table and
// writes the sptbr CSR. The TLB flush is left to the caller.
void init_vm(uintptr_t virtMemStart, uintptr_t physMemStart, pte_t *sbi_pt)
{
// The SBI page table is the first page of the region.
memset(sbi_pt, 0, RISCV_PGSIZE);
// need to leave room for sbi page
// 0xFFF... - 0xFFFFFFFF81000000 - RISCV_PGSIZE
intptr_t memorySize = 0x7F000000;
// middle page table
// (starts one page after sbi_pt; the void* cast makes the offset count bytes)
pte_t* middle_pt = (void*)sbi_pt + RISCV_PGSIZE;
size_t num_middle_pts = 2; // 3 level page table, 39 bit virtual address space for now
// root page table
// (placed directly behind the middle page tables)
pte_t* root_pt = (void*)middle_pt + num_middle_pts * RISCV_PGSIZE;
memset(middle_pt, 0, (num_middle_pts + 1) * RISCV_PGSIZE); // 0's out middle_pt and root_pt
// Hook the middle page tables into the last num_middle_pts slots of the
// root table; consecutive middle tables have consecutive page numbers.
for (size_t i = 0; i < num_middle_pts; i++)
root_pt[(1<<RISCV_PGLEVEL_BITS)-num_middle_pts+i] = ptd_create(((uintptr_t)middle_pt >> RISCV_PGSHIFT) + i);
// fill the middle page table
// One entry is written per SUPERPAGE_SIZE step until memorySize bytes of
// physical memory, starting at physMemStart, are covered.
for (uintptr_t vaddr = virtMemStart, paddr = physMemStart;
paddr < physMemStart + memorySize;
vaddr += SUPERPAGE_SIZE, paddr += SUPERPAGE_SIZE) {
// Slot of virtMemStart within its middle table, plus the distance of
// vaddr from virtMemStart counted in units of 1 << l2_shift bytes.
// The index may run past the first middle table into the second one,
// which is contiguous in memory.
int l2_shift = RISCV_PGLEVEL_BITS + RISCV_PGSHIFT;
size_t l2_idx = (virtMemStart >> l2_shift) & ((1 << RISCV_PGLEVEL_BITS)-1);
l2_idx += ((vaddr - virtMemStart) >> l2_shift);
// NOTE(review): pte_create() only looks at PTE_W/PTE_X in prot and user
// is 0 here, so PTE_U does not end up in the entry -- confirm intent.
middle_pt[l2_idx] = pte_create(paddr >> RISCV_PGSHIFT,
PTE_U|PTE_R|PTE_W|PTE_X, 0);
}
// map SBI at top of vaddr space
// only need to map a single page for sbi interface
uintptr_t num_sbi_pages = 1;
uintptr_t sbiStartAddress = (uintptr_t) &sbi_page;
uintptr_t sbiAddr = sbiStartAddress;
// The SBI pages occupy the last num_sbi_pages slots of sbi_pt and are
// mapped readable and executable, not writable.
for (uintptr_t i = 0; i < num_sbi_pages; i++) {
uintptr_t idx = (1 << RISCV_PGLEVEL_BITS) - num_sbi_pages + i;
sbi_pt[idx] = pte_create(sbiAddr >> RISCV_PGSHIFT,
PTE_R|PTE_X, 0);
sbiAddr += RISCV_PGSIZE;
}
// The very last entry of the last middle page table points at the SBI
// page table.
pte_t* sbi_pte = middle_pt + ((num_middle_pts << RISCV_PGLEVEL_BITS)-1);
*sbi_pte = ptd_create((uintptr_t)sbi_pt >> RISCV_PGSHIFT);
// IO space. Identity mapped.
// Root slots 0-3 map physical 0x00000000, 0x40000000, 0x80000000 and
// 0xc0000000 onto themselves; only the upper two are executable.
root_pt[0x000] = pte_create(0x00000000 >> RISCV_PGSHIFT,
PTE_R | PTE_W, 0);
root_pt[0x001] = pte_create(0x40000000 >> RISCV_PGSHIFT,
PTE_R | PTE_W, 0);
root_pt[0x002] = pte_create(0x80000000 >> RISCV_PGSHIFT,
PTE_R | PTE_W | PTE_X, 0);
root_pt[0x003] = pte_create(0xc0000000 >> RISCV_PGSHIFT,
PTE_R | PTE_W | PTE_X, 0);
// Negative address space map at 0xffffffc000000000
// (root slots 0x100-0x103 reuse the four identity entries unchanged)
root_pt[0x100] = root_pt[0];
root_pt[0x101] = root_pt[1];
root_pt[0x102] = root_pt[2];
root_pt[0x103] = root_pt[3];
// Barrier so that the table contents are written before the root table
// is published and handed to the hardware.
mb();
root_page_table = root_pt;
// sptbr takes the page number of the root table, not its byte address.
uintptr_t ptbr = ((uintptr_t) root_pt) >> RISCV_PGSHIFT;
write_csr(sptbr, ptbr);
}
void initVirtualMemory(void) {
uintptr_t ms;
ms = read_csr(mstatus);
ms = INSERT_FIELD(ms, MSTATUS_VM, VM_CHOICE);
write_csr(mstatus, ms);
ms = read_csr(mstatus);
if (EXTRACT_FIELD(ms, MSTATUS_VM) != VM_CHOICE) {
printk(BIOS_DEBUG, "We don't have virtual memory...\n");
return;
} else {
printk(BIOS_DEBUG, "-----------------------------\n");
printk(BIOS_DEBUG, "Virtual memory status enabled\n");
printk(BIOS_DEBUG, "-----------------------------\n");
}
// TODO: Figure out how to grab this from cbfs
// N.B. We used to map physical from 0x81000000,
// but since kernels need to be able to see the page tables
// created by firmware, we're going to map from start of RAM.
// All this is subject to change as we learn more. Much
// about RISCV is still in flux.
printk(BIOS_DEBUG, "Initializing virtual memory...\n");
uintptr_t physicalStart = 0x80000000;
uintptr_t virtualStart = 0xffffffff80000000;
init_vm(virtualStart, physicalStart, (pte_t *)_pagetables);
mb();
flush_tlb();
#if IS_ENABLED(CONFIG_DEBUG_PRINT_PAGE_TABLES)
printk(BIOS_DEBUG, "Finished initializing virtual memory, starting walk...\n");
print_page_table();
#else
printk(BIOS_DEBUG, "Finished initializing virtual memory\n");
#endif
}
void mstatus_init(void)
{
uintptr_t ms = 0;
ms = INSERT_FIELD(ms, MSTATUS_FS, 3);
ms = INSERT_FIELD(ms, MSTATUS_XS, 3);
write_csr(mstatus, ms);
// clear any pending timer interrupts.
clear_csr(mip, MIP_STIP | MIP_SSIP);
// enable machine and supervisor timer and
// all other supervisor interrupts.
set_csr(mie, MIP_MTIP | MIP_STIP | MIP_SSIP);
// Delegate supervisor timer and other interrupts
// to supervisor mode.
set_csr(mideleg, MIP_STIP | MIP_SSIP);
set_csr(medeleg, delegate);
// Enable all user/supervisor-mode counters using
// v1.10 register addresses.
// They moved from the earlier spec.
// Until we trust our toolchain use the hardcoded constants.
// These were in flux and people who get the older toolchain
// will have difficult-to-debug failures.
write_csr(/*mcounteren*/0x306, 7);
}
|