//! A buddy allocator, used to allocate pages.
//!
//! The allocator is built from three abstractions: stripes, trees, and the allocator itself.
//!
//! TODO: See if there's standard terminology for these.
//!
//! ## Stripes
//!
//! The buddy allocator works in terms of size classes, which are power-of-two sized, starting at a
//! single page and going up from there. Each stripe corresponds to a single size class and a
//! particular region of memory.
//!
//! A stripe contains a circular doubly-linked free-list for subregions of that size class, and a
//! bitset marking whether a particular region has been allocated or not. Being a circular
//! doubly-linked list makes it cheap to remove an element whose address we know, as well as cheap
//! to push and pop elements.
//!
//! It's efficient to go from the address of a subregion to the index of its corresponding bit, so
//! when we hand out a subregion from the free-list or put one back, it's cheap to read or write
//! its bit.
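//!
//! For example (a sketch of the idea, not necessarily the exact encoding the bitset uses): in a
//! stripe of size class `k`, the subregion at byte offset `off` from the stripe's base would get
//! bit index `off >> (PAGE_SIZE_BITS + k)`, so converting between an address and its bit is a
//! subtraction and a shift.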
//!
//! ## Trees
//!
//! A tree is logically a collection of stripes, one per size class. To pack the structures more
//! efficiently, they are stored interleaved with each other, and the tree manages this.
//!
//! The important logic at the level of trees happens when we allocate a subregion from a size
//! class whose stripe's free-list is empty, and when we free subregions.
//!
//! When a stripe's free-list is empty, the tree instead allocates a subregion from the stripe of
//! the next larger size class (recursing upwards if that one is empty too), splits it, and puts
//! the unused half back on the current stripe's free-list.
//!
//! The really important bit is the ability to merge subregions when they're freed. When we free a
//! subregion of a certain size class, we can check whether its neighbor (its buddy) is unallocated
//! as well. If so, we can remove it from the free-list by its address. We can combine the two
//! subregions into one of the next larger size class, and then return that subregion to the next
//! stripe.
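//!
//! Concretely, the two buddies of size class `k` that together make up a size-class-`k + 1`
//! subregion differ in exactly one offset bit, so the buddy of the subregion at offset `off`
//! within a tree is at `off ^ (PAGE_SIZE << k)`; this is how `dealloc` finds it below.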
//!
//! ## The buddy allocator
//!
//! Finally, the overall buddy allocator needs to be able to handle multiple memory regions. To
//! facilitate this, the trees are stored in an array, which forms the overall allocator.
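//!
//! ## Example
//!
//! A sketch of intended use (marked `ignore` rather than run as a doctest, since obtaining a
//! `FreeListAllocator` over real page ranges is platform-specific and the constants here are
//! only illustrative):
//!
//! ```ignore
//! // 4KiB pages, 19 size classes (4KiB up to 1GiB).
//! let mut buddy = BuddyAllocator::<4096, 12, 19>::new(free_list_alloc)?;
//!
//! // Allocate a single page (size class 0)...
//! let page = buddy.alloc(0)?;
//!
//! // ...and give it back.
//! //
//! // SAFETY: `page` came from `alloc` with the same size class, and we still own it.
//! unsafe { buddy.dealloc(page, 0) };
//! ```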
#![no_std]
extern crate std; // TODO: Remove me
mod bitset;
mod free_list;
mod tree;
use crate::{
bitset::{Bitset, SubregionStatus},
free_list::{FreeList, FreeListNode},
tree::Tree,
};
use allocator_api2::alloc::{AllocError, Layout, LayoutError};
use contracts::{ensures, requires};
use core::{fmt, mem, ptr::NonNull};
use vernos_alloc_physmem_free_list::FreeListAllocator;
/// A buddy allocator.
pub struct BuddyAllocator<
'allocator,
const PAGE_SIZE: usize,
const PAGE_SIZE_BITS: usize,
const SIZE_CLASS_COUNT: usize,
> {
/// The free list sentinels (`SIZE_CLASS_COUNT` of them).
free_list_sentinels: NonNull<FreeListNode>,
/// The metadata associated with each tree.
trees: &'allocator [Tree<PAGE_SIZE, PAGE_SIZE_BITS>],
/// The shared bitset.
bitset: &'allocator mut Bitset,
}
impl<
'allocator,
const PAGE_SIZE: usize,
const PAGE_SIZE_BITS: usize,
const SIZE_CLASS_COUNT: usize,
> BuddyAllocator<'allocator, PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>
{
/// Returns a buddy allocator backed by the page ranges in the free list.
pub fn new(
mut free_list_alloc: FreeListAllocator<'allocator, PAGE_SIZE>,
) -> Result<BuddyAllocator<'allocator, PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>, AllocError>
{
assert_eq!(PAGE_SIZE, 1 << PAGE_SIZE_BITS);
assert_ne!(SIZE_CLASS_COUNT, 0);
// We split up all the nodes into size-class-sized chunks here. After this, it's important
// for safety that we never split a node again -- we'll see why later.
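// (For example, assuming the maximum size class covers at least 8 pages, a 13-page range
// would be left as separate 8-, 4-, and 1-page nodes.)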
free_list_alloc.split_to_powers_of_two_no_larger_than(1 << (SIZE_CLASS_COUNT - 1));
// We use one contiguous region to store the free lists and all the information about each
// tree (the base, the maximum order, and the bitset). We do one pass through the free list
// to find out how much storage we need, then we try to find a convenient spot to steal the
// storage we need from a page range in the list.
let mut range_count = 0;
let mut size_class_counts = [0; SIZE_CLASS_COUNT];
for (_, len_pages) in free_list_alloc.iter() {
assert!(len_pages.is_power_of_two());
let size_class = len_pages.trailing_zeros() as usize;
assert!((0..SIZE_CLASS_COUNT).contains(&size_class));
range_count += 1;
size_class_counts[size_class] += 1;
}
// Lay out the metadata.
//
// UNWRAP: The error here really ought to be impossible, because the pages in the free list
// needed to fit in there in the first place.
let metadata_layout = MetadataLayout::<PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>::new(
range_count,
size_class_counts,
)
.unwrap();
// Allocate space for the metadata.
//
// SAFETY: We need to make _absolutely_ sure nothing in the buddy allocator uses this
// memory before we have a chance to remove it from the allocator. This function guarantees
// that the returned memory won't overlap with the first page of a live node, so as long as
// we don't split a node or write past the first page of one, we should be OK.
let metadata = unsafe {
free_list_alloc.allocate_without_always_removing_the_node(metadata_layout.layout)?
};
// SAFETY: The pointer must be valid to have been returned here.
unsafe {
metadata.cast::<u8>().write_bytes(0, metadata.len());
}
// Break out each component of the metadata.
//
// SAFETY: The layout is computed to make all of this sound if the data is valid. All of
// these types are valid when zero-initialized.
let free_list_sentinels: NonNull<FreeListNode> = unsafe {
metadata
.byte_add(metadata_layout.free_list_sentinels_offset)
.cast()
};
let trees: &'allocator mut [Tree<PAGE_SIZE, PAGE_SIZE_BITS>] = unsafe {
NonNull::slice_from_raw_parts(
metadata.byte_add(metadata_layout.trees_offset).cast(),
metadata_layout.trees_len,
)
.as_mut()
};
// SAFETY: This type is valid when zero-initialized, and is a newtype for `[usize]` so has
// the same layout.
let bitset: &'allocator mut Bitset = unsafe {
mem::transmute::<&mut [usize], &mut Bitset>(
NonNull::slice_from_raw_parts(
metadata.byte_add(metadata_layout.bitset_offset).cast(),
metadata_layout.bitset_len,
)
.as_mut(),
)
};
// Self-link each free list. All-zero bytes are a valid bit pattern for the sentinels (so the
// zeroing above isn't UB), but the lists aren't ready to use until this is done.
for size_class in 0..SIZE_CLASS_COUNT {
// SAFETY: We allocated SIZE_CLASS_COUNT sentinel nodes, so this stays in-bounds.
unsafe { FreeListNode::init_sentinel(free_list_sentinels.add(size_class)) };
}
// Initialize the trees, adding subregions into the allocator as we do. We don't actually
// assign bitset offsets yet, because we're going to sort the trees by address before we
// do. (This isn't algorithmically necessary, but it makes debugging a bit easier.)
let mut i = 0;
while let Some((ptr, mut len_pages)) = free_list_alloc.pop() {
while len_pages > 0 {
let size_class = (len_pages.trailing_zeros() as usize).min(SIZE_CLASS_COUNT - 1);
trees[i].base_ptr = Some(ptr.cast());
trees[i].size_class = size_class;
i += 1;
len_pages -= 1 << size_class;
}
}
assert!(
i == trees.len() || i + 1 == trees.len(),
"found {} trees but allocated {} slots",
i,
trees.len()
);
// Shrink the trees slice to the number of trees we actually initialized, then sort them by
// address.
let trees = &mut trees[..i];
trees.sort_by_key(|tree| tree.base_addr());
// Compute the number of bits that belong to each tree and assign them.
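// A tree of size class `k` has one bit per possible subregion at every level:
// 1 + 2 + 4 + ... + 2^k = 2^(k+1) - 1 bits in total.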
let mut bitset_offset = 0;
for tree in &mut *trees {
tree.bitset_offset = bitset_offset;
bitset_offset += (1 << (tree.size_class + 1)) - 1;
}
assert!(bitset_offset <= metadata_layout.bitset_len_bits);
// Add the regions the trees own to the free lists, excluding the subregion the metadata was
// allocated in. Since the metadata was carved from the end of its page range, comparing
// one-past-the-end addresses is enough to find the tree that contains it.
//
// SAFETY: This is the one-past-the-end address.
let metadata_addr_hi =
unsafe { metadata.cast::<u8>().add(metadata.len()).as_ptr() } as usize;
for tree in &mut *trees {
// SAFETY: This is the one-past-the-end address.
let tree_addr_hi = tree.base_addr() + (PAGE_SIZE << tree.size_class);
if metadata_addr_hi == tree_addr_hi {
// If the metadata was allocated inside this tree, we can't put the whole range on the
// free list; handling this case is still TODO.
std::eprintln!("TODO");
} else {
// Otherwise, we can take the whole range of the tree and add it to the free list.
// SAFETY: There are as many sentinels as size classes, so this will be in-bounds.
let sentinel = unsafe { free_list_sentinels.add(tree.size_class) };
// SAFETY: The free list is kept valid.
let mut free_list = unsafe { FreeList::from_sentinel(sentinel) };
// SAFETY: This region is not owned or borrowed by anything else (the only thing it
// could be would be the metadata, but that's not in this tree), is of the correct
// size, and is composed of untyped memory.
unsafe { free_list.push(tree.base_ptr.unwrap().cast()) };
// Then, set a bit in the bitset to say that this region is present.
let Ok(()) = tree.bitset_mark_as_present(bitset, tree.size_class, 0) else {
panic!("the first bit was already set in {tree:#?}\nbitset = {bitset:?}")
};
}
}
// Make the buddy allocator's struct and return it.
Ok(BuddyAllocator {
free_list_sentinels,
trees,
bitset,
})
}
/// Tries to allocate a subregion of a particular size class from the allocator.
#[requires(size_class < SIZE_CLASS_COUNT)]
pub fn alloc(&mut self, size_class: usize) -> Result<NonNull<u8>, AllocError> {
// Fast path: try allocating from the free list of the requested size class.
if let Some(ptr) = self.free_list(size_class).pop() {
// Find the corresponding tree and the offset of the subregion in the tree.
let (tree_index, offset, ptr) = self.tree_index_offset_and_ptr(ptr);
// Mark the subregion as no longer being in the free list.
let Ok(()) =
self.trees[tree_index].bitset_mark_as_absent(self.bitset, size_class, offset)
else {
panic!("the bit for tree {tree_index}, offset {offset} was not set in {self:#?}")
};
// Return the pointer.
Ok(ptr)
} else if let Some(mut size_class_to_split) =
(size_class + 1..SIZE_CLASS_COUNT).find(|&i| !self.free_list(i).is_empty())
{
// If a larger size class had an available chunk, we'll need to split it.
//
// UNWRAP: We checked that the list was non-empty.
let ptr = self.free_list(size_class_to_split).pop().unwrap();
// Find the corresponding tree and the offset of the subregion in the tree.
let (tree_index, offset, ptr) = self.tree_index_offset_and_ptr(ptr);
// Mark the subregion as no longer being in the free list.
let Ok(()) = self.trees[tree_index].bitset_mark_as_absent(
self.bitset,
size_class_to_split,
offset,
) else {
panic!("the bit for tree {tree_index}, offset {offset} was not set in {self:#?}")
};
// For each size class between the one we found and the one we want, split the
// subregion and add the other half to the free list (and mark it as present in the
// bitset).
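// For example, serving a size-class-1 request from a size-class-3 subregion puts one
// size-class-2 subregion and one size-class-1 subregion back on their free lists, and
// hands out the remaining size-class-1 subregion.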
while size_class_to_split != size_class {
// Move to the next size class, since that's the size class of the subregion we'll
// be putting back.
size_class_to_split -= 1;
let offset_to_other_half = PAGE_SIZE << size_class_to_split;
// SAFETY: Thanks to our power-of-two policy, this must still fit within the bounds
// of the tree.
let other_half_ptr = unsafe { ptr.add(offset_to_other_half) };
let other_half_offset = offset + offset_to_other_half;
// Mark the pointer as being in the free list.
let Ok(()) = self.trees[tree_index].bitset_mark_as_present(
self.bitset,
size_class_to_split,
other_half_offset,
) else {
panic!("the bit for tree {tree_index}, offset {other_half_offset} was already set in {self:#?}")
};
// Return the pointer to the appropriate free list.
//
// SAFETY: We owned the whole subregion, and we're giving up ownership of this higher
// half while keeping only the lower half for ourselves.
unsafe { self.free_list(size_class_to_split).push(other_half_ptr) };
}
// Return the pointer that finally remains.
Ok(ptr)
} else {
// Otherwise, there was no way to serve the allocation.
Err(AllocError)
}
}
/// Returns a subregion of a particular size class to the allocator.
///
/// # Safety
///
/// - `ptr` must have been returned by a call to `alloc` with the same `size_class`.
/// - `ptr` must convey ownership over the entire region.
#[requires(size_class < SIZE_CLASS_COUNT)]
pub unsafe fn dealloc(&mut self, ptr: NonNull<u8>, mut size_class: usize) {
// Find the corresponding tree and the offset of the subregion in the tree.
let (tree_index, mut offset, mut ptr) = self.tree_index_offset_and_ptr(ptr);
let tree = &self.trees[tree_index];
// Ensure that the memory is marked as not being in the free list.
assert_eq!(
tree.bitset_get(&mut self.bitset, size_class, offset),
SubregionStatus::NotInFreeList
);
// Start combining size classes.
assert!(size_class <= tree.size_class);
while size_class < tree.size_class {
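// A subregion's buddy is the other half of the enclosing subregion one size class up.
// Within the tree, a size-class-`k` subregion sits at an offset that's a multiple of
// `PAGE_SIZE << k`, so XOR-ing with that size flips between the two halves.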
let buddy_offset = offset ^ (PAGE_SIZE << size_class);
match tree.bitset_get(self.bitset, size_class, buddy_offset) {
SubregionStatus::InFreeList => {
// If the buddy was present, we can remove it from the bitset and free list and
// keep merging.
let buddy_node = tree
.base_ptr
.unwrap()
.cast::<FreeListNode>()
.byte_add(buddy_offset);
let buddy_ptr = FreeListNode::unlink(buddy_node);
let Ok(()) = tree.bitset_mark_as_absent(self.bitset, size_class, buddy_offset)
else {
panic!("the bit for tree {tree_index}, offset {buddy_offset} was not set in {self:#?}")
};
// Continue merging with the lower of the two pointers (so it will still have a
// power-of-two-aligned offset).
ptr = ptr.min(buddy_ptr);
offset = offset.min(buddy_offset);
size_class += 1;
}
SubregionStatus::NotInFreeList => {
// If the buddy was absent, we won't be able to do any more merging.
break;
}
}
}
// Mark the (possibly merged) subregion as being back in the free list.
let Ok(()) = tree.bitset_mark_as_present(self.bitset, size_class, offset) else {
panic!("the bit for tree {tree_index}, offset {offset} was already set in {self:#?}")
};
// Return the pointer to the appropriate free list.
self.free_list(size_class).push(ptr);
}
/// Returns the free list with the given size class.
#[requires(size_class < SIZE_CLASS_COUNT)]
fn free_list(&mut self, size_class: usize) -> FreeList {
// SAFETY: The bounds-check is a precondition.
let sentinel = unsafe { self.free_list_sentinels.add(size_class) };
// SAFETY: The free list is kept valid.
unsafe { FreeList::from_sentinel(sentinel) }
}
/// Returns the index of the tree this pointer was allocated in, as well as the offset of the
/// pointer in the tree and a copy of the pointer with the provenance of the tree.
#[ensures(self.trees[ret.0].contains(ptr.as_ptr() as usize))]
#[ensures(ret.1 < (PAGE_SIZE << self.trees[ret.0].size_class))]
fn tree_index_offset_and_ptr(&self, ptr: NonNull<u8>) -> (usize, usize, NonNull<u8>) {
let addr = ptr.as_ptr() as usize;
let tree_index = self
.trees
.binary_search_by_key(&addr, |tree| tree.base_addr());
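// The trees are sorted by base address, so the search either lands exactly on a tree's
// base (Ok) or reports the insertion point (Err); in the latter case the pointer belongs
// to the tree just before that position.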
let tree_index = match tree_index {
Ok(i) => i,
Err(i) => {
assert_ne!(i, 0);
i - 1
}
};
let tree = &self.trees[tree_index];
let offset = addr - tree.base_addr();
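// Re-derive the pointer from the tree's own base pointer so it carries the tree's
// provenance.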
let ptr = unsafe { tree.base_ptr.unwrap().cast().add(offset) };
(tree_index, offset, ptr)
}
}
impl<
'allocator,
const PAGE_SIZE: usize,
const PAGE_SIZE_BITS: usize,
const SIZE_CLASS_COUNT: usize,
> fmt::Debug for BuddyAllocator<'allocator, PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>
{
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
struct FreeLists<const SIZE_CLASS_COUNT: usize>(NonNull<FreeListNode>);
impl<const SIZE_CLASS_COUNT: usize> fmt::Debug for FreeLists<SIZE_CLASS_COUNT> {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
fmt.debug_list()
.entries((0..SIZE_CLASS_COUNT).map(|size_class| {
// SAFETY: The free lists are kept valid, and the range of size classes is
// necessarily in-bounds.
unsafe { FreeList::from_sentinel(self.0.add(size_class)) }
}))
.finish()
}
}
struct Bitsets<'allocator, const PAGE_SIZE: usize, const PAGE_SIZE_BITS: usize>(
&'allocator [Tree<PAGE_SIZE, PAGE_SIZE_BITS>],
&'allocator Bitset,
);
impl<'allocator, const PAGE_SIZE: usize, const PAGE_SIZE_BITS: usize> fmt::Debug
for Bitsets<'allocator, PAGE_SIZE, PAGE_SIZE_BITS>
{
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
fmt.debug_list()
.entries(self.0.iter().map(|tree| tree.debug_bitset(self.1)))
.finish()
}
}
fmt.debug_struct("BuddyAllocator")
.field(
"free_lists",
&FreeLists::<SIZE_CLASS_COUNT>(self.free_list_sentinels),
)
.field("trees", &self.trees)
.field("bitsets", &Bitsets(self.trees, self.bitset))
.finish()
}
}
#[derive(Debug)]
struct MetadataLayout<
const PAGE_SIZE: usize,
const PAGE_SIZE_BITS: usize,
const SIZE_CLASS_COUNT: usize,
> {
layout: Layout,
free_list_sentinels_offset: usize,
trees_offset: usize,
trees_len: usize,
bitset_offset: usize,
bitset_len: usize,
bitset_len_bits: usize,
}
impl<const PAGE_SIZE: usize, const PAGE_SIZE_BITS: usize, const SIZE_CLASS_COUNT: usize>
MetadataLayout<PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>
{
fn new(
range_count: usize,
size_class_counts: [usize; SIZE_CLASS_COUNT],
) -> Result<MetadataLayout<PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>, LayoutError> {
let trees_len = range_count;
let mut bitset_len_bits = 0;
for (size_class, &count) in size_class_counts.iter().enumerate() {
let bits_for_size_class = (1 << (size_class + 1)) - 1;
bitset_len_bits += bits_for_size_class * count;
}
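// Round the bit count up to whole usize words (e.g. 100 bits needs two 64-bit words).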
let bitset_len = (bitset_len_bits + usize::BITS as usize - 1) / usize::BITS as usize;
let free_list_sentinels_layout = Layout::new::<[FreeListNode; SIZE_CLASS_COUNT]>();
let trees_layout = Layout::array::<Tree<PAGE_SIZE, PAGE_SIZE_BITS>>(trees_len)?;
let bitset_layout = Layout::array::<usize>(bitset_len)?;
let free_list_sentinels_offset = 0;
let (layout, trees_offset) = free_list_sentinels_layout.extend(trees_layout)?;
let (layout, bitset_offset) = layout.extend(bitset_layout)?;
Ok(MetadataLayout {
layout,
free_list_sentinels_offset,
trees_offset,
trees_len,
bitset_offset,
bitset_len,
bitset_len_bits,
})
}
}