//! A buddy allocator, used to allocate pages.
#![no_std]

mod bitset;
mod free_list;
mod tree;

use crate::{
    bitset::{Bitset, SubregionStatus},
    free_list::{FreeList, FreeListNode},
    tree::Tree,
};
use allocator_api2::alloc::{AllocError, Layout, LayoutError};
use contracts::{ensures, requires};
use core::{fmt, mem, ptr::NonNull};
use vernos_alloc_physmem_free_list::FreeListAllocator;

/// A buddy allocator.
pub struct BuddyAllocator<
    'allocator,
    const PAGE_SIZE: usize,
    const PAGE_SIZE_BITS: usize,
    const SIZE_CLASS_COUNT: usize,
> {
    /// The free list sentinels (`SIZE_CLASS_COUNT` of them).
    free_list_sentinels: NonNull<FreeListNode>,

    /// The metadata associated with each tree.
    trees: &'allocator [Tree<PAGE_SIZE, PAGE_SIZE_BITS>],

    /// The shared bitset.
    bitset: &'allocator mut Bitset,
}

impl<
        'allocator,
        const PAGE_SIZE: usize,
        const PAGE_SIZE_BITS: usize,
        const SIZE_CLASS_COUNT: usize,
    > BuddyAllocator<'allocator, PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>
{
    /// Returns a buddy allocator backed by the page ranges in the free list.
    pub fn new(
        mut free_list_alloc: FreeListAllocator<'allocator, PAGE_SIZE>,
    ) -> Result<BuddyAllocator<'allocator, PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>, AllocError>
    {
        assert_eq!(PAGE_SIZE, 1 << PAGE_SIZE_BITS);
        assert_ne!(SIZE_CLASS_COUNT, 0);

        // We split up all the nodes into size-class-sized chunks here. After this, it's important
        // for safety that we never split a node again -- we'll see why later.
        free_list_alloc.split_to_powers_of_two_no_larger_than(1 << (SIZE_CLASS_COUNT - 1));

        // We use one contiguous region to store the free lists and all the information about each
        // tree (the base, the maximum order, and the bitset). We do one pass through the free list
        // to find out how much storage we need, then we try to find a convenient spot to steal the
        // storage we need from a page range in the list.
        let mut range_count = 0;
        let mut size_class_counts = [0; SIZE_CLASS_COUNT];
        for (_, len_pages) in free_list_alloc.iter() {
            assert!(len_pages.is_power_of_two());
            let size_class = len_pages.trailing_zeros() as usize;
            assert!((0..SIZE_CLASS_COUNT).contains(&size_class));

            range_count += 1;
            size_class_counts[size_class] += 1;
        }

        // Lay out the metadata.
        //
        // UNWRAP: The error here really ought to be impossible, because the pages in the free list
        // needed to fit in there in the first place.
        let metadata_layout = MetadataLayout::<PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>::new(
            range_count,
            size_class_counts,
        )
        .unwrap();

        // Allocate space for the metadata.
        //
        // SAFETY: We need to make _absolutely_ sure nothing in the buddy allocator uses this
        // memory before we have a chance to remove it from the allocator. This function guarantees
        // that the returned memory won't overlap with the first page of a live node, so as long as
        // we don't split a node or write past the first page of one, we should be OK.
        let metadata = unsafe {
            free_list_alloc.allocate_without_always_removing_the_node(metadata_layout.layout)?
        };
        // SAFETY: The pointer must be valid to have been returned here.
        unsafe {
            metadata.cast::<u8>().write_bytes(0, metadata.len());
        }

        // Break out each component of the metadata.
        //
        // SAFETY: The layout is computed to make all of this sound if the data is valid. All of
        // these types are valid when zero-initialized.
        let free_list_sentinels: NonNull<FreeListNode> = unsafe {
            metadata
                .byte_add(metadata_layout.free_list_sentinels_offset)
                .cast()
        };
        let trees: &'allocator mut [Tree<PAGE_SIZE, PAGE_SIZE_BITS>] = unsafe {
            NonNull::slice_from_raw_parts(
                metadata.byte_add(metadata_layout.trees_offset).cast(),
                metadata_layout.trees_len,
            )
            .as_mut()
        };
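
        // For illustration only (not prescriptive): with, say, two page ranges and a bitset that
        // fits in a single `usize`, the metadata region being broken up here would look like
        //
        //     offset 0                          [FreeListNode; SIZE_CLASS_COUNT]  (sentinels)
        //     metadata_layout.trees_offset      [Tree; 2]                         (per-tree info)
        //     metadata_layout.bitset_offset     [usize; 1]                        (shared bitset)
        //
        // with the offsets computed by `MetadataLayout::new` below.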
        // SAFETY: This type is valid when zero-initialized, and is a newtype for `[usize]` so has
        // the same layout.
        let bitset: &'allocator mut Bitset = unsafe {
            mem::transmute::<&mut [usize], &mut Bitset>(
                NonNull::slice_from_raw_parts(
                    metadata.byte_add(metadata_layout.bitset_offset).cast(),
                    metadata_layout.bitset_len,
                )
                .as_mut(),
            )
        };

        // Self-link each free list. Although they're valid in a UB sense when zeroed, they're not
        // yet ready to use until this is done.
        for size_class in 0..SIZE_CLASS_COUNT {
            // SAFETY: This is the number of nodes we allocate, so we'll stay in-bounds.
            unsafe { FreeListNode::init_sentinel(free_list_sentinels.add(size_class)) };
        }

        // Initialize the trees, adding subregions into the allocator as we do. We don't actually
        // assign bitset offsets yet, because we're going to sort the trees by address before we
        // do. (This isn't algorithmically necessary, but it makes debugging a bit easier.)
        let mut i = 0;
        while let Some((ptr, mut len_pages)) = free_list_alloc.pop() {
            while len_pages > 0 {
                let size_class = (len_pages.trailing_zeros() as usize).min(SIZE_CLASS_COUNT - 1);
                trees[i].base_ptr = Some(ptr.cast());
                trees[i].size_class = size_class;
                i += 1;
                len_pages -= 1 << size_class;
            }
        }
        assert!(
            i == trees.len() || i + 1 == trees.len(),
            "found {} trees but allocated {} slots",
            i,
            trees.len()
        );

        // Slice the trees slice to fit how many trees actually remained, then sort them by
        // address.
        let trees = &mut trees[..i];
        trees.sort_unstable_by_key(|tree| tree.base_addr());

        // Compute the number of bits that belong to each tree and assign them.
        let mut bitset_offset = 0;
        for tree in &mut *trees {
            tree.bitset_offset = bitset_offset;
            bitset_offset += (1 << (tree.size_class + 1)) - 1;
        }
        assert!(bitset_offset <= metadata_layout.bitset_len_bits);

        // Add regions the trees should own to the free lists, excluding the subregions that
        // metadata is allocated in. We construct the allocator now, so that we can use its
        // helpers.
        let mut buddy_allocator = BuddyAllocator {
            free_list_sentinels,
            trees,
            bitset,
        };

        // Since we know that the metadata was allocated at the end of the page range it was
        // allocated in, we can just test the end addresses to figure out which tree we're going to
        // have to be careful about.
        let metadata_addr_lo = metadata.cast::<u8>().as_ptr() as usize;
        // SAFETY: This is the one-past-the-end address.
        let metadata_addr_hi =
            unsafe { metadata.cast::<u8>().add(metadata.len()).as_ptr() } as usize;
        for (tree_index, tree) in trees.iter().enumerate() {
            // SAFETY: This is the one-past-the-end address.
            let tree_addr_hi = tree.base_addr() + (PAGE_SIZE << tree.size_class);
            if metadata_addr_hi == tree_addr_hi {
                // If the metadata was present inside the tree, we effectively do a binary search
                // for the start of the metadata, handling the half that does not contain the start
                // address of the metadata each time.
                let mut ptr = tree.base_ptr.unwrap().cast::<u8>();
                let mut offset = 0;
                let mut size_class = tree.size_class;
                while ptr.as_ptr() as usize != metadata_addr_lo {
                    assert!(size_class != 0);
                    size_class -= 1;

                    // SAFETY: This must still be in-bounds.
                    let higher_half = unsafe { ptr.add(PAGE_SIZE << size_class) };
                    let higher_half_addr = higher_half.as_ptr() as usize;
                    if metadata_addr_lo < higher_half_addr {
                        // If the metadata boundary is below the higher half boundary, we can
                        // just "zoom into" the boundary. We don't actually have to do anything,
                        // since we already decremented the size class.
                    } else {
                        // If the metadata boundary is at or above the higher half boundary, we can
                        // add the lower half to the appropriate free list.
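                        //
                        // (For illustration: in an 8-page tree whose last three pages hold the
                        // metadata, this branch runs on the first pass to free pages 0..4 as one
                        // size-class-2 block; the next pass zooms into pages 4..6, and a final
                        // pass frees page 4 on its own at size class 0, leaving pages 5..8 to the
                        // metadata.)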
                        let mut free_list = buddy_allocator.free_list(size_class);
                        unsafe { free_list.push(ptr) };
                        let Ok(()) =
                            tree.bitset_mark_as_present(buddy_allocator.bitset, size_class, offset)
                        else {
                            panic!("the bit for tree {tree_index}, offset {offset} was not set in {buddy_allocator:#?}")
                        };
                        ptr = higher_half;
                        offset += PAGE_SIZE << size_class;
                    }
                }
            } else {
                // Otherwise, we can take the whole range of the tree and add it to the free list.
                let mut free_list = buddy_allocator.free_list(tree.size_class);
                // SAFETY: This region is not owned or borrowed by anything else (the only thing it
                // could be would be the metadata, but that's not in this tree), is of the correct
                // size, and is composed of untyped memory.
                unsafe { free_list.push(tree.base_ptr.unwrap().cast()) };

                // Then, set a bit in the bitset to say that this region is present.
                let Ok(()) = tree.bitset_mark_as_present(buddy_allocator.bitset, tree.size_class, 0)
                else {
                    panic!("the bit for tree {tree_index}, offset 0 was not set in {buddy_allocator:#?}")
                };
            }
        }

        // Return the allocator.
        Ok(buddy_allocator)
    }

    /// Tries to allocate a subregion of a particular size class from the allocator.
    #[requires(size_class < SIZE_CLASS_COUNT)]
    pub fn alloc(&mut self, size_class: usize) -> Result<NonNull<u8>, AllocError> {
        if let Some(ptr) = self.free_list(size_class).pop() {
            // Fast-path: try allocating from the right size class's free list.

            // Find the corresponding tree and the offset of the subregion in the tree.
            let (tree_index, offset, ptr) = self.tree_index_offset_and_ptr(ptr);

            // Mark the pointer as no longer being in the tree.
            let Ok(()) =
                self.trees[tree_index].bitset_mark_as_absent(self.bitset, size_class, offset)
            else {
                panic!("the bit for tree {tree_index}, offset {offset} was not set in {self:#?}")
            };

            // Return the pointer.
            Ok(ptr)
        } else if let Some(mut size_class_to_split) =
            (size_class + 1..SIZE_CLASS_COUNT).find(|&i| !self.free_list(i).is_empty())
        {
            // If a larger size class had an available chunk, we'll need to split it.
            //
            // UNWRAP: We checked that the list was non-empty.
            let ptr = self.free_list(size_class_to_split).pop().unwrap();

            // Find the corresponding tree and the offset of the subregion in the tree.
            let (tree_index, offset, ptr) = self.tree_index_offset_and_ptr(ptr);

            // Mark the pointer as no longer being in the tree.
            let Ok(()) = self.trees[tree_index].bitset_mark_as_absent(
                self.bitset,
                size_class_to_split,
                offset,
            ) else {
                panic!("the bit for tree {tree_index}, offset {offset} was not set in {self:#?}")
            };

            // For each size class between the one we found and the one we want, split the
            // subregion and add the other half to the free list (and mark it as present in the
            // bitset).
            while size_class_to_split != size_class {
                // Move to the next size class, since that's the size class of the subregion we'll
                // be putting back.
                size_class_to_split -= 1;

                let offset_to_other_half = PAGE_SIZE << size_class_to_split;
                // SAFETY: Thanks to our power-of-two policy, this must still fit within the bounds
                // of the tree.
                let other_half_ptr = unsafe { ptr.add(offset_to_other_half) };
                let other_half_offset = offset + offset_to_other_half;

                // Mark the pointer as being in the free list.
                let Ok(()) = self.trees[tree_index].bitset_mark_as_present(
                    self.bitset,
                    size_class_to_split,
                    other_half_offset,
                ) else {
                    panic!("the bit for tree {tree_index}, offset {other_half_offset} was already set in {self:#?}")
                };
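
                // (Worked example, for illustration: a size-class-0 request served from a
                // size-class-2 block goes through this loop twice, pushing back the upper
                // two-page half at size class 1 and then the remaining upper page at size class
                // 0, before the lowest page is returned.)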
                // Return the pointer to the appropriate free list.
                //
                // SAFETY: We know the higher half was valid, and we're semantically restricting
                // the size of our ownership down to the lower half.
                unsafe { self.free_list(size_class_to_split).push(other_half_ptr) };
            }

            // Return the pointer that finally remains.
            Ok(ptr)
        } else {
            // Otherwise, there was no way to serve the allocation.
            Err(AllocError)
        }
    }

    /// Returns a subregion of a particular size class to the allocator.
    ///
    /// # Safety
    ///
    /// - `ptr` must have been returned by a call to `alloc` with the same `size_class`.
    /// - `ptr` must convey ownership over the entire region.
    #[requires(size_class < SIZE_CLASS_COUNT)]
    pub unsafe fn dealloc(&mut self, ptr: NonNull<u8>, mut size_class: usize) {
        // Find the corresponding tree and the offset of the subregion in the tree.
        let (tree_index, mut offset, mut ptr) = self.tree_index_offset_and_ptr(ptr);
        let tree = &self.trees[tree_index];

        // Ensure that the memory is marked as not being in the free list.
        assert_eq!(
            tree.bitset_get(&mut self.bitset, size_class, offset),
            SubregionStatus::NotInFreeList
        );

        // Start combining size classes.
        assert!(size_class <= tree.size_class);
        while size_class < tree.size_class {
            let buddy_offset = offset ^ (PAGE_SIZE << size_class);
            match tree.bitset_get(self.bitset, size_class, buddy_offset) {
                SubregionStatus::InFreeList => {
                    // If the buddy was present, we can remove it from the bitset and free list and
                    // keep merging.
                    let buddy_node = tree
                        .base_ptr
                        .unwrap()
                        .cast::<FreeListNode>()
                        .byte_add(buddy_offset);
                    let buddy_ptr = FreeListNode::unlink(buddy_node);
                    let Ok(()) = tree.bitset_mark_as_absent(self.bitset, size_class, buddy_offset)
                    else {
                        panic!("the bit for tree {tree_index}, offset {buddy_offset} was not set in {self:#?}")
                    };

                    // Continue merging with the lower of the two pointers (so it will still have a
                    // power-of-two-aligned offset).
                    ptr = ptr.min(buddy_ptr);
                    offset = offset.min(buddy_offset);
                    size_class += 1;
                }
                SubregionStatus::NotInFreeList => {
                    // If the buddy was absent, we won't be able to do any more merging.
                    break;
                }
            }
        }

        // Mark the pointer as being in the tree.
        let Ok(()) = tree.bitset_mark_as_present(self.bitset, size_class, offset) else {
            panic!("the bit for tree {tree_index}, offset {offset} was already set in {self:#?}")
        };

        // Return the pointer to the appropriate free list.
        self.free_list(size_class).push(ptr);
    }

    /// Returns the free list with the given size class.
    #[requires(size_class < SIZE_CLASS_COUNT)]
    fn free_list(&mut self, size_class: usize) -> FreeList {
        // SAFETY: The bounds-check is a precondition.
        let sentinel = unsafe { self.free_list_sentinels.add(size_class) };
        // SAFETY: The free list is kept valid.
        unsafe { FreeList::from_sentinel(sentinel) }
    }

    /// Returns the index of the tree this pointer was allocated in, as well as the offset of the
    /// pointer in the tree and a copy of the pointer with the provenance of the tree.
    #[ensures(self.trees[ret.0].contains(ptr.as_ptr() as usize))]
    #[ensures(ret.1 < (PAGE_SIZE << self.trees[ret.0].size_class))]
    fn tree_index_offset_and_ptr(&self, ptr: NonNull<u8>) -> (usize, usize, NonNull<u8>) {
        let addr = ptr.as_ptr() as usize;
        let tree_index = self
            .trees
            .binary_search_by_key(&addr, |tree| tree.base_addr());
        let tree_index = match tree_index {
            Ok(i) => i,
            Err(i) => {
                assert_ne!(i, 0);
                i - 1
            }
        };
        let tree = &self.trees[tree_index];
        let offset = addr - tree.base_addr();
        let ptr = unsafe { tree.base_ptr.unwrap().cast().add(offset) };
        (tree_index, offset, ptr)
    }
}

impl<
        'allocator,
        const PAGE_SIZE: usize,
        const PAGE_SIZE_BITS: usize,
        const SIZE_CLASS_COUNT: usize,
    > fmt::Debug for BuddyAllocator<'allocator, PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>
{
    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
        struct FreeLists<const SIZE_CLASS_COUNT: usize>(NonNull<FreeListNode>);

        impl<const SIZE_CLASS_COUNT: usize> fmt::Debug for FreeLists<SIZE_CLASS_COUNT> {
            fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
                fmt.debug_list()
                    .entries((0..SIZE_CLASS_COUNT).map(|size_class| {
                        // SAFETY: The free lists are kept valid, and the range of size classes is
                        // necessarily in-bounds.
                        unsafe { FreeList::from_sentinel(self.0.add(size_class)) }
                    }))
                    .finish()
            }
        }

        struct Bitsets<'allocator, const PAGE_SIZE: usize, const PAGE_SIZE_BITS: usize>(
            &'allocator [Tree<PAGE_SIZE, PAGE_SIZE_BITS>],
            &'allocator Bitset,
        );

        impl<'allocator, const PAGE_SIZE: usize, const PAGE_SIZE_BITS: usize> fmt::Debug
            for Bitsets<'allocator, PAGE_SIZE, PAGE_SIZE_BITS>
        {
            fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
                fmt.debug_list()
                    .entries(self.0.iter().map(|tree| tree.debug_bitset(self.1)))
                    .finish()
            }
        }

        fmt.debug_struct("BuddyAllocator")
            .field(
                "free_lists",
                &FreeLists::<SIZE_CLASS_COUNT>(self.free_list_sentinels),
            )
            .field("trees", &self.trees)
            .field("bitsets", &Bitsets(self.trees, self.bitset))
            .finish()
    }
}

#[derive(Debug)]
struct MetadataLayout<
    const PAGE_SIZE: usize,
    const PAGE_SIZE_BITS: usize,
    const SIZE_CLASS_COUNT: usize,
> {
    layout: Layout,
    free_list_sentinels_offset: usize,
    trees_offset: usize,
    trees_len: usize,
    bitset_offset: usize,
    bitset_len: usize,
    bitset_len_bits: usize,
}

impl<const PAGE_SIZE: usize, const PAGE_SIZE_BITS: usize, const SIZE_CLASS_COUNT: usize>
    MetadataLayout<PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>
{
    fn new(
        range_count: usize,
        size_class_counts: [usize; SIZE_CLASS_COUNT],
    ) -> Result<MetadataLayout<PAGE_SIZE, PAGE_SIZE_BITS, SIZE_CLASS_COUNT>, LayoutError> {
        let trees_len = range_count;
        let mut bitset_len_bits = 0;
        for (size_class, &count) in size_class_counts.iter().enumerate() {
            let bits_for_size_class = (1 << (size_class + 1)) - 1;
            bitset_len_bits += bits_for_size_class * count;
        }
        let bitset_len = (bitset_len_bits + usize::BITS as usize - 1) / usize::BITS as usize;

        let free_list_sentinels_layout = Layout::new::<[FreeListNode; SIZE_CLASS_COUNT]>();
        let trees_layout = Layout::array::<Tree<PAGE_SIZE, PAGE_SIZE_BITS>>(trees_len)?;
        let bitset_layout = Layout::array::<usize>(bitset_len)?;

        let free_list_sentinels_offset = 0;
        let (layout, trees_offset) = free_list_sentinels_layout.extend(trees_layout)?;
        let (layout, bitset_offset) = layout.extend(bitset_layout)?;

        Ok(MetadataLayout {
            layout,
            free_list_sentinels_offset,
            trees_offset,
            trees_len,
            bitset_offset,
            bitset_len,
            bitset_len_bits,
        })
    }
}
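
// A minimal, illustrative sanity check (a sketch, not part of the allocator's API): it exercises
// only the pure arithmetic the allocator relies on -- the XOR buddy-offset computation used by
// `dealloc` and the `2^(size_class + 1) - 1` bits-per-tree count used by `MetadataLayout::new` --
// assuming a hypothetical 4 KiB page size for concreteness.
#[cfg(test)]
mod arithmetic_sketches {
    /// Page size assumed by these examples only; the allocator itself is generic over it.
    const PAGE_SIZE: usize = 4096;

    /// A subregion's buddy is found by flipping the offset bit that corresponds to its size
    /// class, as `dealloc` does with `offset ^ (PAGE_SIZE << size_class)`.
    #[test]
    fn buddy_offset_is_an_xor() {
        // At size class 0, buddies are adjacent single pages.
        assert_eq!(0 ^ (PAGE_SIZE << 0), PAGE_SIZE);
        assert_eq!(PAGE_SIZE ^ (PAGE_SIZE << 0), 0);
        // At size class 1, buddies are adjacent two-page blocks.
        assert_eq!(0 ^ (PAGE_SIZE << 1), 2 * PAGE_SIZE);
        assert_eq!((2 * PAGE_SIZE) ^ (PAGE_SIZE << 1), 0);
    }

    /// A tree of size class `k` covers `2^k` pages and needs one bit per node of its complete
    /// binary tree, i.e. `2^(k + 1) - 1` bits.
    #[test]
    fn bits_per_tree_count() {
        // A single-page tree is one node.
        assert_eq!((1usize << (0 + 1)) - 1, 1);
        // A four-page tree has 1 + 2 + 4 = 7 nodes.
        assert_eq!((1usize << (2 + 1)) - 1, 7);
    }
}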