From d0ad46b868f361559ed3dd2afe9411be87198c91 Mon Sep 17 00:00:00 2001 From: Julien Cretin Date: Wed, 30 Sep 2020 10:34:35 +0200 Subject: [PATCH] Add format helpers for new store --- libraries/persistent_store/src/bitfield.rs | 41 +- libraries/persistent_store/src/format.rs | 907 +++++++++++++++++++++ libraries/persistent_store/src/lib.rs | 326 ++++++++ 3 files changed, 1255 insertions(+), 19 deletions(-) create mode 100644 libraries/persistent_store/src/format.rs diff --git a/libraries/persistent_store/src/bitfield.rs b/libraries/persistent_store/src/bitfield.rs index a2760c2..09e0ad7 100644 --- a/libraries/persistent_store/src/bitfield.rs +++ b/libraries/persistent_store/src/bitfield.rs @@ -13,8 +13,6 @@ // limitations under the License. //! Helps manipulate bit fields in 32-bits words. -// TODO(ia0): Remove when the module is used. -#![cfg_attr(not(test), allow(dead_code, unused_macros))] use crate::{StoreError, StoreResult}; @@ -180,24 +178,29 @@ macro_rules! bitfield_impl { // - Input are bit field descriptors // - Position is the number of bits used by prior bit fields // - Output are the bit field definitions - ([$($output: tt)*]{ pos: $pos: expr }[$name: ident: Bit, $($input: tt)*]) => { + ([$($output: tt)*]{ pos: $pos: expr } + [$(#[$meta: meta])* $name: ident: Bit, $($input: tt)*]) => { bitfield_impl! { - [$($output)* const $name: Bit = Bit { pos: $pos };] + [$($output)* $(#[$meta])* const $name: Bit = Bit { pos: $pos };] { pos: $pos + 1 } [$($input)*] } }; - ([$($output: tt)*]{ pos: $pos: expr }[$name: ident: Field <= $max: expr, $($input: tt)*]) => { + ([$($output: tt)*]{ pos: $pos: expr } + [$(#[$meta: meta])* $name: ident: Field <= $max: expr, $($input: tt)*]) => { bitfield_impl! 
{ - [$($output)* const $name: Field = Field { pos: $pos, len: num_bits($max) };] + [$($output)* $(#[$meta])* const $name: Field = Field { + pos: $pos, + len: num_bits($max), + };] { pos: $pos + $name.len } [$($input)*] } }; ([$($output: tt)*]{ pos: $pos: expr } - [$name: ident: Checksum <= $max: expr, $($input: tt)*]) => { + [$(#[$meta: meta])* $name: ident: Checksum <= $max: expr, $($input: tt)*]) => { bitfield_impl! { - [$($output)* const $name: Checksum = Checksum { + [$($output)* $(#[$meta])* const $name: Checksum = Checksum { field: Field { pos: $pos, len: num_bits($max) } };] { pos: $pos + $name.field.len } @@ -213,9 +216,9 @@ macro_rules! bitfield_impl { } }; ([$($output: tt)*]{ pos: $pos: expr } - [$name: ident: ConstField = $bits: tt, $($input: tt)*]) => { + [$(#[$meta: meta])* $name: ident: ConstField = $bits: tt, $($input: tt)*]) => { bitfield_impl! { - Reverse $name []$bits + Reverse $(#[$meta])* $name []$bits [$($output)*]{ pos: $pos }[$($input)*] } }; @@ -224,17 +227,17 @@ macro_rules! bitfield_impl { // Auxiliary rules for constant bit fields: // - Input is a sequence of bits // - Output is the reversed sequence of bits - (Reverse $name: ident [$($output_bits: tt)*] [$bit: tt $($input_bits: tt)*] + (Reverse $(#[$meta: meta])* $name: ident [$($output_bits: tt)*] [$bit: tt $($input_bits: tt)*] [$($output: tt)*]{ pos: $pos: expr }[$($input: tt)*]) => { bitfield_impl! { - Reverse $name [$bit $($output_bits)*][$($input_bits)*] + Reverse $(#[$meta])* $name [$bit $($output_bits)*][$($input_bits)*] [$($output)*]{ pos: $pos }[$($input)*] } }; - (Reverse $name: ident $bits: tt [] + (Reverse $(#[$meta: meta])* $name: ident $bits: tt [] [$($output: tt)*]{ pos: $pos: expr }[$($input: tt)*]) => { bitfield_impl! { - ConstField $name { len: 0, val: 0 }$bits + ConstField $(#[$meta])* $name { len: 0, val: 0 }$bits [$($output)*]{ pos: $pos }[$($input)*] } }; @@ -242,10 +245,10 @@ macro_rules! 
bitfield_impl { // Auxiliary rules for constant bit fields: // - Input is a sequence of bits in reversed order // - Output is the constant bit field definition with the sequence of bits as value - (ConstField $name: ident { len: $len: expr, val: $val: expr }[] + (ConstField $(#[$meta: meta])* $name: ident { len: $len: expr, val: $val: expr }[] [$($output: tt)*]{ pos: $pos: expr }[$($input: tt)*]) => { bitfield_impl! { - [$($output)* const $name: ConstField = ConstField { + [$($output)* $(#[$meta])* const $name: ConstField = ConstField { field: Field { pos: $pos, len: $len }, value: $val, };] @@ -253,10 +256,10 @@ macro_rules! bitfield_impl { [$($input)*] } }; - (ConstField $name: ident { len: $len: expr, val: $val: expr }[$bit: tt $($bits: tt)*] - [$($output: tt)*]{ pos: $pos: expr }[$($input: tt)*]) => { + (ConstField $(#[$meta: meta])* $name: ident { len: $len: expr, val: $val: expr } + [$bit: tt $($bits: tt)*][$($output: tt)*]{ pos: $pos: expr }[$($input: tt)*]) => { bitfield_impl! { - ConstField $name { len: $len + 1, val: $val * 2 + $bit }[$($bits)*] + ConstField $(#[$meta])* $name { len: $len + 1, val: $val * 2 + $bit }[$($bits)*] [$($output)*]{ pos: $pos }[$($input)*] } }; diff --git a/libraries/persistent_store/src/format.rs b/libraries/persistent_store/src/format.rs new file mode 100644 index 0000000..21f4abc --- /dev/null +++ b/libraries/persistent_store/src/format.rs @@ -0,0 +1,907 @@ +// Copyright 2019-2020 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// TODO(ia0): Remove when the module is used. +#![allow(dead_code)] + +use crate::bitfield::*; +use crate::{Storage, StorageIndex, StoreError, StoreResult}; +use alloc::vec::Vec; +use core::cmp::min; +use core::convert::TryFrom; + +type WORD = u32; + +/// Size of a word in bytes. +const WORD_SIZE: usize = core::mem::size_of::(); + +/// Maximum size of a page in bytes. +const MAX_PAGE_SIZE: usize = 4096; + +/// Maximum number of erase cycles. +const MAX_CYCLE: usize = 65535; + +/// Maximum page index. +/// +/// Thus the maximum number of pages is one more than this number. +const MAX_PAGE: usize = 63; + +/// Maximum key. +const MAX_KEY: usize = 4095; + +/// Maximum length in bytes of a user payload. +const MAX_VALUE_LEN: usize = 1023; + +/// Maximum number of updates per transaction. +const MAX_UPDATES: usize = 31; + +/// Maximum number of words per virtual page. +const MAX_VIRT_PAGE_SIZE: usize = div_ceil(MAX_PAGE_SIZE, WORD_SIZE) - CONTENT_WORD; + +/// Word with all bits set to one. +const ERASED_WORD: u32 = 0xffffffff; + +/// Helpers for a given storage configuration. +#[derive(Clone, Debug)] +pub struct Format { + /// The size in bytes of a page in the storage. + page_size: usize, + + /// The number of pages in the storage. + num_pages: usize, + + /// The maximum number of times a page can be erased. + max_page_erases: usize, +} + +impl Format { + /// Extracts the format from a storage. + /// + /// Returns `None` if the storage is not [supported]. + /// + /// [supported]: struct.Format.html#method.is_storage_supported + pub fn new(storage: &S) -> Option { + if Format::is_storage_supported(storage) { + Some(Format { + page_size: storage.page_size(), + num_pages: storage.num_pages(), + max_page_erases: storage.max_page_erases(), + }) + } else { + None + } + } + + /// Returns whether a storage is supported. 
+ /// + /// A storage is supported if the following conditions hold: + /// - The size of a word is 4 bytes. + /// - The size of a word evenly divides the size of a page. + /// - A page contains at least 8 words. + /// - A page contains at most [`MAX_PAGE_SIZE`] bytes. + /// - There are at least 3 pages. + /// - There are at most [`MAX_PAGE`]` + 1` pages. + /// - A word can be written at least twice between erase cycles. + /// - A page can be erased at most [`MAX_CYCLE`] times. + /// + /// [`MAX_PAGE_SIZE`]: constant.MAX_PAGE_SIZE.html + /// [`MAX_PAGE`]: constant.MAX_PAGE.html + /// [`MAX_CYCLE`]: constant.MAX_CYCLE.html + fn is_storage_supported(storage: &S) -> bool { + let word_size = storage.word_size(); + let page_size = storage.page_size(); + let num_pages = storage.num_pages(); + let max_word_writes = storage.max_word_writes(); + let max_page_erases = storage.max_page_erases(); + word_size == 4 + && page_size % word_size == 0 + && (8 * word_size <= page_size && page_size <= MAX_PAGE_SIZE) + && (3 <= num_pages && num_pages <= MAX_PAGE + 1) + && max_word_writes >= 2 + && max_page_erases <= MAX_CYCLE + } + + /// The size of a word in bytes. + pub fn word_size(&self) -> usize { + WORD_SIZE + } + + /// The size of a page in bytes. + /// + /// We have `32 <= self.page_size() <= MAX_PAGE_SIZE`. + pub fn page_size(&self) -> usize { + self.page_size + } + + /// The number of pages in the storage. + /// + /// Notation: `N`. We have `3 <= N <= MAX_PAGE + 1`. + pub fn num_pages(&self) -> usize { + self.num_pages + } + + /// The maximum page index. + /// + /// We have `2 <= self.max_page() <= MAX_PAGE`. + pub fn max_page(&self) -> usize { + self.num_pages - 1 + } + + /// The maximum number of times a page can be erased. + /// + /// Notation: `E`. We have `E <= MAX_CYCLE`. + pub fn max_page_erases(&self) -> usize { + self.max_page_erases + } + + /// The maximum key. + pub fn max_key(&self) -> usize { + MAX_KEY + } + + /// The maximum number of updates per transaction. 
+ pub fn max_updates(&self) -> usize { + MAX_UPDATES + } + + /// The size of a virtual page in words. + /// + /// A virtual page is stored in a physical page after the page header. + /// + /// Notation: `Q`. We have `6 <= Q <= MAX_VIRT_PAGE_SIZE`. + pub fn virt_page_size(&self) -> usize { + self.page_size() / self.word_size() - CONTENT_WORD + } + + /// The maximum length in bytes of a user payload. + /// + /// We have `20 <= self.max_value_len() <= MAX_VALUE_LEN`. + pub fn max_value_len(&self) -> usize { + min( + (self.virt_page_size() - 1) * self.word_size(), + MAX_VALUE_LEN, + ) + } + + /// The maximum prefix length in words. + /// + /// A prefix is the first words of a virtual page that belong to the last entry of the previous + /// virtual page. This happens because entries may overlap up to 2 virtual pages. + /// + /// Notation: `M`. We have `5 <= M < Q`. + pub fn max_prefix_len(&self) -> usize { + self.bytes_to_words(self.max_value_len()) + } + + /// The total virtual capacity in words. + /// + /// Notation: `V`. We have `V = (N - 1) * (Q - 1) - M`. + pub fn virt_size(&self) -> usize { + (self.num_pages() - 1) * (self.virt_page_size() - 1) - self.max_prefix_len() + } + + /// The total user capacity in words. + /// + /// Notation: `C`. We have `C = V - N = (N - 1) * (Q - 2) - M - 1`. + pub fn total_capacity(&self) -> usize { + // From the virtual capacity, we reserve N - 1 words for `Erase` entries and 1 word for a + // `Clear` entry. + self.virt_size() - self.num_pages() + } + + /// The total virtual lifetime in words. + /// + /// Notation: `L`. We have `L = (E * N + N - 1) * Q`. + pub fn total_lifetime(&self) -> Position { + Position::new(self, self.max_page_erases(), self.num_pages() - 1, 0) + } + + /// Returns the word position of the first entry of a page. + /// + /// The init info of the page must be provided to know where the first entry of the page + /// starts. 
+ pub fn page_head(&self, init: InitInfo, page: usize) -> Position { + Position::new(self, init.cycle, page, init.prefix) + } + + /// Returns the storage index of the init info of a page. + pub fn index_init(&self, page: usize) -> StorageIndex { + let byte = INIT_WORD * self.word_size(); + StorageIndex { page, byte } + } + + /// Parses the init info of a page from its storage representation. + pub fn parse_init(&self, word: &[u8]) -> StoreResult> { + let word = slice_to_word(word); + Ok(if word == ERASED_WORD { + WordState::Erased + } else if WORD_CHECKSUM.get(word)? != 0 { + WordState::Partial + } else { + let cycle = INIT_CYCLE.get(word); + let prefix = INIT_PREFIX.get(word); + if cycle > self.max_page_erases() || prefix > self.max_prefix_len() { + return Err(StoreError::InvalidStorage); + } + WordState::Valid(InitInfo { cycle, prefix }) + }) + } + + /// Builds the storage representation of an init info. + pub fn build_init(&self, init: InitInfo) -> [u8; WORD_SIZE] { + let mut word = ERASED_WORD; + INIT_CYCLE.set(&mut word, init.cycle); + INIT_PREFIX.set(&mut word, init.prefix); + WORD_CHECKSUM.set(&mut word, 0); + word.to_ne_bytes() + } + + /// Returns the storage index of the compact info of a page. + pub fn index_compact(&self, page: usize) -> StorageIndex { + let byte = COMPACT_WORD * self.word_size(); + StorageIndex { page, byte } + } + + /// Parses the compact info of a page from its storage representation. + pub fn parse_compact(&self, word: &[u8]) -> StoreResult> { + let word = slice_to_word(word); + Ok(if word == ERASED_WORD { + WordState::Erased + } else if WORD_CHECKSUM.get(word)? != 0 { + WordState::Partial + } else { + let tail = COMPACT_TAIL.get(word); + if tail > self.virt_size() + self.max_prefix_len() { + return Err(StoreError::InvalidStorage); + } + WordState::Valid(CompactInfo { tail }) + }) + } + + /// Builds the storage representation of a compact info. 
+ pub fn build_compact(&self, compact: CompactInfo) -> [u8; WORD_SIZE] { + let mut word = ERASED_WORD; + COMPACT_TAIL.set(&mut word, compact.tail); + WORD_CHECKSUM.set(&mut word, 0); + word.to_ne_bytes() + } + + /// Builds the storage representation of an internal entry. + pub fn build_internal(&self, internal: Internal) -> [u8; WORD_SIZE] { + let mut word = ERASED_WORD; + match internal { + Internal::Erase { page } => { + ID_ERASE.set(&mut word); + ERASE_PAGE.set(&mut word, page); + } + Internal::Clear { min_key } => { + ID_CLEAR.set(&mut word); + CLEAR_MIN_KEY.set(&mut word, min_key); + } + Internal::Marker { count } => { + ID_MARKER.set(&mut word); + MARKER_COUNT.set(&mut word, count); + } + Internal::Remove { key } => { + ID_REMOVE.set(&mut word); + REMOVE_KEY.set(&mut word, key); + } + } + WORD_CHECKSUM.set(&mut word, 0); + word.to_ne_bytes() + } + + /// Parses the first word of an entry from its storage representation. + pub fn parse_word(&self, word: &[u8]) -> StoreResult> { + let word = slice_to_word(word); + let valid = if ID_PADDING.check(word) { + ParsedWord::Padding(Padding { length: 0 }) + } else if ID_HEADER.check(word) { + if HEADER_DELETED.get(word) { + let length = HEADER_LENGTH.get(word); + if length > self.max_value_len() { + return Err(StoreError::InvalidStorage); + } + let length = self.bytes_to_words(length); + ParsedWord::Padding(Padding { length }) + } else { + let flipped = HEADER_FLIPPED.get(word); + let length = HEADER_LENGTH.get(word); + let key = HEADER_KEY.get(word); + let checksum = HEADER_CHECKSUM.get(word)?; + ParsedWord::Header(Header { + flipped, + length, + key, + checksum, + }) + } + } else if ID_ERASE.check(word) { + let page = ERASE_PAGE.get(word); + ParsedWord::Internal(Internal::Erase { page }) + } else if ID_CLEAR.check(word) { + let min_key = CLEAR_MIN_KEY.get(word); + ParsedWord::Internal(Internal::Clear { min_key }) + } else if ID_MARKER.check(word) { + let count = MARKER_COUNT.get(word); + 
ParsedWord::Internal(Internal::Marker { count }) + } else if ID_REMOVE.check(word) { + let key = REMOVE_KEY.get(word); + ParsedWord::Internal(Internal::Remove { key }) + } else if word == ERASED_WORD { + return Ok(WordState::Erased); + } else { + return Ok(WordState::Partial); + }; + if let ParsedWord::Internal(internal) = &valid { + if WORD_CHECKSUM.get(word)? != 0 { + return Ok(WordState::Partial); + } + let invalid = match internal { + Internal::Erase { page } => *page > self.max_page(), + Internal::Clear { min_key } => *min_key > self.max_key(), + Internal::Marker { count } => *count > MAX_UPDATES, + Internal::Remove { key } => *key > self.max_key(), + }; + if invalid { + return Err(StoreError::InvalidStorage); + } + } + Ok(WordState::Valid(valid)) + } + + /// Builds the storage representation of a user entry. + pub fn build_user(&self, key: usize, value: &[u8]) -> Vec { + let length = value.len(); + let word_size = self.word_size(); + let footer = self.bytes_to_words(length); + let mut result = vec![0xff; (1 + footer) * word_size]; + result[word_size..][..length].copy_from_slice(value); + let mut word = ERASED_WORD; + ID_HEADER.set(&mut word); + if footer > 0 && is_erased(&result[footer * word_size..]) { + HEADER_FLIPPED.set(&mut word); + *result.last_mut().unwrap() = 0x7f; + } + HEADER_LENGTH.set(&mut word, length); + HEADER_KEY.set(&mut word, key); + HEADER_CHECKSUM.set(&mut word, count_zeros(&result[footer * word_size..])); + result[..word_size].copy_from_slice(&word.to_ne_bytes()); + result + } + + /// Sets the padding bit in the first word of a user entry. + /// + /// The word is taken as a slice for convenience. + /// + /// # Panics + /// + /// Panics if `slice.len() != WORD_SIZE`. + pub fn set_padding(&self, slice: &mut [u8]) { + let mut word = slice_to_word(slice); + ID_PADDING.set(&mut word); + slice.copy_from_slice(&word.to_ne_bytes()); + } + + /// Sets the deleted bit in the first word of a user entry. 
+ /// + /// The word is taken as a slice for convenience. + /// + /// # Panics + /// + /// Panics if `slice.len() != WORD_SIZE`. + pub fn set_deleted(&self, slice: &mut [u8]) { + let mut word = slice_to_word(slice); + HEADER_DELETED.set(&mut word); + slice.copy_from_slice(&word.to_ne_bytes()); + } + + /// Returns the minimum number of words to represent a given number of bytes. + pub fn bytes_to_words(&self, bytes: usize) -> usize { + div_ceil(bytes, self.word_size()) + } +} + +/// The word index of the init info in a page. +const INIT_WORD: usize = 0; + +/// The word index of the compact info in a page. +const COMPACT_WORD: usize = 1; + +/// The word index of the content of a page. +const CONTENT_WORD: usize = 2; + +/// The checksum for a single word. +/// +/// It needs 5 bits to store numbers between 0 and 27. +const WORD_CHECKSUM: Checksum = Checksum { + field: Field { pos: 27, len: 5 }, +}; + +// The fields of the init info of a page. +bitfield! { + /// The number of times the page has been erased. + INIT_CYCLE: Field <= MAX_CYCLE, + + /// The word index of the first entry in this virtual page. + INIT_PREFIX: Field <= div_ceil(MAX_VALUE_LEN, WORD_SIZE), + + #[cfg(test)] + LEN_INIT: Length, +} + +// The fields of the compact info of a page. +bitfield! { + /// The distance in words between head and tail at compaction. + /// + /// In particular, compaction copies non-deleted user entries from the head to the tail as long + /// as entries span the page to be compacted. + COMPACT_TAIL: Field <= MAX_VIRT_PAGE_SIZE * MAX_PAGE, + + #[cfg(test)] + LEN_COMPACT: Length, +} + +// Overview of the first word of the different kind of entries: +// +// 0123456789abcdef0123456789abcdef +// padding 0 +// header 10.............................. +// erase 11000........... +// clear 11001................. +// marker 11010.......... +// remove 11011................. +// +// NOTE: We could pad the internal entries to the right. + +// The fields of a padding entry. +bitfield! 
{ + /// The identifier for padding entries. + ID_PADDING: ConstField = [0], +} + +// The fields of a user entry. +bitfield! { + /// The identifier for user entries. + ID_HEADER: ConstField = [1 0], + + /// Whether the user entry is deleted. + HEADER_DELETED: Bit, + + /// Whether the last bit of the user data is flipped. + HEADER_FLIPPED: Bit, + + /// The length in bytes of the user data. + // NOTE: It is possible to support values of length 1024 by having a separate kind of entries + // when the value is empty. We could then subtract one from the length here. + HEADER_LENGTH: Field <= MAX_VALUE_LEN, + + /// The key of the user entry. + HEADER_KEY: Field <= MAX_KEY, + + /// The checksum of the user entry. + /// + /// This counts the number of bits set to zero in both the first and last words of the user + /// entry, except in the checksum itself. So it needs 6 bits to store numbers between 0 and 58. + // NOTE: It may be possible to save one bit by storing: + // - the footer checksum (as a field) if the value is not empty + // - the header checksum (as a checksum) if the value is empty + HEADER_CHECKSUM: Checksum <= 58, + + #[cfg(test)] + LEN_HEADER: Length, +} + +// The fields of an erase entry. +bitfield! { + /// The identifier for erase entries. + ID_ERASE: ConstField = [1 1 0 0 0], + + /// The page to be erased. + ERASE_PAGE: Field <= MAX_PAGE, + + #[cfg(test)] + LEN_ERASE: Length, +} + +// The fields of a clear entry. +bitfield! { + /// The identifier for clear entries. + ID_CLEAR: ConstField = [1 1 0 0 1], + + /// The minimum key to be cleared. + /// + /// All entries with a key below this limit are not cleared. All other entries are deleted. + CLEAR_MIN_KEY: Field <= MAX_KEY, + + #[cfg(test)] + LEN_CLEAR: Length, +} + +// The fields of a marker entry. +bitfield! { + /// The identifier for marker entries. + ID_MARKER: ConstField = [1 1 0 1 0], + + /// The number of updates in this transaction. + /// + /// The update entries follow this marker entry. 
+ MARKER_COUNT: Field <= MAX_UPDATES, + + #[cfg(test)] + LEN_MARKER: Length, +} + +// The fields of a remove entry. +bitfield! { + /// The identifier for remove entries. + ID_REMOVE: ConstField = [1 1 0 1 1], + + /// The key of the user entry to be removed. + REMOVE_KEY: Field <= MAX_KEY, + + #[cfg(test)] + LEN_REMOVE: Length, +} + +/// The position of a word in the virtual storage. +/// +/// With the notations defined in `Format`, let: +/// - `w` a virtual word offset in a page which is between `0` and `Q - 1` +/// - `p` a page offset which is between `0` and `N - 1` +/// - `c` the number of erase cycles of a page which is between `0` and `E` +/// +/// Then the position of a word is `(c*N + p)*Q + w`. This position monotonically increases and +/// represents the consumed lifetime of the storage. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct Position(usize); + +impl core::ops::Add for Position { + type Output = Position; + + fn add(self, delta: usize) -> Position { + Position(self.0 + delta) + } +} + +impl core::ops::Sub for Position { + type Output = usize; + + fn sub(self, base: Position) -> usize { + self.0 - base.0 + } +} + +impl core::ops::AddAssign for Position { + fn add_assign(&mut self, delta: usize) { + self.0 += delta; + } +} + +impl Position { + /// Create a word position given its coordinates. + /// + /// The coordinates of a word are: + /// - Its word index in its page. + /// - Its page index in the storage. + /// - The number of times that page was erased. + pub fn new(format: &Format, cycle: usize, page: usize, word: usize) -> Position { + Position((cycle * format.num_pages() + page) * format.virt_page_size() + word) + } + + /// Accesses the underlying position as a natural number. + pub fn get(self) -> usize { + self.0 + } + + /// Returns the associated storage index. 
+ pub fn index(self, format: &Format) -> StorageIndex { + let page = self.page(format); + let word = CONTENT_WORD + self.word(format); + let byte = word * format.word_size(); + StorageIndex { page, byte } + } + + /// Returns the beginning of the current virtual page. + pub fn page_begin(self, format: &Format) -> Position { + let virt_page_size = format.virt_page_size(); + Position((self.0 / virt_page_size) * virt_page_size) + } + + /// Returns the beginning of the next virtual page. + pub fn next_page(self, format: &Format) -> Position { + let virt_page_size = format.virt_page_size(); + Position((self.0 / virt_page_size + 1) * virt_page_size) + } + + /// Returns the number of times the current page was erased. + pub fn cycle(self, format: &Format) -> usize { + (self.0 / format.virt_page_size()) / format.num_pages() + } + + /// Returns the current page index. + pub fn page(self, format: &Format) -> usize { + (self.0 / format.virt_page_size()) % format.num_pages() + } + + /// Returns the current word index in the page. + pub fn word(self, format: &Format) -> usize { + self.0 % format.virt_page_size() + } +} + +/// Possible states of some storage representation as a word. +pub enum WordState { + /// The word is still erased. + Erased, + + /// The word is partially written. + Partial, + + /// Holds the decoded version of a valid word. + Valid(T), +} + +/// Information for an initialized page. +pub struct InitInfo { + /// The number of times this page has been erased. + pub cycle: usize, + + /// The word index of the first entry in this virtual page. + pub prefix: usize, +} + +/// Information for a page being compacted. +pub struct CompactInfo { + /// The distance in words between head and tail at compaction. + pub tail: usize, +} + +/// The first word of an entry. +#[derive(Debug)] +pub enum ParsedWord { + /// Padding entry. + Padding(Padding), + + /// Header of a user entry. + Header(Header), + + /// Internal entry. + Internal(Internal), +} + +/// Padding entry. 
+#[derive(Debug)] +pub struct Padding { + /// The number of following padding words after the first word of the padding entry. + pub length: usize, +} + +/// Header of a user entry. +#[derive(Debug)] +pub struct Header { + /// Whether the last bit of the user data is flipped. + pub flipped: bool, + + /// The length in bytes of the user data. + pub length: usize, + + /// The key of the user entry. + pub key: usize, + + /// The checksum of the user entry. + pub checksum: usize, +} + +impl Header { + /// Checks the validity of a user entry. + /// + /// If the user entry has no payload, the `footer` must be set to `None`. Otherwise it should be + /// the last word of the entry. + pub fn check(&self, footer: Option<&[u8]>) -> bool { + footer.map_or(0, |x| count_zeros(x)) == self.checksum + } +} + +/// Internal entry. +#[derive(Debug)] +pub enum Internal { + /// Indicates that a page should be erased. + Erase { + /// The page to be erased. + page: usize, + }, + + /// Indicates that user entries with high key should be deleted. + Clear { + /// The minimum key a user entry should have to be deleted. + min_key: usize, + }, + + /// Marks the start of a transaction. + /// + /// The marker is followed by a given number of updates, which are either user entries or remove + /// entries. + Marker { + /// The number of updates in the transaction. + count: usize, + }, + + /// Indicates that a user entry should be removed. + /// + /// This is only useful (and valid) as part of a transaction, since removing a single entry is + /// already atomic. + Remove { + /// The key of the user entry to be removed. + key: usize, + }, +} + +/// Returns whether a slice has all bits equal to one. +pub fn is_erased(slice: &[u8]) -> bool { + slice.iter().all(|&x| x == 0xff) +} + +/// Converts a word slice into a word. +/// +/// # Panics +/// +/// Panics if `word.len() != WORD_SIZE`. 
+fn slice_to_word(word: &[u8]) -> WORD { + u32::from_ne_bytes(<[u8; WORD_SIZE]>::try_from(word).unwrap()) +} + +/// Divides then takes ceiling. +/// +/// Returns `ceil(x / m)` with mathematical notations. +pub const fn div_ceil(x: usize, m: usize) -> usize { + (x + m - 1) / m +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn size_of_format() { + assert_eq!(std::mem::size_of::(), 24); + } + + #[test] + fn checksum_ok() { + let Field { pos, len } = WORD_CHECKSUM.field; + // There is enough bits to represents the number of zeros preceding the checksum. + assert_eq!(len, num_bits(pos)); + // The checksum is the last field of a word. + assert_eq!(pos + len, 8 * WORD_SIZE); + // The data of words using the checksum don't overlap the checksum. + let words = &[ + &LEN_INIT, + &LEN_COMPACT, + &LEN_ERASE, + &LEN_CLEAR, + &LEN_MARKER, + &LEN_REMOVE, + ]; + for word in words { + assert!(word.pos < pos); + } + } + + #[test] + fn init_ok() { + assert_eq!(INIT_CYCLE.pos, 0); + assert_eq!(INIT_CYCLE.len, 16); + assert_eq!(INIT_PREFIX.pos, 16); + assert_eq!(INIT_PREFIX.len, 9); + assert_eq!(LEN_INIT.pos, 25); + } + + #[test] + fn compact_ok() { + assert_eq!(COMPACT_TAIL.pos, 0); + assert_eq!(COMPACT_TAIL.len, 16); + assert_eq!(LEN_COMPACT.pos, 16); + } + + #[test] + fn header_ok() { + assert_eq!(ID_HEADER.field.pos, 0); + assert_eq!(ID_HEADER.field.len, 2); + assert_eq!(ID_HEADER.value, 0b01); + assert_eq!(HEADER_DELETED.pos, 2); + assert_eq!(HEADER_FLIPPED.pos, 3); + assert_eq!(HEADER_LENGTH.pos, 4); + assert_eq!(HEADER_LENGTH.len, 10); + assert_eq!(HEADER_KEY.pos, 14); + assert_eq!(HEADER_KEY.len, 12); + assert_eq!(HEADER_CHECKSUM.field.pos, 26); + assert_eq!(HEADER_CHECKSUM.field.len, 6); + assert_eq!(LEN_HEADER.pos, 32); + } + + #[test] + fn erase_ok() { + assert_eq!(ID_ERASE.field.pos, 0); + assert_eq!(ID_ERASE.field.len, 5); + assert_eq!(ID_ERASE.value, 0b00011); + assert_eq!(ERASE_PAGE.pos, 5); + assert_eq!(ERASE_PAGE.len, 6); + assert_eq!(LEN_ERASE.pos, 11); 
+ } + + #[test] + fn clear_ok() { + assert_eq!(ID_CLEAR.field.pos, 0); + assert_eq!(ID_CLEAR.field.len, 5); + assert_eq!(ID_CLEAR.value, 0b10011); + assert_eq!(CLEAR_MIN_KEY.pos, 5); + assert_eq!(CLEAR_MIN_KEY.len, 12); + assert_eq!(LEN_CLEAR.pos, 17); + } + + #[test] + fn marker_ok() { + assert_eq!(ID_MARKER.field.pos, 0); + assert_eq!(ID_MARKER.field.len, 5); + assert_eq!(ID_MARKER.value, 0b01011); + assert_eq!(MARKER_COUNT.pos, 5); + assert_eq!(MARKER_COUNT.len, 5); + assert_eq!(LEN_MARKER.pos, 10); + } + + #[test] + fn remove_ok() { + assert_eq!(ID_REMOVE.field.pos, 0); + assert_eq!(ID_REMOVE.field.len, 5); + assert_eq!(ID_REMOVE.value, 0b11011); + assert_eq!(REMOVE_KEY.pos, 5); + assert_eq!(REMOVE_KEY.len, 12); + assert_eq!(LEN_REMOVE.pos, 17); + } + + #[test] + fn is_erased_ok() { + assert!(is_erased(&[])); + assert!(is_erased(&[0xff])); + assert!(is_erased(&[0xff, 0xff])); + assert!(!is_erased(&[0x00])); + assert!(!is_erased(&[0xff, 0xfe])); + assert!(!is_erased(&[0x7f, 0xff])); + } + + #[test] + fn slice_to_word_ok() { + // We write test with little-endian in mind, but use this helper function to test regardless + // of endianness. 
+ fn test(slice: &[u8], word: u32) { + #[cfg(target_endian = "little")] + let word = word.swap_bytes(); + assert_eq!(slice_to_word(slice), word); + } + test(&[0x01, 0x02, 0x03, 0x04], 0x01020304); + test(&[0xf0, 0x78, 0x3c, 0x1e], 0xf0783c1e); + } + + #[test] + fn div_ceil_ok() { + assert_eq!(div_ceil(0, 1), 0); + assert_eq!(div_ceil(1, 1), 1); + assert_eq!(div_ceil(2, 1), 2); + assert_eq!(div_ceil(0, 2), 0); + assert_eq!(div_ceil(1, 2), 1); + assert_eq!(div_ceil(2, 2), 1); + assert_eq!(div_ceil(3, 2), 2); + } +} diff --git a/libraries/persistent_store/src/lib.rs b/libraries/persistent_store/src/lib.rs index 954e66f..242a218 100644 --- a/libraries/persistent_store/src/lib.rs +++ b/libraries/persistent_store/src/lib.rs @@ -12,10 +12,336 @@ // See the License for the specific language governing permissions and // limitations under the License. +// TODO(ia0): Add links once the code is complete. +//! Store abstraction for flash storage +//! +//! # Specification +//! +//! The store provides a partial function from keys to values on top of a storage +//! interface. The store total capacity depends on the size of the storage. Store +//! updates may be bundled in transactions. Mutable operations are atomic, including +//! when interrupted. +//! +//! The store is flash-efficient in the sense that it uses the storage lifetime +//! efficiently. For each page, all words are written at least once between erase +//! cycles and all erase cycles are used. However, not all written words are user +//! content: lifetime is also consumed with metadata and compaction. +//! +//! The store is extendable with other entries than key-values. It is essentially a +//! framework providing access to the storage lifetime. The partial function is +//! simply the most common usage and can be used to encode other usages. +//! +//! ## Definitions +//! +//! An _entry_ is a pair of a key and a value. A _key_ is a number between 0 +//! and 4095. 
A _value_ is a byte slice with a length between 0 and 1023 bytes (for +//! large enough pages). +//! +//! The store provides the following _updates_: +//! - Given a key and a value, `Insert` updates the store such that the value is +//! associated with the key. The values of other keys are left unchanged. +//! - Given a key, `Remove` updates the store such that no value is associated with +//! the key. The values of other keys are left unchanged. Additionally, if there +//! was a value associated with the key, the value is wiped from the storage +//! (all its bits are set to 0). +//! +//! The store provides the following _read-only operations_: +//! - `Iter` iterates through the store returning all entries exactly once. The +//! iteration order is not specified but stable between mutable operations. +//! - `Capacity` returns how many words can be stored before the store is full. +//! - `Lifetime` returns how many words can be written before the storage lifetime +//! is consumed. +//! +//! The store provides the following _mutable operations_: +//! - Given a set of independent updates, `Transaction` applies the sequence of +//! updates. +//! - Given a threshold, `Clear` removes all entries with a key greater than or equal +//! to the threshold. +//! - Given a length in words, `Prepare` makes one step of compaction unless that +//! many words can be written without compaction. This operation has no effect +//! on the store but may still mutate its storage. In particular, the store has +//! the same capacity but a possibly reduced lifetime. +//! +//! A mutable operation is _atomic_ if, when power is lost during the operation, the +//! store is either updated (as if the operation succeeded) or left unchanged (as if +//! the operation did not occur). If the store is left unchanged, lifetime may still +//! be consumed. +//! +//! The store relies on the following _storage interface_: +//! - It is possible to read a byte slice. The slice won't span multiple pages. +//!
- It is possible to write a word slice. The slice won't span multiple pages. +//! - It is possible to erase a page. +//! - The pages are sequentially indexed from 0. If the actual underlying storage +//! is segmented, then the storage layer should translate those indices to +//! actual page addresses. +//! +//! The store has a _total capacity_ of `C = (N - 1) * (P - 4) - M - 1` words, where +//! `P` is the number of words per page, `N` is the number of pages, and `M` is the +//! maximum length in words of a value (256 for large enough pages). The capacity +//! used by each mutable operation is given below (a transient word only uses +//! capacity during the operation): +//! - `Insert` uses `1 + ceil(len / 4)` words where `len` is the length of the +//! value in bytes. If an entry was replaced, the words used by its insertion +//! are freed. +//! - `Remove` doesn't use capacity if alone in the transaction and 1 transient +//! word otherwise. If an entry was deleted, the words used by its insertion are +//! freed. +//! - `Transaction` uses 1 transient word. In addition, the updates of the +//! transaction use and free words as described above. +//! - `Clear` doesn't use capacity and frees the words used by the insertion of +//! the deleted entries. +//! - `Prepare` doesn't use capacity. +//! +//! The _total lifetime_ of the store is below `L = ((E + 1) * N - 1) * (P - 2)` and +//! above `L - M` words, where `E` is the maximum number of erase cycles. The +//! lifetime is used when capacity is used, including transiently, as well as when +//! compaction occurs. The more the store is loaded (few remaining words of +//! capacity), the more compactions are frequent, and the more lifetime is used. +//! +//! It is possible to approximate the cost of transient words in terms of capacity: +//! `L` transient words are equivalent to `C - x` words of capacity where `x` is the +//! average capacity (including transient) of operations. +//! +//! ## Preconditions +//! +//! 
The store may behave in unexpected ways if the following assumptions don't hold: +//! - A word can be written twice between erase cycles. +//! - A page can be erased `E` times after the first boot of the store. +//! - When power is lost while writing a slice or erasing a page, the next read +//! returns a slice where a subset (possibly none or all) of the bits that +//! should have been modified have been modified. +//! - Reading a slice is deterministic. When power is lost while writing a slice +//! or erasing a slice (erasing a page containing that slice), reading that +//! slice repeatedly returns the same result (until it is overwritten or its +//! page is erased). +//! - To decide whether a page has been erased, it is enough to test if all its +//! bits are equal to 1. +//! - When power is lost while writing a slice or erasing a page, that operation +//! does not count towards the limits. However, completing that write or erase +//! operation would count towards the limits, as if the number of writes per +//! word and number of erase cycles could be fractional. +//! - The storage is only modified by the store. Note that completely erasing the +//! storage is supported, essentially losing all content and lifetime tracking. +//! It is preferred to use `Clear` with a threshold of 0 to keep the lifetime +//! tracking. +//! +//! The store properties may still hold outside some of those assumptions but with +//! weaker probabilities as the usage diverges from them. +//! +//! # Implementation +//! +//! We define the following constants: +//! - `E < 65536` the number of times a page can be erased. +//! - `3 <= N < 64` the number of pages in the storage. +//! - `8 <= P <= 1024` the number of words in a page. +//! - `Q = P - 2` the number of words in a virtual page. +//! - `K = 4096` the maximum number of keys. +//! - `M = min(Q - 1, 256)` the maximum length in words of a value. +//! - `V = (N - 1) * (Q - 1) - M` the virtual capacity. +//! 
- `C = V - N` the user capacity. +//! +//! We build a virtual storage from the physical storage using the first 2 words of +//! each page: +//! - The first word contains the number of times the page has been erased. +//! - The second word contains the starting word to which this page is being moved +//! during compaction. +//! +//! The virtual storage has a length of `(E + 1) * N * Q` words and represents the +//! lifetime of the store. (We reserve the last `Q + M` words to support adding +//! emergency lifetime.) This virtual storage has a linear address space. +//! +//! We define a set of overlapping windows of `N * Q` words at each `Q`-aligned +//! boundary. We call `i` the window spanning from `i * Q` to `(i + N) * Q`. Only +//! those windows actually exist in the underlying storage. We use compaction to +//! shift the current window from `i` to `i + 1`, preserving the content of the +//! store. +//! +//! For a given state of the virtual storage, we define `h_i` as the position of the +//! first entry of the window `i`. We call it the head of the window `i`. Because +//! entries are at most `M + 1` words, they can overlap on the next page only by `M` +//! words. So we have `i * Q <= h_i <= i * Q + M` . Since there are no entries +//! before the first page, we have `h_0 = 0`. +//! +//! We define `t_i` as one past the last entry of the window `i`. If there are no +//! entries in that window, we have `t_i = h_i`. We call `t_i` the tail of the +//! window `i`. We define the compaction invariant as `t_i - h_i <= V`. +//! +//! We define `|x|` as the capacity used before position `x`. We have `|x| <= x`. We +//! define the capacity invariant as `|t_i| - |h_i| <= C`. +//! +//! Using this virtual storage, entries are appended to the tail as long as there is +//! both virtual capacity to preserve the compaction invariant and capacity to +//! preserve the capacity invariant. When virtual capacity runs out, the first page +//! 
of the window is compacted and the window is shifted. +//! +//! Entries are identified by a prefix of bits. The prefix has to contain at least +//! one bit set to zero to differentiate from the tail. Entries can be one of: +//! - Padding: A word whose first bit is set to zero. The rest is arbitrary. This +//! entry is used to mark words partially written after an interrupted operation +//! as padding such that they are ignored by future operations. +//! - Header: A word whose second bit is set to zero. It contains the following fields: +//! - A bit indicating whether the entry is deleted. +//! - A bit indicating whether the value is word-aligned and has all bits set +//! to 1 in its last word. The last word of an entry is used to detect that +//! an entry has been fully written. As such it must contain at least one +//! bit equal to zero. +//! - The key of the entry. +//! - The length in bytes of the value. The value follows the header. The +//! entry is word-aligned if the value is not. +//! - The checksum of the first and last word of the entry. +//! - Erase: A word used during compaction. It contains the page to be erased and +//! a checksum. +//! - Clear: A word used during the `Clear` operation. It contains the threshold +//! and a checksum. +//! - Marker: A word used during the `Transaction` operation. It contains the +//! number of updates following the marker and a checksum. +//! - Remove: A word used during the `Transaction` operation. It contains the key +//! of the entry to be removed and a checksum. +//! +//! Checksums are the number of bits equal to 0. +//! +//! # Proofs +//! +//! ## Compaction +//! +//! It should always be possible to fully compact the store, after which the +//! remaining capacity should be available in the current window (restoring the +//! compaction invariant). We consider all notations on the virtual storage after +//! the full compaction. We will use the `|x|` notation although we update the state +//! of the virtual storage.
This is fine because compaction doesn't change the +//! status of an existing word. +//! +//! We want to show that the next `N - 1` compactions won't move the tail past the +//! last page of their window, with `I` the initial window: +//! +//! ```text +//! forall 1 <= i <= N - 1, t_{I + i} <= (I + i + N - 1) * Q +//! ``` +//! +//! We assume `i` between `1` and `N - 1`. +//! +//! One step of compaction advances the tail by how many words were used in the +//! first page of the window with the last entry possibly overlapping on the next +//! page. +//! +//! ```text +//! forall j, t_{j + 1} = t_j + |h_{j + 1}| - |h_j| + 1 +//! ``` +//! +//! By induction, we have: +//! +//! ```text +//! t_{I + i} = t_I + |h_{I + i}| - |h_I| + i +//! ``` +//! +//! We have the following properties: +//! +//! ```text +//! t_I <= h_I + V +//! |h_{I + i}| - |h_I| <= h_{I + i} - h_I +//! h_{I + i} <= (I + i) * Q + M +//! ``` +//! +//! Replacing into our previous equality, we can conclude: +//! +//! ```text +//! t_{I + i} = t_I + |h_{I + i}| - |h_I| + i +//! <= h_I + V + (I + i) * Q + M - h_I + i +//! = (N - 1) * (Q - 1) - M + (I + i) * Q + M + i +//! = (N - 1) * (Q - 1) + (I + i) * Q + i +//! = (I + i + N - 1) * Q + i - (N - 1) +//! <= (I + i + N - 1) * Q +//! ``` +//! +//! We also want to show that after `N - 1` compactions, the remaining capacity is +//! available without compaction. +//! +//! ```text +//! V - (t_{I + N - 1} - h_{I + N - 1}) >= // The available words in the window. +//! C - (|t_{I + N - 1}| - |h_{I + N - 1}|) // The remaining capacity. +//! + 1 // Reserved for Clear. +//! ``` +//! +//! We can replace the definition of `C` and simplify: +//! +//! ```text +//! V - (t_{I + N - 1} - h_{I + N - 1}) >= V - N - (|t_{I + N - 1}| - |h_{I + N - 1}|) + 1 +//! iff t_{I + N - 1} - h_{I + N - 1} <= |t_{I + N - 1}| - |h_{I + N - 1}| + N - 1 +//! ``` +//! +//! We have the following properties: +//! +//! ```text +//! t_{I + N - 1} = t_I + |h_{I + N - 1}| - |h_I| + N - 1 +//!
|t_{I + N - 1}| - |h_{I + N - 1}| = |t_I| - |h_I| // Compaction preserves capacity. +//! |h_{I + N - 1}| - |t_I| <= h_{I + N - 1} - t_I +//! ``` +//! +//! From which we conclude: +//! +//! ```text +//! t_{I + N - 1} - h_{I + N - 1} <= |t_{I + N - 1}| - |h_{I + N - 1}| + N - 1 +//! iff t_I + |h_{I + N - 1}| - |h_I| + N - 1 - h_{I + N - 1} <= |t_I| - |h_I| + N - 1 +//! iff t_I + |h_{I + N - 1}| - h_{I + N - 1} <= |t_I| +//! iff |h_{I + N - 1}| - |t_I| <= h_{I + N - 1} - t_I +//! ``` +//! +//! +//! ## Checksum +//! +//! The main property we want is that all partially written/erased words are either +//! the initial word, the final word, or invalid. +//! +//! We say that a bit sequence `TARGET` is reachable from a bit sequence `SOURCE` if +//! both have the same length and `SOURCE & TARGET == TARGET` where `&` is the +//! bitwise AND operation on bit sequences of that length. In other words, when +//! `SOURCE` has a bit equal to 0 then `TARGET` also has that bit equal to 0. +//! +//! The only written entries start with `101` or `110` and are written from an +//! erased word. Marking an entry as padding or deleted is a single bit operation, +//! so the property trivially holds. For those cases, the proof relies on the fact +//! that there is exactly one bit equal to 0 in the 3 first bits. Either the 3 first +//! bits are still `111` in which case we expect the remaining bits to be equal +//! to 1. Otherwise we can use the checksum of the given type of entry because those +//! 2 types of entries are not reachable from each other. +//! +//! To show that valid entries of a given type are not reachable from each other, we +//! show 3 lemmas: +//! +//! 1. A bit sequence is not reachable from another if its number of bits equal to +//! 0 is smaller. +//! +//! 2. A bit sequence is not reachable from another if they have the same number of +//! bits equal to 0 and are different. +//! +//! 3. A bit sequence is not reachable from another if it is bigger when they are +//!
interpreted as numbers in binary representation. +//! +//! From those lemmas we consider the 2 cases. If both entries have the same number +//! of bits equal to 0, they are either equal or not reachable from each other +//! because of the second lemma. If they don't have the same number of bits equal to +//! 0, then the one with fewer bits equal to 0 is not reachable from the other +//! because of the first lemma and the one with more bits equal to 0 is not +//! reachable from the other because of the third lemma and the definition of the +//! checksum. +//! +//! # Fuzzing +//! +//! For any sequence of operations and interruptions starting from an erased +//! storage, the store is checked against its model and some internal invariant at +//! each step. +//! +//! For any sequence of operations and interruptions starting from an arbitrary +//! storage, the store is checked not to crash. + #![cfg_attr(not(feature = "std"), no_std)] +#[macro_use] +extern crate alloc; + #[macro_use] mod bitfield; +mod format; mod storage; mod store;