Skip to content

index V2 writing finalization #479

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Aug 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions git-index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ bstr = { version = "0.2.13", default-features = false }
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
smallvec = "1.7.0"
atoi = "1.0.0"
itoa = "1.0.3"
bitflags = "1.3.2"

document-features = { version = "0.2.0", optional = true }
Expand Down
3 changes: 2 additions & 1 deletion git-index/src/decode/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entrie

use crate::{util::from_be_u32, Version};

pub(crate) const SIGNATURE: &[u8] = b"DIRC";

mod error {

/// The error produced when failing to decode an index header.
Expand All @@ -23,7 +25,6 @@ pub(crate) fn decode(data: &[u8], object_hash: git_hash::Kind) -> Result<(Versio
));
}

const SIGNATURE: &[u8] = b"DIRC";
let (signature, data) = data.split_at(4);
if signature != SIGNATURE {
return Err(Error::Corrupt(
Expand Down
28 changes: 18 additions & 10 deletions git-index/src/entry/flags.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
use crate::entry::Stage;
use crate::Version;
use bitflags::bitflags;

bitflags! {
/// In-memory flags
pub struct Flags: u32 {
/// The mask to apply to obtain the stage number of an entry.
const STAGE_MASK = 0x3000;
/// If set, additional bits need to be written to storage.
const EXTENDED = 0x4000;
// TODO: could we use the pathlen ourselves to save 8 bytes? And how to handle longer paths than that? 0 as sentinel maybe?
/// The mask to obtain the length of the path associated with this entry.
const PATH_LEN = 0x0fff;
Expand Down Expand Up @@ -49,9 +50,6 @@ bitflags! {
/// Stored at rest
const SKIP_WORKTREE = 1 << 30;

/// flags that need to be stored on disk in a V3 formatted index.
const EXTENDED_FLAGS = 1 << 29 | 1 << 30;

/// For future extension
const EXTENDED_2 = 1 << 31;
}
Expand All @@ -64,10 +62,17 @@ impl Flags {
}

/// Transform ourselves to a storage representation to keep all flags which are to be persisted,
/// with the caller intending to write `version`.
pub fn to_storage(&self, version: Version) -> at_rest::Flags {
assert_eq!(version, Version::V2, "Can only encode V2 flags at the moment");
at_rest::Flags::from_bits(self.bits() as u16).unwrap()
/// skipping all extended flags. Note that the caller has to check for the `EXTENDED` bit to be present
/// and write extended flags as well if so.
pub fn to_storage(mut self) -> at_rest::Flags {
at_rest::Flags::from_bits(
{
self.remove(Self::PATH_LEN);
self
}
.bits() as u16,
)
.unwrap()
}
}

Expand All @@ -89,8 +94,7 @@ pub(crate) mod at_rest {

impl Flags {
pub fn to_memory(self) -> super::Flags {
super::Flags::from_bits((self & (Flags::PATH_LEN | Flags::STAGE_MASK | Flags::ASSUME_VALID)).bits as u32)
.expect("PATHLEN is part of memory representation")
super::Flags::from_bits(self.bits as u32).expect("PATHLEN is part of memory representation")
}
}

Expand All @@ -103,6 +107,10 @@ pub(crate) mod at_rest {
}

impl FlagsExtended {
/// Build the extended storage flags from the in-memory `flags`.
///
/// Only `INTENT_TO_ADD` and `SKIP_WORKTREE` are carried over; every other bit of `flags` is masked out
/// before the value is shifted right by 16 bits to fit into the 16-bit storage representation.
///
/// # Panics
///
/// If the shifted bits are not a valid combination of extended flags, which is not expected to happen
/// given the mask applied beforehand.
pub fn from_flags(flags: super::Flags) -> Self {
    let extended_only = super::Flags::INTENT_TO_ADD | super::Flags::SKIP_WORKTREE;
    let shifted = (flags & extended_only).bits >> 16;
    Self::from_bits(shifted as u16).expect("valid")
}
/// Convert these extended storage flags into in-memory flags by shifting them left by 16 bits.
///
/// Returns `None` if the shifted value contains bits that are unknown to the in-memory flags.
pub fn to_flags(self) -> Option<super::Flags> {
    let widened = u32::from(self.bits) << 16;
    super::Flags::from_bits(widened)
}
Expand Down
3 changes: 0 additions & 3 deletions git-index/src/entry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,3 @@ mod _impls {
}
}
}

#[cfg(test)]
mod tests;
13 changes: 0 additions & 13 deletions git-index/src/entry/tests.rs

This file was deleted.

29 changes: 17 additions & 12 deletions git-index/src/entry/write.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use crate::{Entry, State, Version};
use crate::{entry, Entry, State};
use std::convert::TryInto;

impl Entry {
/// Serialize ourselves to `out` with path access via `state`.
/// Serialize ourselves to `out` with path access via `state`, without padding.
pub fn write_to(&self, mut out: impl std::io::Write, state: &State) -> std::io::Result<()> {
let stat = self.stat;
out.write_all(&stat.ctime.secs.to_be_bytes())?;
Expand All @@ -17,16 +17,21 @@ impl Entry {
out.write_all(&stat.size.to_be_bytes())?;
out.write_all(self.id.as_bytes())?;
let path = self.path(state);
let path_len: u16 = path
.len()
.try_into()
.expect("Cannot handle paths longer than 16bits ever");
assert!(
path_len <= 0xFFF,
"Paths can't be longer than 12 bits as they share space with bit flags in a u16"
);
let version = Version::V2; // TODO: don't hardcode once `to_storage()` can do its work without assertion
out.write_all(&(self.flags.to_storage(version).bits() | path_len).to_be_bytes())?;
let path_len: u16 = if path.len() >= entry::Flags::PATH_LEN.bits() as usize {
entry::Flags::PATH_LEN.bits() as u16
} else {
path.len()
.try_into()
.expect("we just checked that the length is smaller than 0xfff")
};
out.write_all(&(self.flags.to_storage().bits() | path_len).to_be_bytes())?;
if self.flags.contains(entry::Flags::EXTENDED) {
out.write_all(
&entry::at_rest::FlagsExtended::from_flags(self.flags)
.bits()
.to_be_bytes(),
)?;
}
out.write_all(path)?;
out.write_all(b"\0")
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,27 @@
use crate::{decode::header, extension, extension::Signature, util::from_be_u32};

pub const SIGNATURE: Signature = *b"EOIE";
pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes();
pub const SIZE_WITH_HEADER: usize = crate::extension::MIN_SIZE + SIZE;

use crate::decode::header;
use crate::extension;
use crate::extension::end_of_index_entry::{MIN_SIZE, MIN_SIZE_WITH_HEADER, SIGNATURE};
use crate::util::from_be_u32;

/// Decode the end of index entry extension, which is no more than a glorified offset to the first byte of all extensions to allow
/// loading entries and extensions in parallel.
///
/// Itself it's located at the end of the index file, which allows its location to be known and thus addressable.
/// From there it's possible to traverse the chunks of all set extensions, hash them, and compare that hash with all extensions
/// stored prior to this one to assure they are correct.
///
/// If the checksum wasn't matched, we will ignore this extension entirely.
pub fn decode(data: &[u8], object_hash: git_hash::Kind) -> Option<usize> {
let hash_len = object_hash.len_in_bytes();
if data.len() < SIZE_WITH_HEADER + hash_len {
if data.len() < MIN_SIZE_WITH_HEADER + hash_len {
return None;
}

let start_of_eoie = data.len() - SIZE_WITH_HEADER - hash_len;
let start_of_eoie = data.len() - MIN_SIZE_WITH_HEADER - hash_len;
let ext_data = &data[start_of_eoie..data.len() - hash_len];

let (signature, ext_size, ext_data) = extension::decode::header(ext_data);
if signature != SIGNATURE || ext_size as usize != SIZE {
if signature != SIGNATURE || ext_size as usize != MIN_SIZE {
return None;
}

Expand All @@ -26,7 +33,7 @@ pub fn decode(data: &[u8], object_hash: git_hash::Kind) -> Option<usize> {

let mut hasher = git_features::hash::hasher(git_hash::Kind::Sha1);
let mut last_chunk = None;
for (signature, chunk) in extension::Iter::new(&data[offset..data.len() - SIZE_WITH_HEADER - hash_len]) {
for (signature, chunk) in extension::Iter::new(&data[offset..data.len() - MIN_SIZE_WITH_HEADER - hash_len]) {
hasher.update(&signature);
hasher.update(&(chunk.len() as u32).to_be_bytes());
last_chunk = Some(chunk);
Expand Down
14 changes: 14 additions & 0 deletions git-index/src/extension/end_of_index_entry/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
use crate::{extension, extension::Signature};

/// The four-byte signature identifying the end-of-index-entry extension.
pub const SIGNATURE: Signature = *b"EOIE";
/// The minimal size of the extension's payload in bytes: a 4-byte offset to the extensions followed by a hash,
/// sized here for the shortest hash kind.
pub const MIN_SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::shortest().len_in_bytes();
/// The minimal size of the entire extension in bytes, that is `MIN_SIZE` along with the standard
/// extension header as given by `extension::MIN_SIZE`.
pub const MIN_SIZE_WITH_HEADER: usize = extension::MIN_SIZE + MIN_SIZE;

mod decode;
pub use decode::decode;

mod write;
pub use write::write_to;
30 changes: 30 additions & 0 deletions git-index/src/extension/end_of_index_entry/write.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use crate::extension::end_of_index_entry::SIGNATURE;
use crate::extension::Signature;

/// Write the end-of-index-entry extension to `out`, including a hash of kind `hash_kind` computed over the
/// `(signature, size)` pairs yielded by `prior_extensions`.
///
/// `offset_to_extensions` is the offset to the first byte past the entries, which is also the first byte of the
/// signature of the first extension in `prior_extensions`. All `prior_extensions` must have been written before
/// calling this function, allowing this extension to be the last one in the index file.
///
/// The extension is written unconditionally, even if `prior_extensions` yields nothing.
///
/// # Errors
///
/// Any error produced while writing to `out` is returned unchanged.
pub fn write_to(
    out: &mut impl std::io::Write,
    hash_kind: git_hash::Kind,
    offset_to_extensions: u32,
    prior_extensions: impl IntoIterator<Item = (Signature, u32)>,
) -> Result<(), std::io::Error> {
    // Extension header: the signature, then the payload size (4-byte offset + hash).
    let payload_len: u32 = 4 + hash_kind.len_in_bytes() as u32;
    out.write_all(&SIGNATURE)?;
    out.write_all(&payload_len.to_be_bytes())?;

    // Payload, first part: where the extensions begin.
    out.write_all(&offset_to_extensions.to_be_bytes())?;

    // Payload, second part: the hash over signature and size of each prior extension, in order.
    let mut checksum = git_features::hash::hasher(hash_kind);
    prior_extensions.into_iter().for_each(|(ext_signature, ext_size)| {
        checksum.update(&ext_signature);
        checksum.update(&ext_size.to_be_bytes());
    });
    out.write_all(&checksum.digest())
}
9 changes: 6 additions & 3 deletions git-index/src/extension/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use bstr::BString;
use smallvec::SmallVec;

const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */;
/// The size of the smallest possible extension, which is no more than a signature and a 0 indicating its size.
pub const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */;

/// The kind of index extension.
pub type Signature = [u8; 4];
Expand All @@ -25,7 +26,8 @@ pub struct Tree {
pub id: git_hash::ObjectId,
/// The amount of non-tree items in this directory tree, including sub-trees, recursively.
/// The value of the top-level tree is thus equal to the value of the total amount of entries.
pub num_entries: u32,
/// If `None`, the tree is considered invalid and needs to be refreshed.
pub num_entries: Option<u32>,
/// The child-trees below the current tree.
pub children: Vec<Tree>,
}
Expand Down Expand Up @@ -77,7 +79,8 @@ pub(crate) mod decode;
///
pub mod tree;

pub(crate) mod end_of_index_entry;
///
pub mod end_of_index_entry;

pub(crate) mod index_entry_offset_table;

Expand Down
16 changes: 12 additions & 4 deletions git-index/src/extension/tree/decode.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::extension::Tree;
use crate::util::{split_at_byte_exclusive, split_at_pos};
use git_hash::ObjectId;
use std::convert::TryInto;

/// A recursive data structure
pub fn decode(data: &[u8], object_hash: git_hash::Kind) -> Option<Tree> {
Expand All @@ -17,13 +18,20 @@ fn one_recursive(data: &[u8], hash_len: usize) -> Option<(Tree, &[u8])> {
let (path, data) = split_at_byte_exclusive(data, 0)?;

let (entry_count, data) = split_at_byte_exclusive(data, b' ')?;
let num_entries: u32 = atoi::atoi(entry_count)?;
let num_entries: i32 = atoi::atoi(entry_count)?;

let (subtree_count, data) = split_at_byte_exclusive(data, b'\n')?;
let subtree_count: usize = atoi::atoi(subtree_count)?;

let (hash, mut data) = split_at_pos(data, hash_len)?;
let id = ObjectId::from(hash);
let (id, mut data) = if num_entries >= 0 {
let (hash, data) = split_at_pos(data, hash_len)?;
(ObjectId::from(hash), data)
} else {
(
ObjectId::null(git_hash::Kind::from_hex_len(hash_len * 2).expect("valid hex_len")),
data,
)
};

let mut subtrees = Vec::with_capacity(subtree_count);
for _ in 0..subtree_count {
Expand All @@ -42,7 +50,7 @@ fn one_recursive(data: &[u8], hash_len: usize) -> Option<(Tree, &[u8])> {
Some((
Tree {
id,
num_entries,
num_entries: num_entries.try_into().ok(),
name: path.into(),
children: subtrees,
},
Expand Down
Loading