From eab421c93cc4d386a547825a9450da4b84f5d8f7 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 8 Jan 2022 12:51:46 +0800 Subject: [PATCH 01/57] first research on index reading (#293) --- crate-status.md | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/crate-status.md b/crate-status.md index b6883bea75e..ccebd283307 100644 --- a/crate-status.md +++ b/crate-status.md @@ -207,9 +207,27 @@ Check out the [performance discussion][git-traverse-performance] as well. * [ ] Some examples ### git-index -* read and write a git-index file - * non-sparse - * sparse (search for [`sparse index` here](https://github.blog/2021-08-16-highlights-from-git-2-33/)) +* read + * [ ] V2 + * [ ] V3 + * [ ] V4 + * optional threading + * [ ] concurrent loading of index extensions + * [ ] threaded cache entry reading +* `stat` update + * [ ] optional threaded `stat` based on thread_cost (aka preload) +* extensions + * [ ] TREE for speeding up tree generation + * [ ] REUC resolving undo + * [ ] UNTR untracked cache + * [ ] FSMN file system monitor cache V1 and V2 + * [ ] EOIE end of index entry + * [ ] IEOT index entry offset table + * [ ] link base indices to take information from, split index + * [ ] sdir sparse directory entries +* additinoal support + * [ ] non-sparse + * [ ] sparse (search for [`sparse index` here](https://github.blog/2021-08-16-highlights-from-git-2-33/)) * add and remove entries * [x] API documentation * [ ] Some examples From 3040857ec4d2e0557b4920eaa77ddc4292d9adae Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 8 Jan 2022 17:52:38 +0800 Subject: [PATCH 02/57] notes on how test indices have been created (#293) Note that in-code we must make sufficiently clear where a particular fixture is coming from, or we name it after the test and file right away. 
--- crate-status.md | 2 ++ etc/check-package-size.sh | 1 + git-index/README.md | 10 ++++++++++ 3 files changed, 13 insertions(+) create mode 100644 git-index/README.md diff --git a/crate-status.md b/crate-status.md index ccebd283307..749dfe8ae55 100644 --- a/crate-status.md +++ b/crate-status.md @@ -216,6 +216,8 @@ Check out the [performance discussion][git-traverse-performance] as well. * [ ] threaded cache entry reading * `stat` update * [ ] optional threaded `stat` based on thread_cost (aka preload) +* [ ] handling of `.gitignore` and system file exclude configuration +* [ ] handle potential races * extensions * [ ] TREE for speeding up tree generation * [ ] REUC resolving undo diff --git a/etc/check-package-size.sh b/etc/check-package-size.sh index eab9d152b6c..6e02a56a902 100755 --- a/etc/check-package-size.sh +++ b/etc/check-package-size.sh @@ -18,6 +18,7 @@ echo "in root: gitoxide CLI" #indent cargo diet -n --package-size-limit 25KB - fails right now because of dotted profile.dev.package (enter cargo-smart-release && indent cargo diet -n --package-size-limit 85KB) (enter git-actor && indent cargo diet -n --package-size-limit 5KB) +(enter git-index && indent cargo diet -n --package-size-limit 5KB) (enter git-tempfile && indent cargo diet -n --package-size-limit 25KB) (enter git-lock && indent cargo diet -n --package-size-limit 15KB) (enter git-config && indent cargo diet -n --package-size-limit 65KB) diff --git a/git-index/README.md b/git-index/README.md new file mode 100644 index 00000000000..95600acb7fb --- /dev/null +++ b/git-index/README.md @@ -0,0 +1,10 @@ + +#### Test fixtures + +Most of the test indices are snatched directly from the unit test suite of `git` itself, usually by running something like the following + +```shell + ./t1700-split-index.sh -r 2 --debug +``` + +Then one finds all test state and the index in particular in `trash directory/t1700-split-index/.git/index`. 
From bce67d8ec58f78a1fce1c76f7b93d9650f9f550e Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 8 Jan 2022 18:28:07 +0800 Subject: [PATCH 03/57] preempt the eventual need for a worktree implementation (#293) It deals with comparing items from the work tree and the index, and is generally what makes use of exclude specifications. --- Cargo.lock | 4 ++++ Cargo.toml | 1 + README.md | 4 ++-- crate-status.md | 5 +++++ git-worktree/Cargo.toml | 15 +++++++++++++++ git-worktree/src/lib.rs | 1 + 6 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 git-worktree/Cargo.toml create mode 100644 git-worktree/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 58079307e12..1125c1335b4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1664,6 +1664,10 @@ dependencies = [ "quick-error", ] +[[package]] +name = "git-worktree" +version = "0.0.0" + [[package]] name = "git2" version = "0.13.25" diff --git a/Cargo.toml b/Cargo.toml index 8eaafb121e5..3f9c7a9e2ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -115,6 +115,7 @@ members = [ "git-diff", "git-traverse", "git-index", + "git-worktree", "git-packetline", "git-transport", "git-protocol", diff --git a/README.md b/README.md index 8fa45c9586d..10199d74fb4 100644 --- a/README.md +++ b/README.md @@ -83,9 +83,9 @@ Follow linked crate name for detailed status. 
Please note that all crates follow * [git-repository](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-repository) * `gitoxide-core` * **very early** -* **idea** * [git-index](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-index) - * git-status +* **idea** + * [git-worktree](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-worktree) * [git-tui](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-tui) * [git-bundle](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-bundle) diff --git a/crate-status.md b/crate-status.md index 749dfe8ae55..2a13e1796ac 100644 --- a/crate-status.md +++ b/crate-status.md @@ -206,6 +206,11 @@ Check out the [performance discussion][git-traverse-performance] as well. * [x] API documentation * [ ] Some examples +### git-worktree +* handle the working tree/checkout +* manage multiple worktrees +* deal with exclude specifications, like .gitignore and other exclude files. + ### git-index * read * [ ] V2 diff --git a/git-worktree/Cargo.toml b/git-worktree/Cargo.toml new file mode 100644 index 00000000000..6aa73e98b5e --- /dev/null +++ b/git-worktree/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "git-worktree" +version = "0.0.0" +repository = "https://github.com/Byron/gitoxide" +license = "MIT/Apache-2.0" +description = "A WIP crate of the gitoxide project dedicated implementing everything around working trees and git excludes" +authors = ["Sebastian Thiel "] +edition = "2018" + +[lib] +doctest = false + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/git-worktree/src/lib.rs b/git-worktree/src/lib.rs new file mode 100644 index 00000000000..d7a83e4f525 --- /dev/null +++ b/git-worktree/src/lib.rs @@ -0,0 +1 @@ +#![forbid(unsafe_code, rust_2018_idioms)] From b3ee7c6f7553de6bff4934bbdf38f6c6ea2cf349 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 8 Jan 2022 18:30:03 +0800 Subject: [PATCH 
04/57] update changelog (#293) --- git-worktree/CHANGELOG.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 git-worktree/CHANGELOG.md diff --git a/git-worktree/CHANGELOG.md b/git-worktree/CHANGELOG.md new file mode 100644 index 00000000000..7d1f8df4b53 --- /dev/null +++ b/git-worktree/CHANGELOG.md @@ -0,0 +1,29 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## Unreleased + +Reserve the name for a necessary crate of the `gitoxide` project. + +### Commit Statistics + + + + - 1 commit contributed to the release. + - 0 commits where understood as [conventional](https://www.conventionalcommits.org). + - 1 unique issue was worked on: [#293](https://github.com/Byron/gitoxide/issues/293) + +### Commit Details + + + +
view details + + * **[#293](https://github.com/Byron/gitoxide/issues/293)** + - preempt the eventual need for a worktree implementation ([`bce67d8`](https://github.com/Byron/gitoxide/commit/bce67d8ec58f78a1fce1c76f7b93d9650f9f550e)) +
+ From ddb1bf49e3b5b663fcf166d8cbce416e78d9fc18 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 8 Jan 2022 18:31:02 +0800 Subject: [PATCH 05/57] Release git-worktree v0.0.0 --- git-worktree/CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/git-worktree/CHANGELOG.md b/git-worktree/CHANGELOG.md index 7d1f8df4b53..e48aefbcad3 100644 --- a/git-worktree/CHANGELOG.md +++ b/git-worktree/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## Unreleased +## 0.0.0 (2022-01-08) Reserve the name for a necessary crate of the `gitoxide` project. @@ -13,7 +13,7 @@ Reserve the name for a necessary crate of the `gitoxide` project. - - 1 commit contributed to the release. + - 2 commits contributed to the release. - 0 commits where understood as [conventional](https://www.conventionalcommits.org). - 1 unique issue was worked on: [#293](https://github.com/Byron/gitoxide/issues/293) @@ -24,6 +24,7 @@ Reserve the name for a necessary crate of the `gitoxide` project.
view details * **[#293](https://github.com/Byron/gitoxide/issues/293)** + - update changelog ([`b3ee7c6`](https://github.com/Byron/gitoxide/commit/b3ee7c6f7553de6bff4934bbdf38f6c6ea2cf349)) - preempt the eventual need for a worktree implementation ([`bce67d8`](https://github.com/Byron/gitoxide/commit/bce67d8ec58f78a1fce1c76f7b93d9650f9f550e))
From aa60fdf3d86e08877c88f9e4973f546642ed1370 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 8 Jan 2022 20:37:11 +0800 Subject: [PATCH 06/57] base setup for index testing (#293) It should be easy enough to learn from git tests to generate whichever kind of index we need. --- Cargo.lock | 5 +++ git-index/Cargo.toml | 6 +++ git-index/src/lib.rs | 49 ++++++++++++++++++++++- git-index/tests/file/mod.rs | 8 ++++ git-index/tests/fixtures/make_index/v2.sh | 9 +++++ git-index/tests/index.rs | 9 +++++ tests/tools/src/lib.rs | 6 ++- 7 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 git-index/tests/file/mod.rs create mode 100644 git-index/tests/fixtures/make_index/v2.sh create mode 100644 git-index/tests/index.rs diff --git a/Cargo.lock b/Cargo.lock index 1125c1335b4..5935089699a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1227,6 +1227,11 @@ dependencies = [ [[package]] name = "git-index" version = "0.0.0" +dependencies = [ + "git-hash 0.8.0", + "git-testtools", + "quick-error", +] [[package]] name = "git-lock" diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index fda89b07c02..9161b45db2a 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -13,3 +13,9 @@ doctest = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +git-hash = { version ="^0.8.0", path = "../git-hash" } + +quick-error = "2.0.0" + +[dev-dependencies] +git-testtools = { path = "../tests/tools"} diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index d7a83e4f525..bf692bb05f2 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -1 +1,48 @@ -#![forbid(unsafe_code, rust_2018_idioms)] +#![forbid(unsafe_code)] +#![deny(rust_2018_idioms)] +#![allow(missing_docs)] + +use std::path::PathBuf; + +pub mod file { + pub mod init { + #![allow(unused)] + use crate::File; + use std::path::Path; + + impl File { + pub fn at(path: impl AsRef, object_hash: git_hash::Kind) -> std::io::Result { + 
todo!("read file") + } + } + } +} +pub mod init { + use crate::State; + + impl State { + /// Returns an empty state. + /// TODO: figure out if it needs to know some configuration + pub fn new() -> Self { + State + } + } + + impl Default for State { + fn default() -> Self { + State::new() + } + } +} + +/// An index file whose state was read from a file on disk. +pub struct File { + pub state: State, + pub path: PathBuf, +} + +/// An in-memory cache of a fully parsed git index file. +/// +/// As opposed to a snapshot, it's meant to be altered and eventually be written back to disk or converted into a tree. +/// We treat index and its state synonymous. +pub struct State; diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs new file mode 100644 index 00000000000..7aae4332c2b --- /dev/null +++ b/git-index/tests/file/mod.rs @@ -0,0 +1,8 @@ +mod init { + + #[test] + #[ignore] + fn v2() { + let _file = git_index::File::at(crate::index_fixture_path("v2"), git_hash::Kind::Sha1).unwrap(); + } +} diff --git a/git-index/tests/fixtures/make_index/v2.sh b/git-index/tests/fixtures/make_index/v2.sh new file mode 100644 index 00000000000..56cce83c35c --- /dev/null +++ b/git-index/tests/fixtures/make_index/v2.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -eu -o pipefail + +GIT_INDEX_VERSION=2 git init -q +git config commit.gpgsign false + +touch a +git add a +git commit -m "empty" diff --git a/git-index/tests/index.rs b/git-index/tests/index.rs new file mode 100644 index 00000000000..4e1bf722477 --- /dev/null +++ b/git-index/tests/index.rs @@ -0,0 +1,9 @@ +use std::path::{Path, PathBuf}; + +mod file; + +pub fn index_fixture_path(name: &str) -> PathBuf { + let dir = git_testtools::scripted_fixture_repo_read_only(Path::new("make_index").join(name).with_extension("sh")) + .expect("script works"); + dir.join(".git").join("index") +} diff --git a/tests/tools/src/lib.rs b/tests/tools/src/lib.rs index 78ef564b750..f60ecad5b37 100644 --- a/tests/tools/src/lib.rs +++ 
b/tests/tools/src/lib.rs @@ -19,7 +19,9 @@ pub fn hex_to_id(hex: &str) -> git_hash::ObjectId { pub fn fixture_path(path: impl AsRef) -> PathBuf { PathBuf::from("tests").join("fixtures").join(path.as_ref()) } -pub fn scripted_fixture_repo_read_only(script_name: &str) -> std::result::Result> { +pub fn scripted_fixture_repo_read_only( + script_name: impl AsRef, +) -> std::result::Result> { scripted_fixture_repo_read_only_with_args(script_name, None) } @@ -59,7 +61,7 @@ pub fn copy_recursively_into_existing_dir(src_dir: impl AsRef, dst_dir: im /// Returns the directory at which the data is present pub fn scripted_fixture_repo_read_only_with_args( - script_name: &str, + script_name: impl AsRef, args: impl IntoIterator, ) -> std::result::Result> { let script_path = fixture_path(script_name); From b481f136c4084b8839ebecb604dea5aa30d3a44e Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 09:35:42 +0800 Subject: [PATCH 07/57] The realization that FileBuffer really shouldn't be used anymore (#293) --- Cargo.lock | 1 + git-index/Cargo.toml | 1 + git-index/README.md | 3 ++- git-index/src/lib.rs | 21 ++++++++++++++++++++- git-index/tests/file/mod.rs | 7 +++++-- 5 files changed, 29 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5935089699a..57a96818a7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1228,6 +1228,7 @@ dependencies = [ name = "git-index" version = "0.0.0" dependencies = [ + "filebuffer", "git-hash 0.8.0", "git-testtools", "quick-error", diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index 9161b45db2a..5c98c218dc2 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -16,6 +16,7 @@ doctest = false git-hash = { version ="^0.8.0", path = "../git-hash" } quick-error = "2.0.0" +filebuffer = "0.4.0" [dev-dependencies] git-testtools = { path = "../tests/tools"} diff --git a/git-index/README.md b/git-index/README.md index 95600acb7fb..f239be693c0 100644 --- a/git-index/README.md +++ b/git-index/README.md @@ 
-7,4 +7,5 @@ Most of the test indices are snatched directly from the unit test suite of `git` ./t1700-split-index.sh -r 2 --debug ``` -Then one finds all test state and the index in particular in `trash directory/t1700-split-index/.git/index`. +Then one finds all test state and the index in particular in `trash directory/t1700-split-index/.git/index` and can possibly copy it over and use as fixture. +The preferred way is to find a test of interest, and use its setup code within one of our own fixture scripts that are executed once to generate the file of interest. diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index bf692bb05f2..baa76536139 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -8,10 +8,29 @@ pub mod file { pub mod init { #![allow(unused)] use crate::File; + use filebuffer::FileBuffer; use std::path::Path; + mod error { + use quick_error::quick_error; + + quick_error! { + #[derive(Debug)] + pub enum Error { + Io(err: std::io::Error) { + display("An IO error occurred while reading the index") + source(err) + from() + } + } + } + } + pub use error::Error; + impl File { - pub fn at(path: impl AsRef, object_hash: git_hash::Kind) -> std::io::Result { + pub fn at(path: impl AsRef, object_hash: git_hash::Kind) -> Result { + let data = FileBuffer::open(path)?; + todo!("read file") } } diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index 7aae4332c2b..be7187c4cb8 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -1,8 +1,11 @@ mod init { + fn file(name: &str) -> git_index::File { + git_index::File::at(crate::index_fixture_path(name), git_hash::Kind::Sha1).unwrap() + } #[test] #[ignore] - fn v2() { - let _file = git_index::File::at(crate::index_fixture_path("v2"), git_hash::Kind::Sha1).unwrap(); + fn read_v2() { + let _file = file("v2"); } } From 4dec3ead7c28e88de1eb8e1576b9b29b1c0953c7 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 09:49:05 +0800 Subject: [PATCH 
08/57] git-ref uses memmap2 (#293) --- Cargo.lock | 11 ++++++++++- git-ref/Cargo.toml | 2 +- git-ref/src/lib.rs | 3 +-- git-ref/src/store/packed/buffer.rs | 10 ++++++++-- git-ref/src/store/packed/mod.rs | 4 ++-- 5 files changed, 22 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 57a96818a7e..7f5f567983d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1474,7 +1474,6 @@ dependencies = [ name = "git-ref" version = "0.10.0" dependencies = [ - "filebuffer", "git-actor 0.7.0", "git-features 0.18.0", "git-hash 0.8.0", @@ -1484,6 +1483,7 @@ dependencies = [ "git-tempfile 1.0.3", "git-testtools", "git-validate 0.5.3", + "memmap2", "nom", "os_str_bytes 6.0.0", "quick-error", @@ -2077,6 +2077,15 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +[[package]] +name = "memmap2" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4647a11b578fead29cdbb34d4adef8dd3dc35b876c9c6d5240d83f205abfe96e" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.6.5" diff --git a/git-ref/Cargo.toml b/git-ref/Cargo.toml index 97e7621496a..979ec377091 100644 --- a/git-ref/Cargo.toml +++ b/git-ref/Cargo.toml @@ -38,7 +38,7 @@ serde = { version = "1.0.114", optional = true, default-features = false, featur os_str_bytes = "6.0.0" # packed refs -filebuffer = "0.4.0" +memmap2 = "0.5.0" [dev-dependencies] git-testtools = { path = "../tests/tools" } diff --git a/git-ref/src/lib.rs b/git-ref/src/lib.rs index da554a12b15..c19e75cc355 100644 --- a/git-ref/src/lib.rs +++ b/git-ref/src/lib.rs @@ -16,8 +16,7 @@ //! * references are stored in a single human-readable file, along with their targets if they are symbolic. //! * **ref-table** //! * supersedes all of the above to allow handling hundreds of thousands of references. 
-#![forbid(unsafe_code)] -#![deny(missing_docs, rust_2018_idioms)] +#![deny(unsafe_code, missing_docs, rust_2018_idioms)] use std::borrow::Cow; diff --git a/git-ref/src/store/packed/buffer.rs b/git-ref/src/store/packed/buffer.rs index 8ff97f1f174..912ebe85747 100644 --- a/git-ref/src/store/packed/buffer.rs +++ b/git-ref/src/store/packed/buffer.rs @@ -19,7 +19,7 @@ impl AsRef<[u8]> for packed::Backing { pub mod open { use std::path::PathBuf; - use filebuffer::FileBuffer; + use memmap2::Mmap; use crate::store_impl::packed; @@ -35,7 +35,13 @@ pub mod open { let backing = if std::fs::metadata(&path)?.len() <= use_memory_map_if_larger_than_bytes { packed::Backing::InMemory(std::fs::read(&path)?) } else { - packed::Backing::Mapped(FileBuffer::open(&path)?) + packed::Backing::Mapped( + // SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file. + #[allow(unsafe_code)] + unsafe { + Mmap::map(&std::fs::File::open(&path)?)? + }, + ) }; let (offset, sorted) = { diff --git a/git-ref/src/store/packed/mod.rs b/git-ref/src/store/packed/mod.rs index db775e5f6ce..45d2d925e59 100644 --- a/git-ref/src/store/packed/mod.rs +++ b/git-ref/src/store/packed/mod.rs @@ -1,9 +1,9 @@ use std::path::PathBuf; -use filebuffer::FileBuffer; use git_features::threading::OwnShared; use git_hash::ObjectId; use git_object::bstr::{BStr, BString}; +use memmap2::Mmap; use crate::{transaction::RefEdit, FullNameRef}; @@ -12,7 +12,7 @@ enum Backing { /// The buffer is loaded entirely in memory, along with the `offset` to the first record past the header. InMemory(Vec), /// The buffer is mapping the file on disk, along with the offset to the first record past the header - Mapped(FileBuffer), + Mapped(Mmap), } /// A buffer containing a packed-ref file that is either memory mapped or fully in-memory depending on a cutoff. 
From 0c946f5cb9d6eb13615b6c3d1a7b479ab5874441 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 09:53:06 +0800 Subject: [PATCH 09/57] use memmap2 in git-commitgraph (#293) --- Cargo.lock | 2 +- git-commitgraph/Cargo.toml | 2 +- git-commitgraph/src/file/init.rs | 18 +++++++++++++----- git-commitgraph/src/file/mod.rs | 4 ++-- git-commitgraph/src/lib.rs | 3 +-- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7f5f567983d..55f50f885b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1108,11 +1108,11 @@ version = "0.7.0" dependencies = [ "bstr", "byteorder", - "filebuffer", "git-chunk", "git-features 0.18.0", "git-hash 0.8.0", "git-testtools", + "memmap2", "serde", "thiserror", ] diff --git a/git-commitgraph/Cargo.toml b/git-commitgraph/Cargo.toml index c31ec7c0bbe..cdd6e394c11 100644 --- a/git-commitgraph/Cargo.toml +++ b/git-commitgraph/Cargo.toml @@ -22,7 +22,7 @@ git-chunk = { version ="^0.2.0", path = "../git-chunk" } bstr = { version = "0.2.13", default-features = false, features = ["std"] } byteorder = "1.2.3" -filebuffer = "0.4.0" +memmap2 = "0.5.0" serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } thiserror = "1.0.26" diff --git a/git-commitgraph/src/file/init.rs b/git-commitgraph/src/file/init.rs index 5d9c03de9f5..40104782888 100644 --- a/git-commitgraph/src/file/init.rs +++ b/git-commitgraph/src/file/init.rs @@ -5,7 +5,7 @@ use std::{ use bstr::ByteSlice; use byteorder::{BigEndian, ByteOrder}; -use filebuffer::FileBuffer; +use memmap2::Mmap; use crate::file::{ ChunkId, File, BASE_GRAPHS_LIST_CHUNK_ID, COMMIT_DATA_CHUNK_ID, COMMIT_DATA_ENTRY_SIZE_SANS_HASH, @@ -66,10 +66,18 @@ impl TryFrom<&Path> for File { type Error = Error; fn try_from(path: &Path) -> Result { - let data = FileBuffer::open(path).map_err(|e| Error::Io { - err: e, - path: path.to_owned(), - })?; + let data = std::fs::File::open(path) + .and_then(|file| { + // SAFETY: we have to take 
the risk of somebody changing the file underneath. Git never writes into the same file. + #[allow(unsafe_code)] + unsafe { + Mmap::map(&file) + } + }) + .map_err(|e| Error::Io { + err: e, + path: path.to_owned(), + })?; let data_size = data.len(); if data_size < MIN_FILE_SIZE { return Err(Error::Corrupt( diff --git a/git-commitgraph/src/file/mod.rs b/git-commitgraph/src/file/mod.rs index 617f5cf0a0a..861ead20cbf 100644 --- a/git-commitgraph/src/file/mod.rs +++ b/git-commitgraph/src/file/mod.rs @@ -6,7 +6,7 @@ use std::{ path::PathBuf, }; -use filebuffer::FileBuffer; +use memmap2::Mmap; pub use self::{commit::Commit, init::Error}; @@ -42,7 +42,7 @@ pub struct File { base_graph_count: u8, base_graphs_list_offset: Option, commit_data_offset: usize, - data: FileBuffer, + data: Mmap, extra_edges_list_range: Option>, fan: [u32; FAN_LEN], oid_lookup_offset: usize, diff --git a/git-commitgraph/src/lib.rs b/git-commitgraph/src/lib.rs index cd516a3f507..b943050f22c 100644 --- a/git-commitgraph/src/lib.rs +++ b/git-commitgraph/src/lib.rs @@ -7,8 +7,7 @@ //! As generating the full commit graph from scratch can take some time, git may write new commits //! to separate [files][file::File] instead of overwriting the original file. //! Eventually, git will merge these files together as the number of files grows. 
-#![forbid(unsafe_code)] -#![deny(rust_2018_idioms, missing_docs)] +#![deny(unsafe_code, rust_2018_idioms, missing_docs)] pub mod file; pub mod graph; From fbfea28d2c9ed92e270c6a5aa603d3c84769ae8f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 09:55:24 +0800 Subject: [PATCH 10/57] git-index uses memmap2 (#293) --- Cargo.lock | 2 +- git-index/Cargo.toml | 2 +- git-index/src/lib.rs | 9 +++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 55f50f885b9..5ebc25c37c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1228,9 +1228,9 @@ dependencies = [ name = "git-index" version = "0.0.0" dependencies = [ - "filebuffer", "git-hash 0.8.0", "git-testtools", + "memmap2", "quick-error", ] diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index 5c98c218dc2..2c922a4d3db 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -16,7 +16,7 @@ doctest = false git-hash = { version ="^0.8.0", path = "../git-hash" } quick-error = "2.0.0" -filebuffer = "0.4.0" +memmap2 = "0.5.0" [dev-dependencies] git-testtools = { path = "../tests/tools"} diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index baa76536139..6f4d56905a4 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -1,5 +1,4 @@ -#![forbid(unsafe_code)] -#![deny(rust_2018_idioms)] +#![deny(unsafe_code, missing_docs, rust_2018_idioms)] #![allow(missing_docs)] use std::path::PathBuf; @@ -8,7 +7,7 @@ pub mod file { pub mod init { #![allow(unused)] use crate::File; - use filebuffer::FileBuffer; + use memmap2::Mmap; use std::path::Path; mod error { @@ -29,7 +28,9 @@ pub mod file { impl File { pub fn at(path: impl AsRef, object_hash: git_hash::Kind) -> Result { - let data = FileBuffer::open(path)?; + // SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file. + #[allow(unsafe_code)] + let data = unsafe { Mmap::map(&std::fs::File::open(path)?)? 
}; todo!("read file") } From d9011c71048ff34201917b0693586290c23b3ddf Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 10:18:45 +0800 Subject: [PATCH 11/57] git-pack uses `memmap2` instead of `filebuffer` (#293) --- Cargo.lock | 2 +- git-pack/Cargo.toml | 2 +- git-pack/src/bundle/write/mod.rs | 5 +++-- git-pack/src/data/file/init.rs | 4 +--- git-pack/src/data/mod.rs | 4 ++-- git-pack/src/index/init.rs | 3 +-- git-pack/src/index/mod.rs | 4 ++-- git-pack/src/lib.rs | 13 +++++++++++++ git-pack/src/multi_index/init.rs | 3 +-- git-pack/src/multi_index/mod.rs | 4 ++-- git-pack/tests/pack/index.rs | 2 +- 11 files changed, 28 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ebc25c37c4..4f2cd66f337 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1361,7 +1361,6 @@ dependencies = [ "clru", "common_macros", "dashmap 5.0.0", - "filebuffer", "git-chunk", "git-diff 0.12.0", "git-features 0.18.0", @@ -1371,6 +1370,7 @@ dependencies = [ "git-tempfile 1.0.3", "git-testtools", "git-traverse 0.11.0", + "memmap2", "os_str_bytes 6.0.0", "parking_lot", "serde", diff --git a/git-pack/Cargo.toml b/git-pack/Cargo.toml index b5cc1c26181..89368d46e28 100644 --- a/git-pack/Cargo.toml +++ b/git-pack/Cargo.toml @@ -42,7 +42,7 @@ git-diff = { version ="^0.12.0", path = "../git-diff" } git-tempfile = { version ="^1.0.0", path = "../git-tempfile" } smallvec = "1.3.0" -filebuffer = "0.4.0" +memmap2 = "0.5.0" byteorder = "1.2.3" serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } bytesize = "1.0.1" diff --git a/git-pack/src/bundle/write/mod.rs b/git-pack/src/bundle/write/mod.rs index 8bdd6b7ac61..1abbd43f7fd 100644 --- a/git-pack/src/bundle/write/mod.rs +++ b/git-pack/src/bundle/write/mod.rs @@ -4,7 +4,6 @@ use std::{ sync::{atomic::AtomicBool, Arc}, }; -use filebuffer::FileBuffer; use git_features::{interrupt, progress, progress::Progress}; use git_tempfile::{handle::Writable, AutoRemove, ContainingDirectory}; 
@@ -295,7 +294,9 @@ impl crate::Bundle { fn new_pack_file_resolver( data_file: Arc>>, ) -> io::Result) -> Option<()> + Send + Clone> { - let mapped_file = Arc::new(FileBuffer::open(data_file.lock().with_mut(|f| f.path().to_owned())?)?); + let mapped_file = Arc::new(crate::mmap::read_only( + &data_file.lock().with_mut(|f| f.path().to_owned())?, + )?); let pack_data_lookup = move |range: std::ops::Range, out: &mut Vec| -> Option<()> { mapped_file .get(range.start as usize..range.end as usize) diff --git a/git-pack/src/data/file/init.rs b/git-pack/src/data/file/init.rs index ca7e3af5c86..390ad255f44 100644 --- a/git-pack/src/data/file/init.rs +++ b/git-pack/src/data/file/init.rs @@ -1,7 +1,5 @@ use std::{convert::TryInto, path::Path}; -use filebuffer::FileBuffer; - use crate::data; /// Instantiation @@ -18,7 +16,7 @@ impl data::File { use crate::data::header::N32_SIZE; let hash_len = object_hash.len_in_bytes(); - let data = FileBuffer::open(path).map_err(|e| data::header::decode::Error::Io { + let data = crate::mmap::read_only(path).map_err(|e| data::header::decode::Error::Io { source: e, path: path.to_owned(), })?; diff --git a/git-pack/src/data/mod.rs b/git-pack/src/data/mod.rs index 56d1adf0a03..d20a5538c65 100644 --- a/git-pack/src/data/mod.rs +++ b/git-pack/src/data/mod.rs @@ -7,7 +7,7 @@ pub type Offset = u64; /// An identifier to uniquely identify all packs loaded within a known context or namespace. pub type Id = u32; -use filebuffer::FileBuffer; +use memmap2::Mmap; /// An representing an full- or delta-object within a pack #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] @@ -62,7 +62,7 @@ impl Default for Version { /// A pack data file pub struct File { - data: FileBuffer, + data: Mmap, path: std::path::PathBuf, /// A value to represent this pack uniquely when used with cache lookup, or a way to identify this pack by its location on disk. /// The same location on disk should yield the same id. 
diff --git a/git-pack/src/index/init.rs b/git-pack/src/index/init.rs index 8f55c90f0ed..ebc9656e96a 100644 --- a/git-pack/src/index/init.rs +++ b/git-pack/src/index/init.rs @@ -1,7 +1,6 @@ use std::{mem::size_of, path::Path}; use byteorder::{BigEndian, ByteOrder}; -use filebuffer::FileBuffer; use crate::index::{self, Version, FAN_LEN, V2_SIGNATURE}; @@ -33,7 +32,7 @@ impl index::File { } fn at_inner(path: &Path, object_hash: git_hash::Kind) -> Result { - let data = FileBuffer::open(&path).map_err(|source| Error::Io { + let data = crate::mmap::read_only(&path).map_err(|source| Error::Io { source, path: path.to_owned(), })?; diff --git a/git-pack/src/index/mod.rs b/git-pack/src/index/mod.rs index d1cc1b749b5..0ae786f22ed 100644 --- a/git-pack/src/index/mod.rs +++ b/git-pack/src/index/mod.rs @@ -73,7 +73,7 @@ macro_rules! izip { }; } -use filebuffer::FileBuffer; +use memmap2::Mmap; /// The version of an index file #[derive(PartialEq, Eq, Ord, PartialOrd, Debug, Hash, Clone, Copy)] @@ -106,7 +106,7 @@ const FAN_LEN: usize = 256; /// A representation of a pack index file pub struct File { - data: FileBuffer, + data: Mmap, path: std::path::PathBuf, version: Version, num_objects: u32, diff --git a/git-pack/src/lib.rs b/git-pack/src/lib.rs index 4b81c8e6460..a2b7d8fa0fb 100755 --- a/git-pack/src/lib.rs +++ b/git-pack/src/lib.rs @@ -41,3 +41,16 @@ pub mod multi_index; /// pub mod verify; + +mod mmap { + use std::path::Path; + + pub fn read_only(path: &Path) -> std::io::Result { + let file = std::fs::File::open(path)?; + // SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file. 
+ #[allow(unsafe_code)] + unsafe { + memmap2::Mmap::map(&file) + } + } +} diff --git a/git-pack/src/multi_index/init.rs b/git-pack/src/multi_index/init.rs index c3ad6445b30..871ddfec2fd 100644 --- a/git-pack/src/multi_index/init.rs +++ b/git-pack/src/multi_index/init.rs @@ -1,7 +1,6 @@ use std::{convert::TryFrom, path::Path}; use byteorder::{BigEndian, ByteOrder}; -use filebuffer::FileBuffer; use crate::multi_index::{chunk, File, Version}; @@ -52,7 +51,7 @@ impl TryFrom<&Path> for File { type Error = Error; fn try_from(path: &Path) -> Result { - let data = FileBuffer::open(path).map_err(|source| Error::Io { + let data = crate::mmap::read_only(path).map_err(|source| Error::Io { source, path: path.to_owned(), })?; diff --git a/git-pack/src/multi_index/mod.rs b/git-pack/src/multi_index/mod.rs index 31532b61dbe..ac84ea95b30 100644 --- a/git-pack/src/multi_index/mod.rs +++ b/git-pack/src/multi_index/mod.rs @@ -1,6 +1,6 @@ use std::path::PathBuf; -use filebuffer::FileBuffer; +use memmap2::Mmap; /// Known multi-index file versions #[derive(PartialEq, Eq, Ord, PartialOrd, Debug, Hash, Clone, Copy)] @@ -25,7 +25,7 @@ pub type EntryIndex = u32; /// A representation of an index file for multiple packs at the same time, typically stored in a file /// named 'multi-pack-index'. 
pub struct File { - data: FileBuffer, + data: Mmap, path: std::path::PathBuf, version: Version, hash_len: usize, diff --git a/git-pack/tests/pack/index.rs b/git-pack/tests/pack/index.rs index 94fde45d56e..f732953d4e4 100644 --- a/git-pack/tests/pack/index.rs +++ b/git-pack/tests/pack/index.rs @@ -76,10 +76,10 @@ mod file { mod any { use std::{fs, io, sync::atomic::AtomicBool}; - use filebuffer::FileBuffer; use git_features::progress; use git_odb::pack; use git_pack::data::{input, EntryRange}; + use memmap2::Mmap; use crate::{fixture_path, pack::V2_PACKS_AND_INDICES}; From 5a68d2feffc551ad5f07e90efb2307e966d2636b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 10:19:51 +0800 Subject: [PATCH 12/57] thanks clippy --- git-pack/src/index/init.rs | 2 +- git-pack/tests/pack/index.rs | 7 ++++--- gitoxide-core/src/repository.rs | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/git-pack/src/index/init.rs b/git-pack/src/index/init.rs index ebc9656e96a..4ab959c71ef 100644 --- a/git-pack/src/index/init.rs +++ b/git-pack/src/index/init.rs @@ -32,7 +32,7 @@ impl index::File { } fn at_inner(path: &Path, object_hash: git_hash::Kind) -> Result { - let data = crate::mmap::read_only(&path).map_err(|source| Error::Io { + let data = crate::mmap::read_only(path).map_err(|source| Error::Io { source, path: path.to_owned(), })?; diff --git a/git-pack/tests/pack/index.rs b/git-pack/tests/pack/index.rs index f732953d4e4..441b5f9fe88 100644 --- a/git-pack/tests/pack/index.rs +++ b/git-pack/tests/pack/index.rs @@ -79,7 +79,6 @@ mod file { use git_features::progress; use git_odb::pack; use git_pack::data::{input, EntryRange}; - use memmap2::Mmap; use crate::{fixture_path, pack::V2_PACKS_AND_INDICES}; @@ -89,8 +88,10 @@ mod file { for compressed in &[input::EntryDataMode::Crc32, input::EntryDataMode::KeepAndCrc32] { for (index_path, data_path) in V2_PACKS_AND_INDICES { let resolve = { - let buf = - 
git_features::threading::OwnShared::new(FileBuffer::open(fixture_path(data_path))?); + let buf = git_features::threading::OwnShared::new({ + let file = std::fs::File::open(fixture_path(data_path))?; + unsafe { memmap2::Mmap::map(&file)? } + }); move |entry: EntryRange, out: &mut Vec| { buf.get(entry.start as usize..entry.end as usize) .map(|slice| out.copy_from_slice(slice)) diff --git a/gitoxide-core/src/repository.rs b/gitoxide-core/src/repository.rs index 15d93d3d0da..fe1bfd9028b 100644 --- a/gitoxide-core/src/repository.rs +++ b/gitoxide-core/src/repository.rs @@ -41,6 +41,7 @@ pub mod verify { }: Context, ) -> anyhow::Result<()> { let repo = git_repository::open(repo)?; + #[cfg_attr(not(feature = "serde1"), allow(unused))] let outcome = repo.objects.verify_integrity( progress, should_interrupt, From 494ed46acc54bd342f891416918032a2c4848cf1 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 11:39:08 +0800 Subject: [PATCH 13/57] refactor (#293) --- git-index/src/file.rs | 32 ++++++++++++++++++++++++++++++++ git-index/src/lib.rs | 34 +--------------------------------- 2 files changed, 33 insertions(+), 33 deletions(-) create mode 100644 git-index/src/file.rs diff --git a/git-index/src/file.rs b/git-index/src/file.rs new file mode 100644 index 00000000000..16cbc950c94 --- /dev/null +++ b/git-index/src/file.rs @@ -0,0 +1,32 @@ +pub mod init { + #![allow(unused)] + use crate::File; + use memmap2::Mmap; + use std::path::Path; + + mod error { + use quick_error::quick_error; + + quick_error! { + #[derive(Debug)] + pub enum Error { + Io(err: std::io::Error) { + display("An IO error occurred while reading the index") + source(err) + from() + } + } + } + } + pub use error::Error; + + impl File { + pub fn at(path: impl AsRef, object_hash: git_hash::Kind) -> Result { + // SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file. 
+ #[allow(unsafe_code)] + let data = unsafe { Mmap::map(&std::fs::File::open(path)?)? }; + + todo!("read file") + } + } +} diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index 6f4d56905a4..1b70af808d1 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -3,40 +3,8 @@ use std::path::PathBuf; -pub mod file { - pub mod init { - #![allow(unused)] - use crate::File; - use memmap2::Mmap; - use std::path::Path; +pub mod file; - mod error { - use quick_error::quick_error; - - quick_error! { - #[derive(Debug)] - pub enum Error { - Io(err: std::io::Error) { - display("An IO error occurred while reading the index") - source(err) - from() - } - } - } - } - pub use error::Error; - - impl File { - pub fn at(path: impl AsRef, object_hash: git_hash::Kind) -> Result { - // SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file. - #[allow(unsafe_code)] - let data = unsafe { Mmap::map(&std::fs::File::open(path)?)? }; - - todo!("read file") - } - } - } -} pub mod init { use crate::State; From 826ca0c6a6801ec2a67ca73ac17092e5f85fe9ce Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 14:16:42 +0800 Subject: [PATCH 14/57] first stab at basic index file parsing (#293) --- Cargo.lock | 2 ++ Makefile | 1 + git-index/Cargo.toml | 6 +++++ git-index/src/file.rs | 48 ++++++++++++++++++++++++++++++------- git-index/src/lib.rs | 30 +++++++++++++++++++---- git-index/tests/file/mod.rs | 1 - 6 files changed, 74 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f2cd66f337..c8c5364f46c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1228,10 +1228,12 @@ dependencies = [ name = "git-index" version = "0.0.0" dependencies = [ + "filetime", "git-hash 0.8.0", "git-testtools", "memmap2", "quick-error", + "serde", ] [[package]] diff --git a/Makefile b/Makefile index fc7ce20cea0..4b158201ff9 100644 --- a/Makefile +++ b/Makefile @@ -82,6 +82,7 @@ check: ## Build all code in suitable 
configurations && cargo check cd git-object && cargo check --all-features \ && cargo check --features verbose-object-parsing-errors + cd git-index && cargo check --features serde1 cd git-actor && cargo check --features serde1 cd git-pack && cargo check --features serde1 \ && cargo check --features pack-cache-lru-static \ diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index 2c922a4d3db..dd18d1d867a 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -10,6 +10,9 @@ edition = "2018" [lib] doctest = false +[features] +serde1 = ["serde"] + # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] @@ -17,6 +20,9 @@ git-hash = { version ="^0.8.0", path = "../git-hash" } quick-error = "2.0.0" memmap2 = "0.5.0" +filetime = "0.2.15" + +serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } [dev-dependencies] git-testtools = { path = "../tests/tools"} diff --git a/git-index/src/file.rs b/git-index/src/file.rs index 16cbc950c94..79dd0572fe3 100644 --- a/git-index/src/file.rs +++ b/git-index/src/file.rs @@ -1,8 +1,8 @@ pub mod init { #![allow(unused)] - use crate::File; + use crate::{File, State}; use memmap2::Mmap; - use std::path::Path; + use std::path::{Path, PathBuf}; mod error { use quick_error::quick_error; @@ -11,7 +11,7 @@ pub mod init { #[derive(Debug)] pub enum Error { Io(err: std::io::Error) { - display("An IO error occurred while reading the index") + display("An IO error occurred while opening the index") source(err) from() } @@ -21,12 +21,44 @@ pub mod init { pub use error::Error; impl File { - pub fn at(path: impl AsRef, object_hash: git_hash::Kind) -> Result { - // SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file. - #[allow(unsafe_code)] - let data = unsafe { Mmap::map(&std::fs::File::open(path)?)? 
}; + pub fn at(path: impl Into, object_hash: git_hash::Kind) -> Result { + let path = path.into(); + let (data, mtime) = { + // SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file. + let file = std::fs::File::open(&path)?; + #[allow(unsafe_code)] + let data = unsafe { Mmap::map(&file)? }; + (data, filetime::FileTime::from_last_modification_time(&file.metadata()?)) + }; - todo!("read file") + Ok(File { + state: State { timestamp: mtime }, + path, + }) } } } + +pub mod decode { + pub mod header { + mod error { + use quick_error::quick_error; + + quick_error! { + #[derive(Debug)] + pub enum Error { + Io(err: std::io::Error) { + display("An IO error occurred while opening the index") + source(err) + from() + } + } + } + } + pub use error::Error; + } + + fn header(data: &[u8]) -> Result<(crate::Version, &[u8]), header::Error> { + todo!("header parsing") + } +} diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index 1b70af808d1..e5b6c0ed8f3 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -1,18 +1,22 @@ #![deny(unsafe_code, missing_docs, rust_2018_idioms)] -#![allow(missing_docs)] +#![allow(missing_docs, unused)] +use filetime::FileTime; use std::path::PathBuf; pub mod file; pub mod init { use crate::State; + use filetime::FileTime; impl State { /// Returns an empty state. - /// TODO: figure out if it needs to know some configuration - pub fn new() -> Self { - State + /// TODO: figure out if it needs to know some configuration, and if this would actually be used somewhere + fn new() -> Self { + State { + timestamp: FileTime::from_system_time(std::time::SystemTime::UNIX_EPOCH), + } } } @@ -23,6 +27,16 @@ pub mod init { } } +/// All known versions of a git index file. 
+#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +#[allow(missing_docs)] +pub enum Version { + V2 = 2, + V3 = 3, + V4 = 4, +} + /// An index file whose state was read from a file on disk. pub struct File { pub state: State, @@ -33,4 +47,10 @@ pub struct File { /// /// As opposed to a snapshot, it's meant to be altered and eventually be written back to disk or converted into a tree. /// We treat index and its state synonymous. -pub struct State; +pub struct State { + /// The time at which the state was created, indicating its freshness compared to other files on disk. + /// + /// Note that on platforms that only have a precisions of a second for this time, we will treat all entries with the + /// same timestamp as this as potentially changed, checking more thoroughly if a change actually happened. + timestamp: FileTime, +} diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index be7187c4cb8..53bb122796f 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -4,7 +4,6 @@ mod init { } #[test] - #[ignore] fn read_v2() { let _file = file("v2"); } From c5268115d9193ba2e309a943ee1d3c9e5825562c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 16:14:49 +0800 Subject: [PATCH 15/57] remove byteorder dependency from git-commitgraph (#293) This is now sufficiently well implemented in the standard library. 
--- Cargo.lock | 1 - git-commitgraph/Cargo.toml | 1 - git-commitgraph/src/file/commit.rs | 18 +++++++++++------- git-commitgraph/src/file/init.rs | 3 +-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c8c5364f46c..4e24ad17f36 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1107,7 +1107,6 @@ name = "git-commitgraph" version = "0.7.0" dependencies = [ "bstr", - "byteorder", "git-chunk", "git-features 0.18.0", "git-hash 0.8.0", diff --git a/git-commitgraph/Cargo.toml b/git-commitgraph/Cargo.toml index cdd6e394c11..5ba7a3f06b8 100644 --- a/git-commitgraph/Cargo.toml +++ b/git-commitgraph/Cargo.toml @@ -21,7 +21,6 @@ git-hash = { version ="^0.8.0", path = "../git-hash" } git-chunk = { version ="^0.2.0", path = "../git-chunk" } bstr = { version = "0.2.13", default-features = false, features = ["std"] } -byteorder = "1.2.3" memmap2 = "0.5.0" serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } thiserror = "1.0.26" diff --git a/git-commitgraph/src/file/commit.rs b/git-commitgraph/src/file/commit.rs index 8e89b4b030d..10e45dc12ab 100644 --- a/git-commitgraph/src/file/commit.rs +++ b/git-commitgraph/src/file/commit.rs @@ -5,8 +5,6 @@ use std::{ slice::Chunks, }; -use byteorder::{BigEndian, ByteOrder}; - use crate::{ file::{self, File, EXTENDED_EDGES_MASK, LAST_EXTENDED_EDGE_MASK, NO_PARENT}, graph, @@ -38,6 +36,11 @@ pub struct Commit<'a> { root_tree_id: &'a git_hash::oid, } +#[inline] +fn read_u32(b: &[u8]) -> u32 { + u32::from_be_bytes(b.try_into().unwrap()) +} + impl<'a> Commit<'a> { pub(crate) fn new(file: &'a File, pos: file::Position) -> Self { let bytes = file.commit_data_bytes(pos); @@ -45,10 +48,11 @@ impl<'a> Commit<'a> { file, pos, root_tree_id: git_hash::oid::from_bytes_unchecked(&bytes[..file.hash_len]), - parent1: ParentEdge::from_raw(BigEndian::read_u32(&bytes[file.hash_len..][..4])), - parent2: ParentEdge::from_raw(BigEndian::read_u32(&bytes[file.hash_len + 4..][..4])), - 
generation: BigEndian::read_u32(&bytes[file.hash_len + 8..][..4]) >> 2, - commit_timestamp: BigEndian::read_u64(&bytes[file.hash_len + 8..][..8]) & 0x0003_ffff_ffff, + parent1: ParentEdge::from_raw(read_u32(&bytes[file.hash_len..][..4])), + parent2: ParentEdge::from_raw(read_u32(&bytes[file.hash_len + 4..][..4])), + generation: read_u32(&bytes[file.hash_len + 8..][..4]) >> 2, + commit_timestamp: u64::from_be_bytes(bytes[file.hash_len + 8..][..8].try_into().unwrap()) + & 0x0003_ffff_ffff, } } @@ -173,7 +177,7 @@ impl<'a> Iterator for ParentIterator<'a> { }, ParentIteratorState::Extra(mut chunks) => { if let Some(chunk) = chunks.next() { - let extra_edge = BigEndian::read_u32(chunk); + let extra_edge = read_u32(chunk); match ExtraEdge::from_raw(extra_edge) { ExtraEdge::Internal(pos) => { self.state = ParentIteratorState::Extra(chunks); diff --git a/git-commitgraph/src/file/init.rs b/git-commitgraph/src/file/init.rs index 40104782888..1fd4516f24d 100644 --- a/git-commitgraph/src/file/init.rs +++ b/git-commitgraph/src/file/init.rs @@ -4,7 +4,6 @@ use std::{ }; use bstr::ByteSlice; -use byteorder::{BigEndian, ByteOrder}; use memmap2::Mmap; use crate::file::{ @@ -249,7 +248,7 @@ impl TryFrom<&Path> for File { fn read_fan(d: &[u8]) -> ([u32; FAN_LEN], usize) { let mut fan = [0; FAN_LEN]; for (c, f) in d.chunks(4).zip(fan.iter_mut()) { - *f = BigEndian::read_u32(c); + *f = u32::from_be_bytes(c.try_into().unwrap()); } (fan, FAN_LEN * 4) } From 41223061a2b919fd190066315b419ea17cabfde3 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 16:33:20 +0800 Subject: [PATCH 16/57] remove byteorder from git-pack (#293) It's sufficiently well supported using the standard library now. 
--- Cargo.lock | 1 - git-pack/Cargo.toml | 1 - git-pack/src/data/header.rs | 6 ++---- git-pack/src/index/access.rs | 29 +++++++++++++---------------- git-pack/src/index/init.rs | 6 ++---- git-pack/src/index/write/encode.rs | 14 +++++++------- git-pack/src/lib.rs | 13 +++++++++++++ git-pack/src/multi_index/access.rs | 8 +++----- git-pack/src/multi_index/chunk.rs | 14 ++++---------- git-pack/src/multi_index/init.rs | 4 +--- git-pack/src/multi_index/write.rs | 3 +-- 11 files changed, 46 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4e24ad17f36..b66233d9548 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1357,7 +1357,6 @@ name = "git-pack" version = "0.16.0" dependencies = [ "bstr", - "byteorder", "bytesize", "clru", "common_macros", diff --git a/git-pack/Cargo.toml b/git-pack/Cargo.toml index 89368d46e28..dcf64ace9bb 100644 --- a/git-pack/Cargo.toml +++ b/git-pack/Cargo.toml @@ -43,7 +43,6 @@ git-tempfile = { version ="^1.0.0", path = "../git-tempfile" } smallvec = "1.3.0" memmap2 = "0.5.0" -byteorder = "1.2.3" serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } bytesize = "1.0.1" os_str_bytes = "6.0.0" diff --git a/git-pack/src/data/header.rs b/git-pack/src/data/header.rs index 0ae297e80c2..348a4ca24ec 100644 --- a/git-pack/src/data/header.rs +++ b/git-pack/src/data/header.rs @@ -1,5 +1,3 @@ -use byteorder::{BigEndian, ByteOrder}; - use crate::data; pub(crate) const N32_SIZE: usize = std::mem::size_of::(); @@ -11,13 +9,13 @@ pub fn decode(data: &[u8; 12]) -> Result<(data::Version, u32), decode::Error> { return Err(decode::Error::Corrupt("Pack data type not recognized".into())); } ofs += N32_SIZE; - let kind = match BigEndian::read_u32(&data[ofs..ofs + N32_SIZE]) { + let kind = match crate::read_u32(&data[ofs..ofs + N32_SIZE]) { 2 => data::Version::V2, 3 => data::Version::V3, v => return Err(decode::Error::UnsupportedVersion(v)), }; ofs += N32_SIZE; - let num_objects = 
BigEndian::read_u32(&data[ofs..ofs + N32_SIZE]); + let num_objects = crate::read_u32(&data[ofs..ofs + N32_SIZE]); Ok((kind, num_objects)) } diff --git a/git-pack/src/index/access.rs b/git-pack/src/index/access.rs index c535377d672..aaa3319f9b4 100644 --- a/git-pack/src/index/access.rs +++ b/git-pack/src/index/access.rs @@ -1,7 +1,5 @@ use std::mem::size_of; -use byteorder::{BigEndian, ByteOrder}; - use crate::{ data, index::{self, EntryIndex, FAN_LEN}, @@ -39,7 +37,7 @@ impl index::File { let (ofs, oid) = c.split_at(N32_SIZE); Entry { oid: git_hash::ObjectId::from(oid), - pack_offset: BigEndian::read_u32(ofs) as u64, + pack_offset: crate::read_u32(ofs) as u64, crc32: None, } }), @@ -59,7 +57,7 @@ impl index::File { .map(move |(oid, crc32, ofs32)| Entry { oid: git_hash::ObjectId::from(oid), pack_offset: self.pack_offset_from_offset_v2(ofs32, pack64_offset), - crc32: Some(BigEndian::read_u32(crc32)), + crc32: Some(crate::read_u32(crc32)), }), _ => panic!("Cannot use iter_v2() on index of type {:?}", self.version), } @@ -94,7 +92,7 @@ impl index::File { } index::Version::V1 => { let start = V1_HEADER_SIZE + index * (N32_SIZE + self.hash_len); - BigEndian::read_u32(&self.data[start..][..N32_SIZE]) as u64 + crate::read_u32(&self.data[start..][..N32_SIZE]) as u64 } } } @@ -110,7 +108,7 @@ impl index::File { match self.version { index::Version::V2 => { let start = self.offset_crc32_v2() + index * N32_SIZE; - Some(BigEndian::read_u32(&self.data[start..start + N32_SIZE])) + Some(crate::read_u32(&self.data[start..start + N32_SIZE])) } index::Version::V1 => None, } @@ -153,14 +151,13 @@ impl index::File { let mut ofs: Vec<_> = match self.version { index::Version::V1 => self.iter().map(|e| e.pack_offset).collect(), index::Version::V2 => { - let mut v = Vec::with_capacity(self.num_objects as usize); - let mut ofs32 = &self.data[self.offset_pack_offset_v2()..]; - let pack_offset_64 = self.offset_pack_offset64_v2(); - for _ in 0..self.num_objects { - 
v.push(self.pack_offset_from_offset_v2(ofs32, pack_offset_64)); - ofs32 = &ofs32[4..]; - } - v + let offset32_start = &self.data[self.offset_pack_offset_v2()..]; + let pack_offset_64_start = self.offset_pack_offset64_v2(); + offset32_start + .chunks(N32_SIZE) + .take(self.num_objects as usize) + .map(|offset| self.pack_offset_from_offset_v2(offset, pack_offset_64_start)) + .collect() } }; ofs.sort_unstable(); @@ -185,10 +182,10 @@ impl index::File { #[inline] fn pack_offset_from_offset_v2(&self, offset: &[u8], pack64_offset: usize) -> data::Offset { debug_assert_eq!(self.version, index::Version::V2); - let ofs32 = BigEndian::read_u32(offset); + let ofs32 = crate::read_u32(offset); if (ofs32 & N32_HIGH_BIT) == N32_HIGH_BIT { let from = pack64_offset + (ofs32 ^ N32_HIGH_BIT) as usize * N64_SIZE; - BigEndian::read_u64(&self.data[from..][..N64_SIZE]) + crate::read_u64(&self.data[from..][..N64_SIZE]) } else { ofs32 as u64 } diff --git a/git-pack/src/index/init.rs b/git-pack/src/index/init.rs index 4ab959c71ef..78b225ad665 100644 --- a/git-pack/src/index/init.rs +++ b/git-pack/src/index/init.rs @@ -1,7 +1,5 @@ use std::{mem::size_of, path::Path}; -use byteorder::{BigEndian, ByteOrder}; - use crate::index::{self, Version, FAN_LEN, V2_SIGNATURE}; /// Returned by [`index::File::at()`]. 
@@ -57,7 +55,7 @@ impl index::File { let d = { if let Version::V2 = kind { let (vd, dr) = d.split_at(N32_SIZE); - let version = BigEndian::read_u32(vd); + let version = crate::read_u32(vd); if version != Version::V2 as u32 { return Err(Error::UnsupportedVersion { version }); } @@ -87,7 +85,7 @@ impl index::File { fn read_fan(d: &[u8]) -> ([u32; FAN_LEN], usize) { let mut fan = [0; FAN_LEN]; for (c, f) in d.chunks(N32_SIZE).zip(fan.iter_mut()) { - *f = BigEndian::read_u32(c); + *f = crate::read_u32(c); } (fan, FAN_LEN * N32_SIZE) } diff --git a/git-pack/src/index/write/encode.rs b/git-pack/src/index/write/encode.rs index feb3428ddf9..95dc6a45455 100644 --- a/git-pack/src/index/write/encode.rs +++ b/git-pack/src/index/write/encode.rs @@ -3,7 +3,6 @@ use std::{cmp::Ordering, io}; pub(crate) const LARGE_OFFSET_THRESHOLD: u64 = 0x7fff_ffff; pub(crate) const HIGH_BIT: u32 = 0x8000_0000; -use byteorder::{BigEndian, WriteBytesExt}; use git_features::{ hash, progress::{self, Progress}, @@ -35,7 +34,7 @@ pub(crate) fn write_to( hash::Write::new(out, kind.hash()), )); out.write_all(V2_SIGNATURE)?; - out.write_u32::(kind as u32)?; + out.write_all(&(kind as u32).to_be_bytes())?; progress.init(Some(4), progress::steps()); let start = std::time::Instant::now(); @@ -43,7 +42,7 @@ pub(crate) fn write_to( let fan_out = fanout(entries_sorted_by_oid.iter().map(|e| e.data.id.first_byte())); for value in fan_out { - out.write_u32::(value)?; + out.write_all(&value.to_be_bytes())?; } progress.inc(); @@ -55,7 +54,7 @@ pub(crate) fn write_to( progress.inc(); let _info = progress.add_child("writing crc32"); for entry in &entries_sorted_by_oid { - out.write_u32::(entry.data.crc32)?; + out.write_all(&entry.data.crc32.to_be_bytes())?; } progress.inc(); @@ -63,7 +62,7 @@ pub(crate) fn write_to( { let mut offsets64 = Vec::::new(); for entry in &entries_sorted_by_oid { - out.write_u32::(if entry.offset > LARGE_OFFSET_THRESHOLD { + let offset: u32 = if entry.offset > LARGE_OFFSET_THRESHOLD { 
assert!( offsets64.len() < LARGE_OFFSET_THRESHOLD as usize, "Encoding breakdown - way too many 64bit offsets" @@ -72,10 +71,11 @@ pub(crate) fn write_to( ((offsets64.len() - 1) as u32) | HIGH_BIT } else { entry.offset as u32 - })?; + }; + out.write_all(&offset.to_be_bytes())?; } for value in offsets64 { - out.write_u64::(value)?; + out.write_all(&value.to_be_bytes())?; } } diff --git a/git-pack/src/lib.rs b/git-pack/src/lib.rs index a2b7d8fa0fb..35f4e6f3c0c 100755 --- a/git-pack/src/lib.rs +++ b/git-pack/src/lib.rs @@ -32,6 +32,7 @@ pub mod cache; pub mod data; mod find_traits; + pub use find_traits::{Find, FindExt}; /// @@ -54,3 +55,15 @@ mod mmap { } } } + +use std::convert::TryInto; + +#[inline] +fn read_u32(b: &[u8]) -> u32 { + u32::from_be_bytes(b.try_into().unwrap()) +} + +#[inline] +fn read_u64(b: &[u8]) -> u64 { + u64::from_be_bytes(b.try_into().unwrap()) +} diff --git a/git-pack/src/multi_index/access.rs b/git-pack/src/multi_index/access.rs index 081873fe4f8..42956ba0001 100644 --- a/git-pack/src/multi_index/access.rs +++ b/git-pack/src/multi_index/access.rs @@ -1,7 +1,5 @@ use std::path::{Path, PathBuf}; -use byteorder::{BigEndian, ByteOrder}; - use crate::{ data, multi_index::{EntryIndex, File, PackIndex, Version}, @@ -102,15 +100,15 @@ impl File { const HIGH_BIT: u32 = 1 << 31; - let pack_index = BigEndian::read_u32(&self.data[start..][..4]); + let pack_index = crate::read_u32(&self.data[start..][..4]); let offset = &self.data[start + 4..][..4]; - let ofs32 = BigEndian::read_u32(offset); + let ofs32 = crate::read_u32(offset); let pack_offset = if (ofs32 & HIGH_BIT) == HIGH_BIT { // We determine if large offsets are actually larger than 4GB and if not, we don't use the high-bit to signal anything // but allow the presence of the large-offset chunk to signal what's happening. 
if let Some(offsets_64) = self.large_offsets_ofs { let from = offsets_64 + (ofs32 ^ HIGH_BIT) as usize * 8; - BigEndian::read_u64(&self.data[from..][..8]) + crate::read_u64(&self.data[from..][..8]) } else { ofs32 as u64 } diff --git a/git-pack/src/multi_index/chunk.rs b/git-pack/src/multi_index/chunk.rs index 8d6c734a556..5aeae5fe3f2 100644 --- a/git-pack/src/multi_index/chunk.rs +++ b/git-pack/src/multi_index/chunk.rs @@ -108,8 +108,6 @@ pub mod index_names { pub mod fanout { use std::convert::TryInto; - use byteorder::{BigEndian, WriteBytesExt}; - use crate::multi_index; /// The size of the fanout table @@ -138,7 +136,7 @@ pub mod fanout { let fanout = crate::index::write::encode::fanout(sorted_entries.iter().map(|e| e.id.first_byte())); for value in fanout { - out.write_u32::(value)?; + out.write_all(&value.to_be_bytes())?; } Ok(()) } @@ -178,8 +176,6 @@ pub mod lookup { pub mod offsets { use std::{convert::TryInto, ops::Range}; - use byteorder::{BigEndian, WriteBytesExt}; - use crate::multi_index; /// The id uniquely identifying the offsets table. 
@@ -199,7 +195,7 @@ pub mod offsets { let mut num_large_offsets = 0u32; for entry in sorted_entries { - out.write_u32::(entry.pack_index)?; + out.write_all(&entry.pack_index.to_be_bytes())?; let offset: u32 = if large_offsets_needed { if entry.pack_offset > LARGE_OFFSET_THRESHOLD { @@ -215,7 +211,7 @@ pub mod offsets { .try_into() .expect("without large offsets, pack-offset fits u32") }; - out.write_u32::(offset)?; + out.write_all(&offset.to_be_bytes())?; } Ok(()) } @@ -231,8 +227,6 @@ pub mod offsets { pub mod large_offsets { use std::ops::Range; - use byteorder::{BigEndian, WriteBytesExt}; - use crate::{index::write::encode::LARGE_OFFSET_THRESHOLD, multi_index}; /// The id uniquely identifying the large offsets table (with 64 bit offsets) @@ -267,7 +261,7 @@ pub mod large_offsets { .iter() .filter_map(|e| (e.pack_offset > LARGE_OFFSET_THRESHOLD).then(|| e.pack_offset)) { - out.write_u64::(offset)?; + out.write_all(&offset.to_be_bytes())?; num_large_offsets = num_large_offsets .checked_sub(1) .expect("BUG: wrote more offsets the previously found"); diff --git a/git-pack/src/multi_index/init.rs b/git-pack/src/multi_index/init.rs index 871ddfec2fd..ca1afb22d24 100644 --- a/git-pack/src/multi_index/init.rs +++ b/git-pack/src/multi_index/init.rs @@ -1,7 +1,5 @@ use std::{convert::TryFrom, path::Path}; -use byteorder::{BigEndian, ByteOrder}; - use crate::multi_index::{chunk, File, Version}; mod error { @@ -90,7 +88,7 @@ impl TryFrom<&Path> for File { let (_num_base_files, data) = data.split_at(1); // TODO: handle base files once it's clear what this does let (num_indices, _) = data.split_at(4); - let num_indices = BigEndian::read_u32(num_indices); + let num_indices = crate::read_u32(num_indices); (version, object_hash, num_chunks, num_indices) }; diff --git a/git-pack/src/multi_index/write.rs b/git-pack/src/multi_index/write.rs index 04e18d4e900..9de5f50489b 100644 --- a/git-pack/src/multi_index/write.rs +++ b/git-pack/src/multi_index/write.rs @@ -5,7 +5,6 @@ use 
std::{ time::{Instant, SystemTime}, }; -use byteorder::{BigEndian, WriteBytesExt}; use git_features::progress::Progress; use crate::multi_index; @@ -217,7 +216,7 @@ impl multi_index::File { out.write_all(&[object_hash as u8])?; out.write_all(&[num_chunks])?; out.write_all(&[0])?; /* unused number of base files */ - out.write_u32::(num_indices)?; + out.write_all(&num_indices.to_be_bytes())?; Ok(Self::HEADER_LEN) } From 5c731f831d007a4fe099cadc4ecaab113ab7e08a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 16:54:15 +0800 Subject: [PATCH 17/57] parse index header (#293) --- git-index/Cargo.toml | 2 +- git-index/src/file.rs | 52 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index dd18d1d867a..7718a554bb0 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -5,7 +5,7 @@ repository = "https://github.com/Byron/gitoxide" license = "MIT/Apache-2.0" description = "A WIP crate of the gitoxide project dedicated implementing the git index file" authors = ["Sebastian Thiel "] -edition = "2018" +edition = "2021" [lib] doctest = false diff --git a/git-index/src/file.rs b/git-index/src/file.rs index 79dd0572fe3..f9e7aff89e5 100644 --- a/git-index/src/file.rs +++ b/git-index/src/file.rs @@ -1,10 +1,13 @@ pub mod init { #![allow(unused)] + + use crate::file::decode; use crate::{File, State}; use memmap2::Mmap; use std::path::{Path, PathBuf}; mod error { + use crate::file::decode; use quick_error::quick_error; quick_error! 
{ @@ -15,6 +18,11 @@ pub mod init { source(err) from() } + DecodeHeader(err: decode::header::Error) { + display("The header could not be understood") + source(err) + from() + } } } } @@ -31,6 +39,8 @@ pub mod init { (data, filetime::FileTime::from_last_modification_time(&file.metadata()?)) }; + let (version, num_entries, data) = decode::header(&data)?; + Ok(File { state: State { timestamp: mtime }, path, @@ -40,6 +50,8 @@ pub mod init { } pub mod decode { + use crate::Version; + pub mod header { mod error { use quick_error::quick_error; @@ -47,10 +59,11 @@ pub mod decode { quick_error! { #[derive(Debug)] pub enum Error { - Io(err: std::io::Error) { - display("An IO error occurred while opening the index") - source(err) - from() + Corrupt(message: &'static str) { + display("{}", message) + } + UnsupportedVersion(version: u32) { + display("Index version {} is not supported", version) } } } @@ -58,7 +71,34 @@ pub mod decode { pub use error::Error; } - fn header(data: &[u8]) -> Result<(crate::Version, &[u8]), header::Error> { - todo!("header parsing") + pub(crate) fn header(data: &[u8]) -> Result<(crate::Version, u32, &[u8]), header::Error> { + if data.len() < 3 * 4 { + return Err(header::Error::Corrupt("The header is truncated")); + } + + const SIGNATURE: &[u8] = b"DIRC"; + let (signature, data) = data.split_at(4); + if signature != SIGNATURE { + return Err(header::Error::Corrupt( + "Signature mismatch - this doesn't claim to be a header file", + )); + } + + let (version, data) = data.split_at(4); + let version = match read_u32(version) { + 2 => Version::V2, + 3 => Version::V3, + 4 => Version::V4, + unknown => return Err(header::Error::UnsupportedVersion(unknown)), + }; + let (entries, data) = data.split_at(4); + let entries = read_u32(entries); + + Ok((version, entries, data)) + } + + #[inline] + fn read_u32(b: &[u8]) -> u32 { + u32::from_be_bytes(b.try_into().unwrap()) } } From 068c716b46699234d6ad1db70be34b894e61d76a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel 
Date: Sun, 9 Jan 2022 18:19:54 +0800 Subject: [PATCH 18/57] first step towards reading the EOIE extension (#293) --- git-index/src/file.rs | 50 +++++++++++++++++++++-- git-index/src/lib.rs | 17 ++++++++ git-index/tests/fixtures/make_index/v2.sh | 1 + 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/git-index/src/file.rs b/git-index/src/file.rs index f9e7aff89e5..8f178cd6883 100644 --- a/git-index/src/file.rs +++ b/git-index/src/file.rs @@ -39,7 +39,8 @@ pub mod init { (data, filetime::FileTime::from_last_modification_time(&file.metadata()?)) }; - let (version, num_entries, data) = decode::header(&data)?; + let (version, num_entries, post_header_data) = decode::header(&data, object_hash)?; + let start_of_extensions = decode::extension::end_of_index_entry(&data, object_hash); Ok(File { state: State { timestamp: mtime }, @@ -52,7 +53,43 @@ pub mod init { pub mod decode { use crate::Version; + fn extension(data: &[u8]) -> ([u8; 4], u32, &[u8]) { + let (signature, data) = data.split_at(4); + let (size, data) = data.split_at(4); + (signature.try_into().unwrap(), read_u32(size), data) + } + + pub(crate) mod extension { + use crate::extension::EndOfIndexEntry; + use crate::file::decode; + use crate::file::decode::read_u32; + + pub fn end_of_index_entry(data: &[u8], object_hash: git_hash::Kind) -> Option { + let hash_len = object_hash.len_in_bytes(); + if data.len() < EndOfIndexEntry::SIZE_WITH_HEADER + hash_len { + return None; + } + + let start_of_eoie = data.len() - EndOfIndexEntry::SIZE_WITH_HEADER - hash_len; + let data = &data[start_of_eoie..][..hash_len]; + + let (signature, ext_size, data) = decode::extension(data); + if &signature != EndOfIndexEntry::SIGNATURE || ext_size as usize != EndOfIndexEntry::SIZE { + return None; + } + + let (offset, hash) = data.split_at(4); + let offset = read_u32(offset) as usize; + if offset < decode::header::SIZE { + return None; + } + todo!("eoie") + } + } + pub mod header { + pub(crate) const SIZE: usize = 4 
/*signature*/ + 4 /*version*/ + 4 /* num entries */; + mod error { use quick_error::quick_error; @@ -71,9 +108,14 @@ pub mod decode { pub use error::Error; } - pub(crate) fn header(data: &[u8]) -> Result<(crate::Version, u32, &[u8]), header::Error> { - if data.len() < 3 * 4 { - return Err(header::Error::Corrupt("The header is truncated")); + pub(crate) fn header( + data: &[u8], + object_hash: git_hash::Kind, + ) -> Result<(crate::Version, u32, &[u8]), header::Error> { + if data.len() < (3 * 4) + object_hash.len_in_bytes() { + return Err(header::Error::Corrupt( + "File is too small even for header with zero entries and smallest hash", + )); } const SIGNATURE: &[u8] = b"DIRC"; diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index e5b6c0ed8f3..fafc76f587f 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -6,6 +6,23 @@ use std::path::PathBuf; pub mod file; +pub mod extension { + const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; + + pub struct EndOfIndexEntry { + /// The offset the the beginning of all extensions, or the end of all entries. + offset_to_extensions: u32, + /// The SHA1 checksum over the signature and size of all extensions. 
+ checksum: git_hash::ObjectId, + } + + impl EndOfIndexEntry { + pub const SIGNATURE: &'static [u8] = b"EOIE"; + pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); + pub const SIZE_WITH_HEADER: usize = crate::extension::MIN_SIZE + Self::SIZE; + } +} + pub mod init { use crate::State; use filetime::FileTime; diff --git a/git-index/tests/fixtures/make_index/v2.sh b/git-index/tests/fixtures/make_index/v2.sh index 56cce83c35c..20b16c3b587 100644 --- a/git-index/tests/fixtures/make_index/v2.sh +++ b/git-index/tests/fixtures/make_index/v2.sh @@ -3,6 +3,7 @@ set -eu -o pipefail GIT_INDEX_VERSION=2 git init -q git config commit.gpgsign false +git config index.threads 2 touch a git add a From 9b28b18262c763608d60fba65e91fcb9ca3ddb3e Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 18:27:05 +0800 Subject: [PATCH 19/57] refactor (#293) --- git-index/src/file.rs | 95 +++++++---------------- git-index/src/lib.rs | 49 +++++++++++- git-odb/src/store_impls/dynamic/verify.rs | 2 +- gitoxide-core/src/repository.rs | 4 +- 4 files changed, 77 insertions(+), 73 deletions(-) diff --git a/git-index/src/file.rs b/git-index/src/file.rs index 8f178cd6883..94c438d1aed 100644 --- a/git-index/src/file.rs +++ b/git-index/src/file.rs @@ -1,15 +1,17 @@ pub mod init { #![allow(unused)] - use crate::file::decode; - use crate::{File, State}; - use memmap2::Mmap; use std::path::{Path, PathBuf}; + use memmap2::Mmap; + + use crate::{extension, file::header, File, State}; + mod error { - use crate::file::decode; use quick_error::quick_error; + use crate::file::header; + quick_error! 
{ #[derive(Debug)] pub enum Error { @@ -18,7 +20,7 @@ pub mod init { source(err) from() } - DecodeHeader(err: decode::header::Error) { + DecodeHeader(err: header::decode::Error) { display("The header could not be understood") source(err) from() @@ -39,8 +41,8 @@ pub mod init { (data, filetime::FileTime::from_last_modification_time(&file.metadata()?)) }; - let (version, num_entries, post_header_data) = decode::header(&data, object_hash)?; - let start_of_extensions = decode::extension::end_of_index_entry(&data, object_hash); + let (version, num_entries, post_header_data) = header::decode(&data, object_hash)?; + let start_of_extensions = extension::EndOfIndexEntry::from_bytes(&data, object_hash); Ok(File { state: State { timestamp: mtime }, @@ -50,70 +52,32 @@ pub mod init { } } -pub mod decode { - use crate::Version; - - fn extension(data: &[u8]) -> ([u8; 4], u32, &[u8]) { - let (signature, data) = data.split_at(4); - let (size, data) = data.split_at(4); - (signature.try_into().unwrap(), read_u32(size), data) - } - - pub(crate) mod extension { - use crate::extension::EndOfIndexEntry; - use crate::file::decode; - use crate::file::decode::read_u32; - - pub fn end_of_index_entry(data: &[u8], object_hash: git_hash::Kind) -> Option { - let hash_len = object_hash.len_in_bytes(); - if data.len() < EndOfIndexEntry::SIZE_WITH_HEADER + hash_len { - return None; - } - - let start_of_eoie = data.len() - EndOfIndexEntry::SIZE_WITH_HEADER - hash_len; - let data = &data[start_of_eoie..][..hash_len]; +pub mod header { + pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entries */; - let (signature, ext_size, data) = decode::extension(data); - if &signature != EndOfIndexEntry::SIGNATURE || ext_size as usize != EndOfIndexEntry::SIZE { - return None; - } - - let (offset, hash) = data.split_at(4); - let offset = read_u32(offset) as usize; - if offset < decode::header::SIZE { - return None; - } - todo!("eoie") - } - } + pub mod decode { + use 
quick_error::quick_error; - pub mod header { - pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entries */; - - mod error { - use quick_error::quick_error; - - quick_error! { - #[derive(Debug)] - pub enum Error { - Corrupt(message: &'static str) { - display("{}", message) - } - UnsupportedVersion(version: u32) { - display("Index version {} is not supported", version) - } + quick_error! { + #[derive(Debug)] + pub enum Error { + Corrupt(message: &'static str) { + display("{}", message) + } + UnsupportedVersion(version: u32) { + display("Index version {} is not supported", version) } } } - pub use error::Error; } + use crate::{util::read_u32, Version}; - pub(crate) fn header( + pub(crate) fn decode( data: &[u8], object_hash: git_hash::Kind, - ) -> Result<(crate::Version, u32, &[u8]), header::Error> { + ) -> Result<(crate::Version, u32, &[u8]), decode::Error> { if data.len() < (3 * 4) + object_hash.len_in_bytes() { - return Err(header::Error::Corrupt( + return Err(decode::Error::Corrupt( "File is too small even for header with zero entries and smallest hash", )); } @@ -121,7 +85,7 @@ pub mod decode { const SIGNATURE: &[u8] = b"DIRC"; let (signature, data) = data.split_at(4); if signature != SIGNATURE { - return Err(header::Error::Corrupt( + return Err(decode::Error::Corrupt( "Signature mismatch - this doesn't claim to be a header file", )); } @@ -131,16 +95,11 @@ pub mod decode { 2 => Version::V2, 3 => Version::V3, 4 => Version::V4, - unknown => return Err(header::Error::UnsupportedVersion(unknown)), + unknown => return Err(decode::Error::UnsupportedVersion(unknown)), }; let (entries, data) = data.split_at(4); let entries = read_u32(entries); Ok((version, entries, data)) } - - #[inline] - fn read_u32(b: &[u8]) -> u32 { - u32::from_be_bytes(b.try_into().unwrap()) - } } diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index fafc76f587f..7fa0b41ef3e 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -1,14 +1,51 @@ 
#![deny(unsafe_code, missing_docs, rust_2018_idioms)] #![allow(missing_docs, unused)] -use filetime::FileTime; use std::path::PathBuf; +use filetime::FileTime; + pub mod file; pub mod extension { + use crate::{util::read_u32, Version}; + const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; + fn decode_header(data: &[u8]) -> ([u8; 4], u32, &[u8]) { + let (signature, data) = data.split_at(4); + let (size, data) = data.split_at(4); + (signature.try_into().unwrap(), read_u32(size), data) + } + + mod end_of_index_entry { + use crate::{extension, extension::EndOfIndexEntry, file::header, util::read_u32}; + + impl EndOfIndexEntry { + pub fn from_bytes(data: &[u8], object_hash: git_hash::Kind) -> Option { + let hash_len = object_hash.len_in_bytes(); + if data.len() < EndOfIndexEntry::SIZE_WITH_HEADER + hash_len { + return None; + } + + let start_of_eoie = data.len() - EndOfIndexEntry::SIZE_WITH_HEADER - hash_len; + let data = &data[start_of_eoie..][..hash_len]; + + let (signature, ext_size, data) = extension::decode_header(data); + if &signature != EndOfIndexEntry::SIGNATURE || ext_size as usize != EndOfIndexEntry::SIZE { + return None; + } + + let (offset, hash) = data.split_at(4); + let offset = read_u32(offset) as usize; + if offset < header::SIZE { + return None; + } + todo!("eoie") + } + } + } + pub struct EndOfIndexEntry { /// The offset the the beginning of all extensions, or the end of all entries. offset_to_extensions: u32, @@ -24,9 +61,10 @@ pub mod extension { } pub mod init { - use crate::State; use filetime::FileTime; + use crate::State; + impl State { /// Returns an empty state. /// TODO: figure out if it needs to know some configuration, and if this would actually be used somewhere @@ -71,3 +109,10 @@ pub struct State { /// same timestamp as this as potentially changed, checking more thoroughly if a change actually happened. 
timestamp: FileTime, } + +pub(crate) mod util { + #[inline] + pub fn read_u32(b: &[u8]) -> u32 { + u32::from_be_bytes(b.try_into().unwrap()) + } +} diff --git a/git-odb/src/store_impls/dynamic/verify.rs b/git-odb/src/store_impls/dynamic/verify.rs index a4a43ab332b..3b36f64476f 100644 --- a/git-odb/src/store_impls/dynamic/verify.rs +++ b/git-odb/src/store_impls/dynamic/verify.rs @@ -1,7 +1,7 @@ -use std::time::Instant; use std::{ ops::Deref, sync::atomic::{AtomicBool, Ordering}, + time::Instant, }; use git_features::progress::{MessageLevel, Progress}; diff --git a/gitoxide-core/src/repository.rs b/gitoxide-core/src/repository.rs index fe1bfd9028b..aad5dfa7944 100644 --- a/gitoxide-core/src/repository.rs +++ b/gitoxide-core/src/repository.rs @@ -8,12 +8,12 @@ pub fn init(directory: Option) -> Result { } pub mod verify { - use crate::pack; - use crate::OutputFormat; use std::{path::PathBuf, sync::atomic::AtomicBool}; use git_repository::Progress; + use crate::{pack, OutputFormat}; + /// A general purpose context for many operations provided here pub struct Context { /// If set, provide statistics to `out` in the given format From 79ca582045dd03434737c779b84c991acf1b0823 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 18:30:08 +0800 Subject: [PATCH 20/57] right before implementing a traversal over extension chunks (#293) --- git-index/src/extension.rs | 50 +++++++++++++++++++++++++++++++++++ git-index/src/lib.rs | 53 +------------------------------------- 2 files changed, 51 insertions(+), 52 deletions(-) create mode 100644 git-index/src/extension.rs diff --git a/git-index/src/extension.rs b/git-index/src/extension.rs new file mode 100644 index 00000000000..ae38f73661e --- /dev/null +++ b/git-index/src/extension.rs @@ -0,0 +1,50 @@ +use crate::{util::read_u32, Version}; + +const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; + +fn decode_header(data: &[u8]) -> ([u8; 4], u32, &[u8]) { + let (signature, data) = data.split_at(4); + let (size, 
data) = data.split_at(4); + (signature.try_into().unwrap(), read_u32(size), data) +} + +mod end_of_index_entry { + use crate::{extension, extension::EndOfIndexEntry, file::header, util::read_u32}; + + impl EndOfIndexEntry { + pub fn from_bytes(data: &[u8], object_hash: git_hash::Kind) -> Option { + let hash_len = object_hash.len_in_bytes(); + if data.len() < EndOfIndexEntry::SIZE_WITH_HEADER + hash_len { + return None; + } + + let start_of_eoie = data.len() - EndOfIndexEntry::SIZE_WITH_HEADER - hash_len; + let data = &data[start_of_eoie..][..hash_len]; + + let (signature, ext_size, data) = extension::decode_header(data); + if &signature != EndOfIndexEntry::SIGNATURE || ext_size as usize != EndOfIndexEntry::SIZE { + return None; + } + + let (offset, hash) = data.split_at(4); + let offset = read_u32(offset) as usize; + if offset < header::SIZE || offset > start_of_eoie { + return None; + } + todo!("eoie") + } + } +} + +pub struct EndOfIndexEntry { + /// The offset the the beginning of all extensions, or the end of all entries. + offset_to_extensions: u32, + /// The SHA1 checksum over the signature and size of all extensions. 
+ checksum: git_hash::ObjectId, +} + +impl EndOfIndexEntry { + pub const SIGNATURE: &'static [u8] = b"EOIE"; + pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); + pub const SIZE_WITH_HEADER: usize = crate::extension::MIN_SIZE + Self::SIZE; +} diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index 7fa0b41ef3e..3b0bac22f96 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -7,58 +7,7 @@ use filetime::FileTime; pub mod file; -pub mod extension { - use crate::{util::read_u32, Version}; - - const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; - - fn decode_header(data: &[u8]) -> ([u8; 4], u32, &[u8]) { - let (signature, data) = data.split_at(4); - let (size, data) = data.split_at(4); - (signature.try_into().unwrap(), read_u32(size), data) - } - - mod end_of_index_entry { - use crate::{extension, extension::EndOfIndexEntry, file::header, util::read_u32}; - - impl EndOfIndexEntry { - pub fn from_bytes(data: &[u8], object_hash: git_hash::Kind) -> Option { - let hash_len = object_hash.len_in_bytes(); - if data.len() < EndOfIndexEntry::SIZE_WITH_HEADER + hash_len { - return None; - } - - let start_of_eoie = data.len() - EndOfIndexEntry::SIZE_WITH_HEADER - hash_len; - let data = &data[start_of_eoie..][..hash_len]; - - let (signature, ext_size, data) = extension::decode_header(data); - if &signature != EndOfIndexEntry::SIGNATURE || ext_size as usize != EndOfIndexEntry::SIZE { - return None; - } - - let (offset, hash) = data.split_at(4); - let offset = read_u32(offset) as usize; - if offset < header::SIZE { - return None; - } - todo!("eoie") - } - } - } - - pub struct EndOfIndexEntry { - /// The offset the the beginning of all extensions, or the end of all entries. - offset_to_extensions: u32, - /// The SHA1 checksum over the signature and size of all extensions. 
- checksum: git_hash::ObjectId, - } - - impl EndOfIndexEntry { - pub const SIGNATURE: &'static [u8] = b"EOIE"; - pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); - pub const SIZE_WITH_HEADER: usize = crate::extension::MIN_SIZE + Self::SIZE; - } -} +pub mod extension; pub mod init { use filetime::FileTime; From 591511a739f91c5e8ff4243059ac98052a44c914 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 18:30:44 +0800 Subject: [PATCH 21/57] thanks clippy --- git-index/src/extension.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/git-index/src/extension.rs b/git-index/src/extension.rs index ae38f73661e..70f84ef9bdd 100644 --- a/git-index/src/extension.rs +++ b/git-index/src/extension.rs @@ -22,7 +22,7 @@ mod end_of_index_entry { let data = &data[start_of_eoie..][..hash_len]; let (signature, ext_size, data) = extension::decode_header(data); - if &signature != EndOfIndexEntry::SIGNATURE || ext_size as usize != EndOfIndexEntry::SIZE { + if signature != EndOfIndexEntry::SIGNATURE || ext_size as usize != EndOfIndexEntry::SIZE { return None; } From 9ffd5231c582a3870c6d25ea870c005e77e32276 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 20:40:15 +0800 Subject: [PATCH 22/57] Another big step, even though EOIE checksum is still bugged (#293) --- Cargo.lock | 1 + git-index/Cargo.toml | 1 + git-index/src/extension.rs | 81 ++++++++++++++++++++++++++++++++++---- 3 files changed, 76 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b66233d9548..898674d5745 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1228,6 +1228,7 @@ name = "git-index" version = "0.0.0" dependencies = [ "filetime", + "git-features 0.18.0", "git-hash 0.8.0", "git-testtools", "memmap2", diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index 7718a554bb0..7bc7bd2a76c 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -16,6 +16,7 @@ serde1 = ["serde"] # See more keys and 
their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +git-features = { version ="^0.18.0", path = "../git-features", features = ["rustsha1"] } git-hash = { version ="^0.8.0", path = "../git-hash" } quick-error = "2.0.0" diff --git a/git-index/src/extension.rs b/git-index/src/extension.rs index 70f84ef9bdd..86f602f34b9 100644 --- a/git-index/src/extension.rs +++ b/git-index/src/extension.rs @@ -2,7 +2,9 @@ use crate::{util::read_u32, Version}; const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; -fn decode_header(data: &[u8]) -> ([u8; 4], u32, &[u8]) { +pub type Signature = [u8; 4]; + +fn decode_header(data: &[u8]) -> (Signature, u32, &[u8]) { let (signature, data) = data.split_at(4); let (size, data) = data.split_at(4); (signature.try_into().unwrap(), read_u32(size), data) @@ -19,23 +21,88 @@ mod end_of_index_entry { } let start_of_eoie = data.len() - EndOfIndexEntry::SIZE_WITH_HEADER - hash_len; - let data = &data[start_of_eoie..][..hash_len]; + let ext_data = &data[start_of_eoie..][..hash_len]; - let (signature, ext_size, data) = extension::decode_header(data); + let (signature, ext_size, ext_data) = extension::decode_header(ext_data); if signature != EndOfIndexEntry::SIGNATURE || ext_size as usize != EndOfIndexEntry::SIZE { return None; } - let (offset, hash) = data.split_at(4); + let (offset, checksum) = ext_data.split_at(4); let offset = read_u32(offset) as usize; - if offset < header::SIZE || offset > start_of_eoie { + if offset < header::SIZE || offset > start_of_eoie || checksum.len() != git_hash::Kind::Sha1.len_in_bytes() + { + dbg!("checksum too small"); return None; } - todo!("eoie") + + let mut hasher = git_features::hash::hasher(git_hash::Kind::Sha1); + let mut last_chunk = None; + for (signature, chunk) in + extension::Iter::new(&data[offset..data.len() - EndOfIndexEntry::SIZE_WITH_HEADER - hash_len]) + { + hasher.update(&signature); + hasher.update(&(chunk.len() as u32).to_be_bytes()); + last_chunk = 
Some(chunk); + } + + if hasher.digest() != checksum { + return None; + } + if last_chunk + .map(|s| s.as_ptr_range() != ext_data.as_ptr_range()) + .unwrap_or(true) + { + return None; + } + todo!("euio") } } } +mod iter { + use crate::extension; + use crate::extension::Iter; + use crate::util::read_u32; + + impl<'a> Iter<'a> { + pub fn new(data_at_beginning_of_extensions_and_truncated: &'a [u8]) -> Self { + Iter { + data: data_at_beginning_of_extensions_and_truncated, + } + } + } + + impl<'a> Iterator for Iter<'a> { + type Item = (extension::Signature, &'a [u8]); + + fn next(&mut self) -> Option { + if self.data.len() < 4 + 4 { + return None; + } + + let (signature, data) = self.data.split_at(4); + let (size, data) = data.split_at(4); + let size = read_u32(size) as usize; + + match data.get(..size) { + Some(ext_data) => { + self.data = &data[size..]; + Some((signature.try_into().unwrap(), ext_data)) + } + None => { + self.data = &[]; + None + } + } + } + } +} + +pub struct Iter<'a> { + data: &'a [u8], +} + pub struct EndOfIndexEntry { /// The offset the the beginning of all extensions, or the end of all entries. 
offset_to_extensions: u32, @@ -44,7 +111,7 @@ pub struct EndOfIndexEntry { } impl EndOfIndexEntry { - pub const SIGNATURE: &'static [u8] = b"EOIE"; + pub const SIGNATURE: Signature = *b"EOIE"; pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); pub const SIZE_WITH_HEADER: usize = crate::extension::MIN_SIZE + Self::SIZE; } From cc337526365a04a23571123531f1ae565d386bcf Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 21:03:41 +0800 Subject: [PATCH 23/57] Fix counting issue, checksum matches now (#293) --- git-index/src/extension.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/git-index/src/extension.rs b/git-index/src/extension.rs index 86f602f34b9..a6e83447d07 100644 --- a/git-index/src/extension.rs +++ b/git-index/src/extension.rs @@ -21,7 +21,7 @@ mod end_of_index_entry { } let start_of_eoie = data.len() - EndOfIndexEntry::SIZE_WITH_HEADER - hash_len; - let ext_data = &data[start_of_eoie..][..hash_len]; + let ext_data = &data[start_of_eoie..data.len() - hash_len]; let (signature, ext_size, ext_data) = extension::decode_header(ext_data); if signature != EndOfIndexEntry::SIGNATURE || ext_size as usize != EndOfIndexEntry::SIZE { @@ -32,7 +32,6 @@ mod end_of_index_entry { let offset = read_u32(offset) as usize; if offset < header::SIZE || offset > start_of_eoie || checksum.len() != git_hash::Kind::Sha1.len_in_bytes() { - dbg!("checksum too small"); return None; } @@ -49,8 +48,9 @@ mod end_of_index_entry { if hasher.digest() != checksum { return None; } + // The last-to-this chunk ends where ours starts if last_chunk - .map(|s| s.as_ptr_range() != ext_data.as_ptr_range()) + .map(|s| s.as_ptr_range().end != (&data[start_of_eoie]) as *const _) .unwrap_or(true) { return None; From 9fdd34b634f4f15eb6cf5c2e7912bdc32dd61de6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jan 2022 21:10:26 +0800 Subject: [PATCH 24/57] refactor (#293) --- git-index/src/extension.rs | 97 
++++++++++++++++---------------------- git-index/src/file.rs | 2 +- git-index/src/lib.rs | 2 +- 3 files changed, 43 insertions(+), 58 deletions(-) diff --git a/git-index/src/extension.rs b/git-index/src/extension.rs index a6e83447d07..f620112f8ed 100644 --- a/git-index/src/extension.rs +++ b/git-index/src/extension.rs @@ -10,60 +10,58 @@ fn decode_header(data: &[u8]) -> (Signature, u32, &[u8]) { (signature.try_into().unwrap(), read_u32(size), data) } -mod end_of_index_entry { - use crate::{extension, extension::EndOfIndexEntry, file::header, util::read_u32}; +pub(crate) mod end_of_index_entry { + use crate::{extension, extension::Signature, file::header, util::read_u32}; - impl EndOfIndexEntry { - pub fn from_bytes(data: &[u8], object_hash: git_hash::Kind) -> Option { - let hash_len = object_hash.len_in_bytes(); - if data.len() < EndOfIndexEntry::SIZE_WITH_HEADER + hash_len { - return None; - } + pub const SIGNATURE: Signature = *b"EOIE"; + pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); + pub const SIZE_WITH_HEADER: usize = crate::extension::MIN_SIZE + SIZE; - let start_of_eoie = data.len() - EndOfIndexEntry::SIZE_WITH_HEADER - hash_len; - let ext_data = &data[start_of_eoie..data.len() - hash_len]; + pub fn decode(data: &[u8], object_hash: git_hash::Kind) -> Option { + let hash_len = object_hash.len_in_bytes(); + if data.len() < SIZE_WITH_HEADER + hash_len { + return None; + } - let (signature, ext_size, ext_data) = extension::decode_header(ext_data); - if signature != EndOfIndexEntry::SIGNATURE || ext_size as usize != EndOfIndexEntry::SIZE { - return None; - } + let start_of_eoie = data.len() - SIZE_WITH_HEADER - hash_len; + let ext_data = &data[start_of_eoie..data.len() - hash_len]; - let (offset, checksum) = ext_data.split_at(4); - let offset = read_u32(offset) as usize; - if offset < header::SIZE || offset > start_of_eoie || checksum.len() != git_hash::Kind::Sha1.len_in_bytes() - { - return None; - } + let 
(signature, ext_size, ext_data) = extension::decode_header(ext_data); + if signature != SIGNATURE || ext_size as usize != SIZE { + return None; + } - let mut hasher = git_features::hash::hasher(git_hash::Kind::Sha1); - let mut last_chunk = None; - for (signature, chunk) in - extension::Iter::new(&data[offset..data.len() - EndOfIndexEntry::SIZE_WITH_HEADER - hash_len]) - { - hasher.update(&signature); - hasher.update(&(chunk.len() as u32).to_be_bytes()); - last_chunk = Some(chunk); - } + let (offset, checksum) = ext_data.split_at(4); + let offset = read_u32(offset) as usize; + if offset < header::SIZE || offset > start_of_eoie || checksum.len() != git_hash::Kind::Sha1.len_in_bytes() { + return None; + } - if hasher.digest() != checksum { - return None; - } - // The last-to-this chunk ends where ours starts - if last_chunk - .map(|s| s.as_ptr_range().end != (&data[start_of_eoie]) as *const _) - .unwrap_or(true) - { - return None; - } - todo!("euio") + let mut hasher = git_features::hash::hasher(git_hash::Kind::Sha1); + let mut last_chunk = None; + for (signature, chunk) in extension::Iter::new(&data[offset..data.len() - SIZE_WITH_HEADER - hash_len]) { + hasher.update(&signature); + hasher.update(&(chunk.len() as u32).to_be_bytes()); + last_chunk = Some(chunk); } + + if hasher.digest() != checksum { + return None; + } + // The last-to-this chunk ends where ours starts + if last_chunk + .map(|s| s.as_ptr_range().end != (&data[start_of_eoie]) as *const _) + .unwrap_or(true) + { + return None; + } + + Some(offset) } } mod iter { - use crate::extension; - use crate::extension::Iter; - use crate::util::read_u32; + use crate::{extension, extension::Iter, util::read_u32}; impl<'a> Iter<'a> { pub fn new(data_at_beginning_of_extensions_and_truncated: &'a [u8]) -> Self { @@ -102,16 +100,3 @@ mod iter { pub struct Iter<'a> { data: &'a [u8], } - -pub struct EndOfIndexEntry { - /// The offset the the beginning of all extensions, or the end of all entries. 
- offset_to_extensions: u32, - /// The SHA1 checksum over the signature and size of all extensions. - checksum: git_hash::ObjectId, -} - -impl EndOfIndexEntry { - pub const SIGNATURE: Signature = *b"EOIE"; - pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); - pub const SIZE_WITH_HEADER: usize = crate::extension::MIN_SIZE + Self::SIZE; -} diff --git a/git-index/src/file.rs b/git-index/src/file.rs index 94c438d1aed..c5f73fd9344 100644 --- a/git-index/src/file.rs +++ b/git-index/src/file.rs @@ -42,7 +42,7 @@ pub mod init { }; let (version, num_entries, post_header_data) = header::decode(&data, object_hash)?; - let start_of_extensions = extension::EndOfIndexEntry::from_bytes(&data, object_hash); + let start_of_extensions = extension::end_of_index_entry::decode(&data, object_hash); Ok(File { state: State { timestamp: mtime }, diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index 3b0bac22f96..6bc6bfa7e82 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -7,7 +7,7 @@ use filetime::FileTime; pub mod file; -pub mod extension; +pub(crate) mod extension; pub mod init { use filetime::FileTime; From 8acd65b2cf5091edd4eddf8ece8c9622c0d05ab3 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 08:49:59 +0800 Subject: [PATCH 25/57] Write down some idea for a db system I want --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 10199d74fb4..a46098b026c 100644 --- a/README.md +++ b/README.md @@ -243,11 +243,17 @@ Provide a CLI to for the most basic user journey: as well. What would an MVP look like? Maybe even something that could ship with gitoxide. * [ ] A truly awesome history rewriter which makes it easy to understand what happened while avoiding all pitfalls. Think BFG, but more awesome, if that's possible. * [ ] `git-tui` should learn a lot from [fossil-scm] regarding the presentation of data. 
Maybe [this](https://github.com/Lutetium-Vanadium/requestty/) can be used for prompts. Probably [magit] has a lot to offer, too. -* [ ] Can markdown be used as database so issue-trackers along with meta-data could just be markdown files which are mostly human-editable? Could user interfaces - be meta-data aware and just hide the meta-data chunks which are now editable in the GUI itself? Doing this would make conflicts easier to resolve than an `sqlite` - database. - * ~~A git-backend for `sqlite` which should allow embedding sqlite databases into git repositories, which in turn can be used for bug-trackers, wikis or other - features, making for a fully distributed github like experience, maybe.~~ +* An event-based database that uses commit messages to store deltas, while occasionally aggregating the actual state in a tree. Of course it's distributed by nature, allowing + people to work offline. + - It's abstracted to completely hide the actual data model behind it, allowing for all kinds of things to be implemented on top. + - Commits probably need a nanosecond component for the timestamp, which can be added via custom header field. + - having recording all changes allows for perfect merging, both on the client or on the server, while keeping a natural audit log which makes it useful for mission critical + databases in business. + * Can markdown be used as database so issue-trackers along with meta-data could just be markdown files which are mostly human-editable? Could user interfaces + be meta-data aware and just hide the meta-data chunks which are now editable in the GUI itself? Doing this would make conflicts easier to resolve than an `sqlite` + database. 
+ * ~~A git-backend for `sqlite` which should allow embedding sqlite databases into git repositories, which in turn can be used for bug-trackers, wikis or other + features, making for a fully distributed github like experience, maybe.~~ ### Ideas for Spin-Offs From d4b3a07489703fb6d5e9b9fb9328741172826db9 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 09:00:11 +0800 Subject: [PATCH 26/57] refactor (#293) --- git-index/src/extension.rs | 2 +- git-index/src/file.rs | 65 ++-------------------------- git-index/src/lib.rs | 88 +++++++++++++++++++++++++++++++++++++- 3 files changed, 92 insertions(+), 63 deletions(-) diff --git a/git-index/src/extension.rs b/git-index/src/extension.rs index f620112f8ed..db5670c2b5a 100644 --- a/git-index/src/extension.rs +++ b/git-index/src/extension.rs @@ -11,7 +11,7 @@ fn decode_header(data: &[u8]) -> (Signature, u32, &[u8]) { } pub(crate) mod end_of_index_entry { - use crate::{extension, extension::Signature, file::header, util::read_u32}; + use crate::{extension, extension::Signature, header, util::read_u32}; pub const SIGNATURE: Signature = *b"EOIE"; pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); diff --git a/git-index/src/file.rs b/git-index/src/file.rs index c5f73fd9344..3b21bf19c48 100644 --- a/git-index/src/file.rs +++ b/git-index/src/file.rs @@ -5,13 +5,11 @@ pub mod init { use memmap2::Mmap; - use crate::{extension, file::header, File, State}; + use crate::{extension, File, State}; mod error { use quick_error::quick_error; - use crate::file::header; - quick_error! 
{ #[derive(Debug)] pub enum Error { @@ -20,8 +18,8 @@ pub mod init { source(err) from() } - DecodeHeader(err: header::decode::Error) { - display("The header could not be understood") + Decode(err: crate::decode::Error) { + display("The file could not be decoded") source(err) from() } @@ -41,65 +39,10 @@ pub mod init { (data, filetime::FileTime::from_last_modification_time(&file.metadata()?)) }; - let (version, num_entries, post_header_data) = header::decode(&data, object_hash)?; - let start_of_extensions = extension::end_of_index_entry::decode(&data, object_hash); - Ok(File { - state: State { timestamp: mtime }, + state: State::from_bytes(&data, mtime, object_hash)?, path, }) } } } - -pub mod header { - pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entries */; - - pub mod decode { - use quick_error::quick_error; - - quick_error! { - #[derive(Debug)] - pub enum Error { - Corrupt(message: &'static str) { - display("{}", message) - } - UnsupportedVersion(version: u32) { - display("Index version {} is not supported", version) - } - } - } - } - use crate::{util::read_u32, Version}; - - pub(crate) fn decode( - data: &[u8], - object_hash: git_hash::Kind, - ) -> Result<(crate::Version, u32, &[u8]), decode::Error> { - if data.len() < (3 * 4) + object_hash.len_in_bytes() { - return Err(decode::Error::Corrupt( - "File is too small even for header with zero entries and smallest hash", - )); - } - - const SIGNATURE: &[u8] = b"DIRC"; - let (signature, data) = data.split_at(4); - if signature != SIGNATURE { - return Err(decode::Error::Corrupt( - "Signature mismatch - this doesn't claim to be a header file", - )); - } - - let (version, data) = data.split_at(4); - let version = match read_u32(version) { - 2 => Version::V2, - 3 => Version::V3, - 4 => Version::V4, - unknown => return Err(decode::Error::UnsupportedVersion(unknown)), - }; - let (entries, data) = data.split_at(4); - let entries = read_u32(entries); - - Ok((version, entries, data)) - } -} diff 
--git a/git-index/src/lib.rs b/git-index/src/lib.rs index 6bc6bfa7e82..6756b60b2aa 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -12,7 +12,7 @@ pub(crate) mod extension; pub mod init { use filetime::FileTime; - use crate::State; + use crate::{State, Version}; impl State { /// Returns an empty state. @@ -20,6 +20,7 @@ pub mod init { fn new() -> Self { State { timestamp: FileTime::from_system_time(std::time::SystemTime::UNIX_EPOCH), + version: Version::V4, } } } @@ -31,6 +32,90 @@ pub mod init { } } +pub mod decode { + use crate::{extension, header, State}; + use filetime::FileTime; + + mod error { + use quick_error::quick_error; + + use crate::header; + + quick_error! { + #[derive(Debug)] + pub enum Error { + Header(err: header::decode::Error) { + display("The header could not be decoded") + source(err) + from() + } + } + } + } + pub use error::Error; + + impl State { + pub fn from_bytes(data: &[u8], timestamp: FileTime, object_hash: git_hash::Kind) -> Result { + let (version, num_entries, post_header_data) = header::decode(&data, object_hash)?; + let start_of_extensions = extension::end_of_index_entry::decode(&data, object_hash); + + Ok(State { timestamp, version }) + } + } +} + +pub mod header { + pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entries */; + + pub mod decode { + use quick_error::quick_error; + + quick_error! 
{ + #[derive(Debug)] + pub enum Error { + Corrupt(message: &'static str) { + display("{}", message) + } + UnsupportedVersion(version: u32) { + display("Index version {} is not supported", version) + } + } + } + } + use crate::{util::read_u32, Version}; + + pub(crate) fn decode( + data: &[u8], + object_hash: git_hash::Kind, + ) -> Result<(crate::Version, u32, &[u8]), decode::Error> { + if data.len() < (3 * 4) + object_hash.len_in_bytes() { + return Err(decode::Error::Corrupt( + "File is too small even for header with zero entries and smallest hash", + )); + } + + const SIGNATURE: &[u8] = b"DIRC"; + let (signature, data) = data.split_at(4); + if signature != SIGNATURE { + return Err(decode::Error::Corrupt( + "Signature mismatch - this doesn't claim to be a header file", + )); + } + + let (version, data) = data.split_at(4); + let version = match read_u32(version) { + 2 => Version::V2, + 3 => Version::V3, + 4 => Version::V4, + unknown => return Err(decode::Error::UnsupportedVersion(unknown)), + }; + let (entries, data) = data.split_at(4); + let entries = read_u32(entries); + + Ok((version, entries, data)) + } +} + /// All known versions of a git index file. #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] @@ -57,6 +142,7 @@ pub struct State { /// Note that on platforms that only have a precisions of a second for this time, we will treat all entries with the /// same timestamp as this as potentially changed, checking more thoroughly if a change actually happened. 
timestamp: FileTime, + version: Version, } pub(crate) mod util { From c17240d0cbd6134a77a69359611789f4eebc727d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 09:24:39 +0800 Subject: [PATCH 27/57] the first actual assertion (#293) --- git-index/src/file.rs | 19 +++++++++++++++++++ git-index/src/lib.rs | 10 ++++++++++ git-index/tests/file/mod.rs | 5 ++++- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/git-index/src/file.rs b/git-index/src/file.rs index 3b21bf19c48..b8aa6a84ff5 100644 --- a/git-index/src/file.rs +++ b/git-index/src/file.rs @@ -1,3 +1,22 @@ +mod impls { + use crate::{File, State}; + use std::ops::{Deref, DerefMut}; + + impl Deref for File { + type Target = State; + + fn deref(&self) -> &Self::Target { + &self.state + } + } + + impl DerefMut for File { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.state + } + } +} + pub mod init { #![allow(unused)] diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index 6756b60b2aa..e37dd80940d 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -9,6 +9,16 @@ pub mod file; pub(crate) mod extension; +mod access { + use crate::{State, Version}; + + impl State { + pub fn version(&self) -> Version { + self.version + } + } +} + pub mod init { use filetime::FileTime; diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index 53bb122796f..4142feb68de 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -1,10 +1,13 @@ mod init { + use git_index::Version; + fn file(name: &str) -> git_index::File { git_index::File::at(crate::index_fixture_path(name), git_hash::Kind::Sha1).unwrap() } #[test] fn read_v2() { - let _file = file("v2"); + let file = file("v2"); + assert_eq!(file.version(), Version::V2); } } From 07e8fb2cb6b7819eb34676ede57808b845298674 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 09:27:24 +0800 Subject: [PATCH 28/57] refactor (#293) --- git-index/src/decode.rs | 80
++++++++++++++++++++++++++++++++++++ git-index/src/extension.rs | 2 +- git-index/src/lib.rs | 84 +------------------------------------- 3 files changed, 82 insertions(+), 84 deletions(-) create mode 100644 git-index/src/decode.rs diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs new file mode 100644 index 00000000000..0b2c7683b7a --- /dev/null +++ b/git-index/src/decode.rs @@ -0,0 +1,80 @@ +use crate::{extension, State}; +use filetime::FileTime; + +pub mod header { + pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entries */; + + mod error { + use quick_error::quick_error; + + quick_error! { + #[derive(Debug)] + pub enum Error { + Corrupt(message: &'static str) { + display("{}", message) + } + UnsupportedVersion(version: u32) { + display("Index version {} is not supported", version) + } + } + } + } + use crate::{util::read_u32, Version}; + pub use error::Error; + + pub(crate) fn decode(data: &[u8], object_hash: git_hash::Kind) -> Result<(crate::Version, u32, &[u8]), Error> { + if data.len() < (3 * 4) + object_hash.len_in_bytes() { + return Err(Error::Corrupt( + "File is too small even for header with zero entries and smallest hash", + )); + } + + const SIGNATURE: &[u8] = b"DIRC"; + let (signature, data) = data.split_at(4); + if signature != SIGNATURE { + return Err(Error::Corrupt( + "Signature mismatch - this doesn't claim to be a header file", + )); + } + + let (version, data) = data.split_at(4); + let version = match read_u32(version) { + 2 => Version::V2, + 3 => Version::V3, + 4 => Version::V4, + unknown => return Err(Error::UnsupportedVersion(unknown)), + }; + let (entries, data) = data.split_at(4); + let entries = read_u32(entries); + + Ok((version, entries, data)) + } +} + +mod error { + use quick_error::quick_error; + + use crate::decode; + + quick_error! 
{ + #[derive(Debug)] + pub enum Error { + Header(err: decode::header::Error) { + display("The header could not be decoded") + source(err) + from() + } + } + } +} + +pub use error::Error; + +impl State { + pub fn from_bytes(data: &[u8], timestamp: FileTime, object_hash: git_hash::Kind) -> Result { + let (version, num_entries, post_header_data) = header::decode(&data, object_hash)?; + let start_of_extensions = extension::end_of_index_entry::decode(&data, object_hash); + + Ok(State { timestamp, version }) + } +} diff --git a/git-index/src/extension.rs b/git-index/src/extension.rs index db5670c2b5a..253cea210f5 100644 --- a/git-index/src/extension.rs +++ b/git-index/src/extension.rs @@ -11,7 +11,7 @@ fn decode_header(data: &[u8]) -> (Signature, u32, &[u8]) { } pub(crate) mod end_of_index_entry { - use crate::{extension, extension::Signature, header, util::read_u32}; + use crate::{decode::header, extension, extension::Signature, util::read_u32}; pub const SIGNATURE: Signature = *b"EOIE"; pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index e37dd80940d..9fdb51dd1fe 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -42,89 +42,7 @@ pub mod init { } } -pub mod decode { - use crate::{extension, header, State}; - use filetime::FileTime; - - mod error { - use quick_error::quick_error; - - use crate::header; - - quick_error! 
{ - #[derive(Debug)] - pub enum Error { - Header(err: header::decode::Error) { - display("The header could not be decoded") - source(err) - from() - } - } - } - } - pub use error::Error; - - impl State { - pub fn from_bytes(data: &[u8], timestamp: FileTime, object_hash: git_hash::Kind) -> Result { - let (version, num_entries, post_header_data) = header::decode(&data, object_hash)?; - let start_of_extensions = extension::end_of_index_entry::decode(&data, object_hash); - - Ok(State { timestamp, version }) - } - } -} - -pub mod header { - pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entries */; - - pub mod decode { - use quick_error::quick_error; - - quick_error! { - #[derive(Debug)] - pub enum Error { - Corrupt(message: &'static str) { - display("{}", message) - } - UnsupportedVersion(version: u32) { - display("Index version {} is not supported", version) - } - } - } - } - use crate::{util::read_u32, Version}; - - pub(crate) fn decode( - data: &[u8], - object_hash: git_hash::Kind, - ) -> Result<(crate::Version, u32, &[u8]), decode::Error> { - if data.len() < (3 * 4) + object_hash.len_in_bytes() { - return Err(decode::Error::Corrupt( - "File is too small even for header with zero entries and smallest hash", - )); - } - - const SIGNATURE: &[u8] = b"DIRC"; - let (signature, data) = data.split_at(4); - if signature != SIGNATURE { - return Err(decode::Error::Corrupt( - "Signature mismatch - this doesn't claim to be a header file", - )); - } - - let (version, data) = data.split_at(4); - let version = match read_u32(version) { - 2 => Version::V2, - 3 => Version::V3, - 4 => Version::V4, - unknown => return Err(decode::Error::UnsupportedVersion(unknown)), - }; - let (entries, data) = data.split_at(4); - let entries = read_u32(entries); - - Ok((version, entries, data)) - } -} +pub mod decode; /// All known versions of a git index file. 
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] From 49fcb6f6ae9d6ed47e7c0c3ea2aa644d4e8cd264 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 13:48:53 +0800 Subject: [PATCH 29/57] Get closer to implementing a simple TREE extension decoding (#293) --- Cargo.lock | 1 + README.md | 23 +++++++++-------- git-index/Cargo.toml | 1 + git-index/src/decode.rs | 18 ++++++++++++- .../src/{extension.rs => extension/mod.rs} | 20 +++++++++++++++ git-index/src/extension/tree.rs | 25 +++++++++++++++++++ git-index/tests/file/mod.rs | 2 +- 7 files changed, 77 insertions(+), 13 deletions(-) rename git-index/src/{extension.rs => extension/mod.rs} (82%) create mode 100644 git-index/src/extension/tree.rs diff --git a/Cargo.lock b/Cargo.lock index 898674d5745..8a08de8bdf3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1234,6 +1234,7 @@ dependencies = [ "memmap2", "quick-error", "serde", + "smallvec", ] [[package]] diff --git a/README.md b/README.md index a46098b026c..56a72eccddc 100644 --- a/README.md +++ b/README.md @@ -243,17 +243,6 @@ Provide a CLI to for the most basic user journey: as well. What would an MVP look like? Maybe even something that could ship with gitoxide. * [ ] A truly awesome history rewriter which makes it easy to understand what happened while avoiding all pitfalls. Think BFG, but more awesome, if that's possible. * [ ] `git-tui` should learn a lot from [fossil-scm] regarding the presentation of data. Maybe [this](https://github.com/Lutetium-Vanadium/requestty/) can be used for prompts. Probably [magit] has a lot to offer, too. -* An event-based database that uses commit messages to store deltas, while occasionally aggregating the actual state in a tree. Of course it's distributed by nature, allowing - people to work offline. - - It's abstracted to completely hide the actual data model behind it, allowing for all kinds of things to be implemented on top. 
- - Commits probably need a nanosecond component for the timestamp, which can be added via custom header field. - - having recording all changes allows for perfect merging, both on the client or on the server, while keeping a natural audit log which makes it useful for mission critical - databases in business. - * Can markdown be used as database so issue-trackers along with meta-data could just be markdown files which are mostly human-editable? Could user interfaces - be meta-data aware and just hide the meta-data chunks which are now editable in the GUI itself? Doing this would make conflicts easier to resolve than an `sqlite` - database. - * ~~A git-backend for `sqlite` which should allow embedding sqlite databases into git repositories, which in turn can be used for bug-trackers, wikis or other - features, making for a fully distributed github like experience, maybe.~~ ### Ideas for Spin-Offs @@ -265,6 +254,18 @@ Provide a CLI to for the most basic user journey: * [ ] A [syncthing] like client/server application. This is to demonstrate how lower-level crates can be combined into custom applications that use only part of git's technology to achieve their very own thing. Watch out for big file support, multi-device cross-syncing, the possibility for untrusted destinations using full-encryption, case-insensitive and sensitive filesystems, and extended file attributes as well as ignore files. +* An event-based database that uses commit messages to store deltas, while occasionally aggregating the actual state in a tree. Of course it's distributed by nature, allowing + people to work offline. + - It's abstracted to completely hide the actual data model behind it, allowing for all kinds of things to be implemented on top. + - Commits probably need a nanosecond component for the timestamp, which can be added via custom header field. 
+ - having recording all changes allows for perfect merging, both on the client or on the server, while keeping a natural audit log which makes it useful for mission critical + databases in business. + * **Applications** + - Can markdown be used as database so issue-trackers along with meta-data could just be markdown files which are mostly human-editable? Could user interfaces + be meta-data aware and just hide the meta-data chunks which are now editable in the GUI itself? Doing this would make conflicts easier to resolve than an `sqlite` + database. + - A time tracker - simple data, very likely naturally conflict free, and interesting to see it in terms of teams or companies using it with maybe GitHub as Backing for authentication. + - How about supporting multiple different trackers, as in different remotes? [syncthing]: https://github.com/syncthing/syncthing [fossil-scm]: https://www.fossil-scm.org diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index 7bc7bd2a76c..5b109fb758c 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -24,6 +24,7 @@ memmap2 = "0.5.0" filetime = "0.2.15" serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } +smallvec = "1.7.0" [dev-dependencies] git-testtools = { path = "../tests/tools"} diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index 0b2c7683b7a..10db2bc70d4 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -67,13 +67,29 @@ mod error { } } } - pub use error::Error; impl State { pub fn from_bytes(data: &[u8], timestamp: FileTime, object_hash: git_hash::Kind) -> Result { let (version, num_entries, post_header_data) = header::decode(&data, object_hash)?; let start_of_extensions = extension::end_of_index_entry::decode(&data, object_hash); + match start_of_extensions { + Some(offset) => { + let extensions = extension::Iter::new_without_checksum(&data[offset..], object_hash); + for (signature, ext_data) in extensions { + match signature 
{ + extension::tree::SIGNATURE => { + let tree = extension::tree::decode(ext_data, object_hash); + todo!("put tree somewhere") + } + extension::end_of_index_entry::SIGNATURE => {} // skip already done + _unknown => {} // skip unknown extensions, too + } + } + todo!("load all extensions in thread, then get IEOT, then possibly multi-threaded entry parsing") + } + None => todo!("load entries singlge-threaded, then extensions"), + } Ok(State { timestamp, version }) } diff --git a/git-index/src/extension.rs b/git-index/src/extension/mod.rs similarity index 82% rename from git-index/src/extension.rs rename to git-index/src/extension/mod.rs index 253cea210f5..e5b3aaa5ced 100644 --- a/git-index/src/extension.rs +++ b/git-index/src/extension/mod.rs @@ -1,4 +1,5 @@ use crate::{util::read_u32, Version}; +use smallvec::SmallVec; const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; @@ -10,6 +11,15 @@ fn decode_header(data: &[u8]) -> (Signature, u32, &[u8]) { (signature.try_into().unwrap(), read_u32(size), data) } +pub struct Tree { + /// Only set if there are any entries in the index we are associated with. 
+ id: Option, + name: SmallVec<[u8; 23]>, + children: Vec, +} + +pub(crate) mod tree; + pub(crate) mod end_of_index_entry { use crate::{decode::header, extension, extension::Signature, util::read_u32}; @@ -69,6 +79,16 @@ mod iter { data: data_at_beginning_of_extensions_and_truncated, } } + + pub fn new_without_checksum(data_at_beginning_of_extensions: &'a [u8], object_hash: git_hash::Kind) -> Self { + let end = data_at_beginning_of_extensions + .len() + .checked_sub(object_hash.len_in_bytes()) + .expect("someone asserted that there is at least one extension"); + Iter { + data: &data_at_beginning_of_extensions[..end], + } + } } impl<'a> Iterator for Iter<'a> { diff --git a/git-index/src/extension/tree.rs b/git-index/src/extension/tree.rs new file mode 100644 index 00000000000..bb274c616eb --- /dev/null +++ b/git-index/src/extension/tree.rs @@ -0,0 +1,25 @@ +use crate::extension::{Signature, Tree}; + +pub const SIGNATURE: Signature = *b"TREE"; + +pub struct NodeId { + /// The id of the directory tree of the associated tree object. + id: git_hash::ObjectId, + /// The amount of non-tree entries contained within, and definitely not zero. 
+ entry_count: u32, +} + +/// A recursive data structure +pub fn decode(data: &[u8], object_hash: git_hash::Kind) -> Option { + todo!("decode tree") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn size_of_tree() { + assert_eq!(std::mem::size_of::(), 88); + } +} diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index 4142feb68de..dc6ea0233b5 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -6,7 +6,7 @@ mod init { } #[test] - fn read_v2() { + fn read_v2_with_single_entry_tree() { let file = file("v2"); assert_eq!(file.version(), Version::V2); } From a2ea49841a333c8af18fd258781a649214a0ae0b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 14:48:01 +0800 Subject: [PATCH 30/57] parse TREE chunk (#293) For now the data structure is just 'as-written' and we see what needs to change there as we have to maintain it. --- Cargo.lock | 10 +++++++ git-index/Cargo.toml | 1 + git-index/src/decode.rs | 12 ++++++--- git-index/src/extension/mod.rs | 6 ++++- git-index/src/extension/tree.rs | 47 ++++++++++++++++++++++++++++++++- git-index/src/lib.rs | 21 ++++++++++++++- git-index/tests/file/mod.rs | 1 + 7 files changed, 92 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8a08de8bdf3..fede8097e30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -207,6 +207,15 @@ dependencies = [ "syn", ] +[[package]] +name = "atoi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "616896e05fc0e2649463a93a15183c6a16bf03413a7af88ef1285ddedfa9cda5" +dependencies = [ + "num-traits", +] + [[package]] name = "atomic-waker" version = "1.0.0" @@ -1227,6 +1236,7 @@ dependencies = [ name = "git-index" version = "0.0.0" dependencies = [ + "atoi", "filetime", "git-features 0.18.0", "git-hash 0.8.0", diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index 5b109fb758c..52796eaf7a7 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -25,6 +25,7 
@@ filetime = "0.2.15" serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } smallvec = "1.7.0" +atoi = "0.4.0" [dev-dependencies] git-testtools = { path = "../tests/tools"} diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index 10db2bc70d4..7d7a5e22318 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -73,14 +73,16 @@ impl State { pub fn from_bytes(data: &[u8], timestamp: FileTime, object_hash: git_hash::Kind) -> Result { let (version, num_entries, post_header_data) = header::decode(&data, object_hash)?; let start_of_extensions = extension::end_of_index_entry::decode(&data, object_hash); + let mut cache_tree = None; + + // Note that we ignore all errors for optional signatures. match start_of_extensions { Some(offset) => { let extensions = extension::Iter::new_without_checksum(&data[offset..], object_hash); for (signature, ext_data) in extensions { match signature { extension::tree::SIGNATURE => { - let tree = extension::tree::decode(ext_data, object_hash); - todo!("put tree somewhere") + cache_tree = extension::tree::decode(ext_data, object_hash); } extension::end_of_index_entry::SIGNATURE => {} // skip already done _unknown => {} // skip unknown extensions, too @@ -91,6 +93,10 @@ impl State { None => todo!("load entries singlge-threaded, then extensions"), } - Ok(State { timestamp, version }) + Ok(State { + timestamp, + version, + cache_tree, + }) } } diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index e5b3aaa5ced..66dc514cdb4 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -11,10 +11,14 @@ fn decode_header(data: &[u8]) -> (Signature, u32, &[u8]) { (signature.try_into().unwrap(), read_u32(size), data) } +/// A structure to associate object ids of a tree with sections in the index entries list. 
+/// +/// It allows trees to be built more quickly, as it can re-use portions of the index and their associated tree ids +/// if there was no change to them. Portions of this tree are invalidated as the index is changed. pub struct Tree { + name: SmallVec<[u8; 23]>, /// Only set if there are any entries in the index we are associated with. id: Option, - name: SmallVec<[u8; 23]>, children: Vec, } diff --git a/git-index/src/extension/tree.rs b/git-index/src/extension/tree.rs index bb274c616eb..a1e39ebc550 100644 --- a/git-index/src/extension/tree.rs +++ b/git-index/src/extension/tree.rs @@ -1,4 +1,6 @@ use crate::extension::{Signature, Tree}; +use crate::util::split_at_byte_exclusive; +use git_hash::ObjectId; pub const SIGNATURE: Signature = *b"TREE"; @@ -11,7 +13,50 @@ pub struct NodeId { /// A recursive data structure pub fn decode(data: &[u8], object_hash: git_hash::Kind) -> Option { - todo!("decode tree") + let (tree, data) = one_recursive(data, object_hash.len_in_bytes())?; + assert!( + data.is_empty(), + "BUG: should fully consume the entire tree extension chunk, got {} left", + data.len() + ); + Some(tree) +} + +pub fn one_recursive(data: &[u8], hash_len: usize) -> Option<(Tree, &[u8])> { + let (path, data) = split_at_byte_exclusive(data, 0)?; + + let (entry_count, data) = split_at_byte_exclusive(data, b' ')?; + let entry_count: u32 = atoi::atoi(entry_count)?; + + let (subtree_count, mut data) = split_at_byte_exclusive(data, b'\n')?; + let subtree_count: usize = atoi::atoi(subtree_count)?; + + let node_id = (entry_count != 0) + .then(|| { + (data.len() >= hash_len).then(|| { + let (hash, rest) = data.split_at(hash_len); + data = rest; + ObjectId::from(hash) + }) + }) + .flatten() + .map(|id| NodeId { id, entry_count }); + + let mut subtrees = Vec::with_capacity(subtree_count); + for _ in 0..subtree_count { + let (tree, rest) = one_recursive(data, hash_len)?; + subtrees.push(tree); + data = rest; + } + + Some(( + Tree { + id: node_id, + name:
path.into(), + children: subtrees, + }, + data, + )) } #[cfg(test)] diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index 9fdb51dd1fe..c6ceaefd494 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -30,7 +30,8 @@ pub mod init { fn new() -> Self { State { timestamp: FileTime::from_system_time(std::time::SystemTime::UNIX_EPOCH), - version: Version::V4, + version: Version::V3, + cache_tree: None, } } } @@ -71,6 +72,7 @@ pub struct State { /// same timestamp as this as potentially changed, checking more thoroughly if a change actually happened. timestamp: FileTime, version: Version, + pub cache_tree: Option, } pub(crate) mod util { @@ -78,4 +80,21 @@ pub(crate) mod util { pub fn read_u32(b: &[u8]) -> u32 { u32::from_be_bytes(b.try_into().unwrap()) } + + #[inline] + pub fn split_at_byte_exclusive(data: &[u8], byte: u8) -> Option<(&[u8], &[u8])> { + if data.len() < 2 { + return None; + } + data.iter().enumerate().find_map(|(idx, b)| { + (*b == byte).then(|| { + if idx == 0 { + (&[] as &[u8], &data[1..]) + } else { + let (a, b) = data.split_at(idx); + (a, &b[1..]) + } + }) + }) + } } diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index dc6ea0233b5..ef252dc2bbd 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -6,6 +6,7 @@ mod init { } #[test] + #[ignore] fn read_v2_with_single_entry_tree() { let file = file("v2"); assert_eq!(file.version(), Version::V2); From e7e067977ef440cf3edb8812c0d614b5d8213b58 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 14:58:07 +0800 Subject: [PATCH 31/57] Prepare a more complex test for tree parsing, requires entry parsing (#293) --- git-index/src/decode.rs | 36 ++++++++++++------- git-index/tests/file/mod.rs | 9 ++++- .../fixtures/make_index/v2_more_files.sh | 13 +++++++ 3 files changed, 45 insertions(+), 13 deletions(-) create mode 100644 git-index/tests/fixtures/make_index/v2_more_files.sh diff --git a/git-index/src/decode.rs 
b/git-index/src/decode.rs index 7d7a5e22318..23c4596b106 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -1,5 +1,6 @@ use crate::{extension, State}; use filetime::FileTime; +use git_hash::Kind; pub mod header { pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entries */; @@ -73,21 +74,12 @@ impl State { pub fn from_bytes(data: &[u8], timestamp: FileTime, object_hash: git_hash::Kind) -> Result { let (version, num_entries, post_header_data) = header::decode(&data, object_hash)?; let start_of_extensions = extension::end_of_index_entry::decode(&data, object_hash); - let mut cache_tree = None; + let mut ext = Extensions::default(); // Note that we ignore all errors for optional signatures. match start_of_extensions { Some(offset) => { - let extensions = extension::Iter::new_without_checksum(&data[offset..], object_hash); - for (signature, ext_data) in extensions { - match signature { - extension::tree::SIGNATURE => { - cache_tree = extension::tree::decode(ext_data, object_hash); - } - extension::end_of_index_entry::SIGNATURE => {} // skip already done - _unknown => {} // skip unknown extensions, too - } - } + ext = load_extensions(&data[offset..], object_hash); todo!("load all extensions in thread, then get IEOT, then possibly multi-threaded entry parsing") } None => todo!("load entries singlge-threaded, then extensions"), @@ -96,7 +88,27 @@ impl State { Ok(State { timestamp, version, - cache_tree, + cache_tree: ext.cache_tree, }) } } + +fn load_extensions(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> Extensions { + let extensions = extension::Iter::new_without_checksum(beginning_of_extensions, object_hash); + let mut ext = Extensions::default(); + for (signature, ext_data) in extensions { + match signature { + extension::tree::SIGNATURE => { + ext.cache_tree = extension::tree::decode(ext_data, object_hash); + } + extension::end_of_index_entry::SIGNATURE => {} // skip already done + _unknown => {} // skip 
unknown extensions, too + } + } + ext +} + +#[derive(Default)] +struct Extensions { + cache_tree: Option, +} diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index ef252dc2bbd..0c46fdce08d 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -7,8 +7,15 @@ mod init { #[test] #[ignore] - fn read_v2_with_single_entry_tree() { + fn read_v2_with_single_entry_tree_and_eoie_ext() { let file = file("v2"); assert_eq!(file.version(), Version::V2); } + + #[test] + #[ignore] + fn read_v2_with_multiple_entries_without_eoie_ext() { + let file = file("v2_more_files"); + assert_eq!(file.version(), Version::V2); + } } diff --git a/git-index/tests/fixtures/make_index/v2_more_files.sh b/git-index/tests/fixtures/make_index/v2_more_files.sh new file mode 100644 index 00000000000..a43ec62203c --- /dev/null +++ b/git-index/tests/fixtures/make_index/v2_more_files.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu -o pipefail + +GIT_INDEX_VERSION=2 git init -q +git config commit.gpgsign false +git config index.threads 1 + +touch a b c +mkdir d +(cd d && touch a b c) + +git add . 
+git commit -m "empty" From 552602074a99dc536624f0c6295e807caf32f58b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 14:58:43 +0800 Subject: [PATCH 32/57] thanks clippy --- etc/check-package-size.sh | 2 +- git-index/src/decode.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/etc/check-package-size.sh b/etc/check-package-size.sh index 6e02a56a902..a4029192647 100755 --- a/etc/check-package-size.sh +++ b/etc/check-package-size.sh @@ -18,7 +18,7 @@ echo "in root: gitoxide CLI" #indent cargo diet -n --package-size-limit 25KB - fails right now because of dotted profile.dev.package (enter cargo-smart-release && indent cargo diet -n --package-size-limit 85KB) (enter git-actor && indent cargo diet -n --package-size-limit 5KB) -(enter git-index && indent cargo diet -n --package-size-limit 5KB) +(enter git-index && indent cargo diet -n --package-size-limit 10KB) (enter git-tempfile && indent cargo diet -n --package-size-limit 25KB) (enter git-lock && indent cargo diet -n --package-size-limit 15KB) (enter git-config && indent cargo diet -n --package-size-limit 65KB) diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index 23c4596b106..de58b235a60 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -72,8 +72,8 @@ pub use error::Error; impl State { pub fn from_bytes(data: &[u8], timestamp: FileTime, object_hash: git_hash::Kind) -> Result { - let (version, num_entries, post_header_data) = header::decode(&data, object_hash)?; - let start_of_extensions = extension::end_of_index_entry::decode(&data, object_hash); + let (version, num_entries, post_header_data) = header::decode(data, object_hash)?; + let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash); let mut ext = Extensions::default(); // Note that we ignore all errors for optional signatures. 
From 620d2e6bd4ef6d3281c096aaf344669bcf49e723 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 16:42:30 +0800 Subject: [PATCH 33/57] Extensions are optional, and so is their iteration (#293) --- git-index/src/decode.rs | 45 +++++++++++++++++++++++----------- git-index/src/extension/mod.rs | 9 ++++--- git-index/src/lib.rs | 3 +++ git-index/tests/file/mod.rs | 4 +++ 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index de58b235a60..2fa60c36fdb 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -1,4 +1,4 @@ -use crate::{extension, State}; +use crate::{extension, Entry, State, Version}; use filetime::FileTime; use git_hash::Kind; @@ -79,10 +79,15 @@ impl State { // Note that we ignore all errors for optional signatures. match start_of_extensions { Some(offset) => { - ext = load_extensions(&data[offset..], object_hash); + let (ext, entries) = + git_features::parallel::join(|| load_extensions(&data[offset..], object_hash), || ()); todo!("load all extensions in thread, then get IEOT, then possibly multi-threaded entry parsing") } - None => todo!("load entries singlge-threaded, then extensions"), + None => { + let (entries, data) = load_entries(version, data, num_entries, object_hash)?; + let ext = load_extensions(data, object_hash); + todo!("load entries singlge-threaded, then extensions") + } } Ok(State { @@ -93,19 +98,31 @@ impl State { } } +fn load_entries( + version: Version, + beginning_of_entries: &[u8], + num_entries: u32, + object_hash: git_hash::Kind, +) -> Result<(Vec, &[u8]), Error> { + todo!("load entries") +} + fn load_extensions(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> Extensions { - let extensions = extension::Iter::new_without_checksum(beginning_of_extensions, object_hash); - let mut ext = Extensions::default(); - for (signature, ext_data) in extensions { - match signature { - extension::tree::SIGNATURE => { - ext.cache_tree = 
extension::tree::decode(ext_data, object_hash); + extension::Iter::new_without_checksum(beginning_of_extensions, object_hash) + .map(|extensions| { + let mut ext = Extensions::default(); + for (signature, ext_data) in extensions { + match signature { + extension::tree::SIGNATURE => { + ext.cache_tree = extension::tree::decode(ext_data, object_hash); + } + extension::end_of_index_entry::SIGNATURE => {} // skip already done + _unknown => {} // skip unknown extensions, too + } } - extension::end_of_index_entry::SIGNATURE => {} // skip already done - _unknown => {} // skip unknown extensions, too - } - } - ext + ext + }) + .unwrap_or_default() } #[derive(Default)] diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index 66dc514cdb4..a43e23f7b90 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -84,14 +84,17 @@ mod iter { } } - pub fn new_without_checksum(data_at_beginning_of_extensions: &'a [u8], object_hash: git_hash::Kind) -> Self { + pub fn new_without_checksum( + data_at_beginning_of_extensions: &'a [u8], + object_hash: git_hash::Kind, + ) -> Option { let end = data_at_beginning_of_extensions .len() - .checked_sub(object_hash.len_in_bytes()) - .expect("someone asserted that there is at least one extension"); + .checked_sub(object_hash.len_in_bytes())?; Iter { data: &data_at_beginning_of_extensions[..end], } + .into() } } diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index c6ceaefd494..9071479ae73 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -55,6 +55,9 @@ pub enum Version { V4 = 4, } +/// An entry in the index, identifying a non-tree item on disk. +pub struct Entry; + /// An index file whose state was read from a file on disk. 
pub struct File { pub state: State, diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index 0c46fdce08d..f45590a3dcf 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -18,4 +18,8 @@ mod init { let file = file("v2_more_files"); assert_eq!(file.version(), Version::V2); } + + #[test] + #[ignore] + fn read_without_any_extension() {} } From 53e2d754262d9752d3b106f7991543986ad5426f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 20:43:11 +0800 Subject: [PATCH 34/57] Most of the entry decoding, name is still missing (#293) --- git-index/Cargo.toml | 1 + git-index/src/decode.rs | 88 +++++++++++++++++++++++++++++++--- git-index/src/extension/mod.rs | 12 ++--- git-index/src/lib.rs | 42 +++++++++++++++- git-index/tests/file/mod.rs | 4 ++ git-index/tests/index.rs | 9 ++++ 6 files changed, 141 insertions(+), 15 deletions(-) diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index 52796eaf7a7..ce7a54d2a8a 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -9,6 +9,7 @@ edition = "2021" [lib] doctest = false +test = true [features] serde1 = ["serde"] diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index 2fa60c36fdb..b5fabb40e47 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -1,4 +1,4 @@ -use crate::{extension, Entry, State, Version}; +use crate::{entry, extension, Entry, State, Version}; use filetime::FileTime; use git_hash::Kind; @@ -20,7 +20,7 @@ pub mod header { } } } - use crate::{util::read_u32, Version}; + use crate::{util::from_be_u32, Version}; pub use error::Error; pub(crate) fn decode(data: &[u8], object_hash: git_hash::Kind) -> Result<(crate::Version, u32, &[u8]), Error> { @@ -39,14 +39,14 @@ pub mod header { } let (version, data) = data.split_at(4); - let version = match read_u32(version) { + let version = match from_be_u32(version) { 2 => Version::V2, 3 => Version::V3, 4 => Version::V4, unknown => return 
Err(Error::UnsupportedVersion(unknown)), }; let (entries, data) = data.split_at(4); - let entries = read_u32(entries); + let entries = from_be_u32(entries); Ok((version, entries, data)) } @@ -68,6 +68,7 @@ mod error { } } } +use crate::util::{from_be_u32, split_at_pos}; pub use error::Error; impl State { @@ -84,7 +85,7 @@ impl State { todo!("load all extensions in thread, then get IEOT, then possibly multi-threaded entry parsing") } None => { - let (entries, data) = load_entries(version, data, num_entries, object_hash)?; + let (entries, data) = load_entries(data, num_entries, object_hash, version)?; let ext = load_extensions(data, object_hash); todo!("load entries singlge-threaded, then extensions") } @@ -98,15 +99,88 @@ impl State { } } +mod load_entries { + use crate::Entry; + + pub struct Outcome { + entries: Vec, + /// A memory area keeping all index paths, in full length, independently of the index version. + path_backing: Vec, + /// True if one entry in the index has a special marker mode + is_sparse: bool, + } +} + fn load_entries( - version: Version, beginning_of_entries: &[u8], num_entries: u32, object_hash: git_hash::Kind, -) -> Result<(Vec, &[u8]), Error> { + version: Version, +) -> Result<(load_entries::Outcome, &[u8]), Error> { todo!("load entries") } +fn decode_entry(data: &[u8], hash_len: usize) -> Option { + let (ctime_secs, data) = read_u32(data)?; + let (ctime_nsecs, data) = read_u32(data)?; + let (mtime_secs, data) = read_u32(data)?; + let (mtime_nsecs, data) = read_u32(data)?; + let (dev, data) = read_u32(data)?; + let (ino, data) = read_u32(data)?; + let (mode, data) = read_u32(data)?; + let (uid, data) = read_u32(data)?; + let (gid, data) = read_u32(data)?; + let (size, data) = read_u32(data)?; + let (hash, data) = split_at_pos(data, hash_len)?; + let (flags, data) = read_u16(data)?; + let flags = flags as u32; + let (flags, data) = if flags & entry::flags::EXTENDED == entry::flags::EXTENDED { + let (mut extended_flags, data) = 
read_u16(data)?; + let extended_flags: u32 = (extended_flags as u32) << 16; + const ALL_KNOWN_EXTENDED_FLAGS: u32 = entry::flags::INTENT_TO_ADD | entry::flags::SKIP_WORKTREE; + assert_eq!( + extended_flags & !ALL_KNOWN_EXTENDED_FLAGS, + 0, + "BUG: encountered unknown extended bitflags in {:b}", + extended_flags + ); + (flags | extended_flags, data) + } else { + (flags, data) + }; + + Entry { + stat: entry::Stat { + ctime: entry::Time { + secs: ctime_secs, + nsecs: ctime_nsecs, + }, + mtime: entry::Time { + secs: mtime_secs, + nsecs: mtime_nsecs, + }, + dev, + ino, + mode, + uid, + gid, + size, + }, + id: git_hash::ObjectId::from(hash), + flags: flags & !entry::mask::PATH_LEN, + } + .into() +} + +#[inline] +fn read_u32(data: &[u8]) -> Option<(u32, &[u8])> { + split_at_pos(data, 4).map(|(num, data)| (u32::from_be_bytes(num.try_into().unwrap()), data)) +} +#[inline] +fn read_u16(data: &[u8]) -> Option<(u16, &[u8])> { + split_at_pos(data, 4).map(|(num, data)| (u16::from_be_bytes(num.try_into().unwrap()), data)) +} + fn load_extensions(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> Extensions { extension::Iter::new_without_checksum(beginning_of_extensions, object_hash) .map(|extensions| { diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index a43e23f7b90..e06bc9ee662 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -1,4 +1,4 @@ -use crate::{util::read_u32, Version}; +use crate::{util::from_be_u32, Version}; use smallvec::SmallVec; const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; @@ -8,7 +8,7 @@ pub type Signature = [u8; 4]; fn decode_header(data: &[u8]) -> (Signature, u32, &[u8]) { let (signature, data) = data.split_at(4); let (size, data) = data.split_at(4); - (signature.try_into().unwrap(), read_u32(size), data) + (signature.try_into().unwrap(), from_be_u32(size), data) } /// A structure to associate object ids of a tree with sections in the index entries list. 
@@ -25,7 +25,7 @@ pub struct Tree { pub(crate) mod tree; pub(crate) mod end_of_index_entry { - use crate::{decode::header, extension, extension::Signature, util::read_u32}; + use crate::{decode::header, extension, extension::Signature, util::from_be_u32}; pub const SIGNATURE: Signature = *b"EOIE"; pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); @@ -46,7 +46,7 @@ pub(crate) mod end_of_index_entry { } let (offset, checksum) = ext_data.split_at(4); - let offset = read_u32(offset) as usize; + let offset = from_be_u32(offset) as usize; if offset < header::SIZE || offset > start_of_eoie || checksum.len() != git_hash::Kind::Sha1.len_in_bytes() { return None; } @@ -75,7 +75,7 @@ pub(crate) mod end_of_index_entry { } mod iter { - use crate::{extension, extension::Iter, util::read_u32}; + use crate::{extension, extension::Iter, util::from_be_u32}; impl<'a> Iter<'a> { pub fn new(data_at_beginning_of_extensions_and_truncated: &'a [u8]) -> Self { @@ -108,7 +108,7 @@ mod iter { let (signature, data) = self.data.split_at(4); let (size, data) = data.split_at(4); - let size = read_u32(size) as usize; + let size = from_be_u32(size) as usize; match data.get(..size) { Some(ext_data) => { diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index 9071479ae73..0800acd71da 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -55,8 +55,38 @@ pub enum Version { V4 = 4, } +pub mod entry { + pub(crate) mod flags { + pub const EXTENDED: u32 = 0x4000; + pub const INTENT_TO_ADD: u32 = 1 << 29; + pub const SKIP_WORKTREE: u32 = 1 << 30; + } + pub(crate) mod mask { + pub const PATH_LEN: u32 = 0x0fff; + } + pub struct Time { + pub secs: u32, + pub nsecs: u32, + } + pub struct Stat { + pub mtime: Time, + pub ctime: Time, + pub dev: u32, + pub ino: u32, + pub mode: u32, + pub uid: u32, + pub gid: u32, + /// The size of bytes on disk. 
Capped to u32 so files bigger than that will need thorough checking (and hopefully never make it) + pub size: u32, + } +} + /// An entry in the index, identifying a non-tree item on disk. -pub struct Entry; +pub struct Entry { + pub stat: entry::Stat, + pub id: git_hash::ObjectId, + pub flags: u32, +} /// An index file whose state was read from a file on disk. pub struct File { @@ -80,7 +110,7 @@ pub struct State { pub(crate) mod util { #[inline] - pub fn read_u32(b: &[u8]) -> u32 { + pub fn from_be_u32(b: &[u8]) -> u32 { u32::from_be_bytes(b.try_into().unwrap()) } @@ -100,4 +130,12 @@ pub(crate) mod util { }) }) } + + #[inline] + pub fn split_at_pos(data: &[u8], pos: usize) -> Option<(&[u8], &[u8])> { + if data.len() < pos { + return None; + } + data.split_at(pos).into() + } } diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index f45590a3dcf..a5ac8629669 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -22,4 +22,8 @@ mod init { #[test] #[ignore] fn read_without_any_extension() {} + + #[test] + #[ignore] + fn read_v4_with_delta_paths() {} } diff --git a/git-index/tests/index.rs b/git-index/tests/index.rs index 4e1bf722477..f04a997c81c 100644 --- a/git-index/tests/index.rs +++ b/git-index/tests/index.rs @@ -7,3 +7,12 @@ pub fn index_fixture_path(name: &str) -> PathBuf { .expect("script works"); dir.join(".git").join("index") } + +#[test] +fn size_of_entry() { + assert_eq!(std::mem::size_of::(), 64); + + // the reason we have our own time is half the size. 
+ assert_eq!(std::mem::size_of::(), 8); + assert_eq!(std::mem::size_of::(), 16); +} From 01036ad1bafb6a830734a9dd4f4e2949b8981a30 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 10 Jan 2022 21:45:54 +0800 Subject: [PATCH 35/57] a step towards parsing V2 paths (#293) --- git-index/src/decode.rs | 70 +++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index b5fabb40e47..ad43ac6f4df 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -120,7 +120,12 @@ fn load_entries( todo!("load entries") } -fn decode_entry(data: &[u8], hash_len: usize) -> Option { +fn decode_entry<'a>( + data: &'a [u8], + path_backing: &mut Vec, + hash_len: usize, + version: Version, +) -> Option<(Entry, &'a [u8])> { let (ctime_secs, data) = read_u32(data)?; let (ctime_nsecs, data) = read_u32(data)?; let (mtime_secs, data) = read_u32(data)?; @@ -149,27 +154,52 @@ fn decode_entry(data: &[u8], hash_len: usize) -> Option { (flags, data) }; - Entry { - stat: entry::Stat { - ctime: entry::Time { - secs: ctime_secs, - nsecs: ctime_nsecs, - }, - mtime: entry::Time { - secs: mtime_secs, - nsecs: mtime_nsecs, + let (path, data) = match version { + Version::V2 | Version::V3 => { + if (flags & entry::mask::PATH_LEN) == entry::mask::PATH_LEN { + todo!("get to 0 byte and skip padding") + } else { + let path_len = (flags & entry::mask::PATH_LEN) as usize; + let (path, data) = split_at_pos(data, path_len)?; + + let start = path_backing.len(); + path_backing.extend_from_slice(path); + + (start..path_backing.len(), skip_padding(data)) + } + } + Version::V4 => todo!("handle delta-paths"), + }; + + Some(( + Entry { + stat: entry::Stat { + ctime: entry::Time { + secs: ctime_secs, + nsecs: ctime_nsecs, + }, + mtime: entry::Time { + secs: mtime_secs, + nsecs: mtime_nsecs, + }, + dev, + ino, + mode, + uid, + gid, + size, }, - dev, - ino, - mode, - uid, - gid, - size, + id:
git_hash::ObjectId::from(hash), + flags: flags & !entry::mask::PATH_LEN, }, - id: git_hash::ObjectId::from(hash), - flags: flags & !entry::mask::PATH_LEN, - } - .into() + data, + )) +} + +#[inline] +fn skip_padding(data: &[u8]) -> &[u8] { + let foo = data.iter().filter(|b| **b == 0).count(); + todo!("continue") } #[inline] From 0a03f196b7ec4dc1e0e2377c729467781c9e6c2c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 11:05:42 +0800 Subject: [PATCH 36/57] =?UTF-8?q?All=20code=20needed=20to=20load=20extensi?= =?UTF-8?q?ons=E2=80=A6=20(#293)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …even though it doesn't work yet as the flags don't pass an assertion. --- git-index/src/decode.rs | 171 +++++++++++++++++++++++++-------- git-index/src/extension/mod.rs | 2 +- git-index/src/file.rs | 6 +- git-index/src/lib.rs | 39 +++----- git-index/tests/file/mod.rs | 2 - 5 files changed, 148 insertions(+), 72 deletions(-) diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index ad43ac6f4df..c9f9647c5e4 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -65,59 +65,147 @@ mod error { source(err) from() } + Entry(index: u32) { + display("Could not parse entry at index {}", index) + } + UnexpectedTrailerLength { expected: usize, actual: usize } { + display("Index trailer should have been {} bytes long, but was {}", expected, actual) + } } } } -use crate::util::{from_be_u32, split_at_pos}; +use crate::util::{from_be_u32, split_at_byte_exclusive, split_at_pos}; pub use error::Error; impl State { - pub fn from_bytes(data: &[u8], timestamp: FileTime, object_hash: git_hash::Kind) -> Result { + pub fn from_bytes( + data: &[u8], + timestamp: FileTime, + object_hash: git_hash::Kind, + ) -> Result<(Self, git_hash::ObjectId), Error> { let (version, num_entries, post_header_data) = header::decode(data, object_hash)?; let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash); - 
let mut ext = Extensions::default(); - // Note that we ignore all errors for optional signatures. - match start_of_extensions { + let path_backing_buffer_size = load_entries::estimate_path_storage_requirements_in_bytes( + num_entries, + data.len(), + start_of_extensions, + object_hash, + version, + ); + let (entries, ext, data) = match start_of_extensions { Some(offset) => { - let (ext, entries) = - git_features::parallel::join(|| load_extensions(&data[offset..], object_hash), || ()); - todo!("load all extensions in thread, then get IEOT, then possibly multi-threaded entry parsing") + let (entries_res, (ext, data)) = git_features::parallel::join( + // TODO load all extensions in thread, then get IEOT, then possibly multi-threaded entry parsing + || load_entries(data, num_entries, path_backing_buffer_size, object_hash, version), + || load_extensions(&data[offset..], object_hash), + ); + (entries_res?.0, ext, data) } None => { - let (entries, data) = load_entries(data, num_entries, object_hash, version)?; - let ext = load_extensions(data, object_hash); - todo!("load entries singlge-threaded, then extensions") + let (entries, data) = load_entries(data, num_entries, path_backing_buffer_size, object_hash, version)?; + let (ext, data) = load_extensions(data, object_hash); + (entries, ext, data) } + }; + + if data.len() != object_hash.len_in_bytes() { + return Err(Error::UnexpectedTrailerLength { + expected: object_hash.len_in_bytes(), + actual: data.len(), + }); } - Ok(State { - timestamp, - version, - cache_tree: ext.cache_tree, - }) + let checksum = git_hash::ObjectId::from(data); + let load_entries::Outcome { + entries, + path_backing, + is_sparse, + } = entries; + let Extensions { cache_tree } = ext; + + Ok(( + State { + timestamp, + version, + cache_tree, + entries, + path_backing, + is_sparse, + }, + checksum, + )) } } mod load_entries { - use crate::Entry; + use crate::decode::header; + use crate::{Entry, Version}; pub struct Outcome { - entries: Vec, - /// A 
memory area keeping all index paths, in full length, independently of the index version. - path_backing: Vec, - /// True if one entry in the index has a special marker mode - is_sparse: bool, + pub entries: Vec, + pub path_backing: Vec, + pub is_sparse: bool, + } + + pub fn estimate_path_storage_requirements_in_bytes( + num_entries: u32, + on_disk_size: usize, + offset_to_extensions: Option, + object_hash: git_hash::Kind, + version: Version, + ) -> usize { + const fn on_disk_entry_sans_path(object_hash: git_hash::Kind) -> usize { + 8 + // ctime + 8 + // mtime + (4 * 6) + // various stat fields + 2 + // flag, ignore extended flag as we'd rather overallocate a bit + object_hash.len_in_bytes() + }; + match version { + Version::V3 | Version::V2 => { + let size_of_entries_block = offset_to_extensions.unwrap_or(on_disk_size); + size_of_entries_block + .saturating_sub(num_entries as usize * on_disk_entry_sans_path(object_hash)) + .saturating_sub(header::SIZE) + } + Version::V4 => num_entries as usize * 80, /* a guess directly from git sources */ + } } } +/// Note that `data` must point to the beginning of the entries, right past the header. fn load_entries( - beginning_of_entries: &[u8], + mut data: &[u8], num_entries: u32, + path_backing_capacity: usize, object_hash: git_hash::Kind, version: Version, ) -> Result<(load_entries::Outcome, &[u8]), Error> { - todo!("load entries") + let mut path_backing = Vec::::with_capacity(path_backing_capacity); + let mut entries = Vec::with_capacity(num_entries as usize); + let mut is_sparse = false; + for idx in 0..num_entries { + let (entry, remaining) = + decode_entry(data, &mut path_backing, object_hash.len_in_bytes(), version).ok_or(Error::Entry(idx))?; + data = remaining; + if entry::mode::is_sparse(entry.stat.mode) { + is_sparse = true; + } + // TODO: entries are actually in an intrusive collection, with path as key. Could be set for us. 
This affects 'ignore_case' which we + // also don't yet handle but probably could, maybe even smartly with the collection. + // For now it's unclear to me how they access the index, they could iterate quickly, and have fast access by path. + entries.push(entry); + } + + Ok(( + load_entries::Outcome { + entries, + path_backing, + is_sparse, + }, + data, + )) } fn decode_entry<'a>( @@ -156,21 +244,24 @@ fn decode_entry<'a>( let (path, data) = match version { Version::V2 | Version::V3 => { - if (flags & entry::mask::PATH_LEN) == entry::mask::PATH_LEN { - todo!("get to 0 byte and skip padding") + let (path, data) = if (flags & entry::mask::PATH_LEN) == entry::mask::PATH_LEN { + split_at_byte_exclusive(data, 0)? } else { let path_len = (flags & entry::mask::PATH_LEN) as usize; - let (path, data) = split_at_pos(data, path_len)?; - - let start = path_backing.len(); - path_backing.extend_from_slice(path); + split_at_pos(data, path_len)? + }; - (start..path_backing.len(), skip_padding(data)) - } + (path, skip_padding(data)) } Version::V4 => todo!("handle delta-paths"), }; + let path = { + let start = path_backing.len(); + path_backing.extend_from_slice(path); + start..path_backing.len() + }; + Some(( Entry { stat: entry::Stat { @@ -198,8 +289,8 @@ fn decode_entry<'a>( #[inline] fn skip_padding(data: &[u8]) -> &[u8] { - let foo = data.iter().filter(|b| **b == 0).count(); - todo!("continue") + let skip = data.iter().take_while(|b| **b == 0).count(); + &data[skip..] 
} #[inline] @@ -208,14 +299,14 @@ fn read_u32(data: &[u8]) -> Option<(u32, &[u8])> { } #[inline] fn read_u16(data: &[u8]) -> Option<(u16, &[u8])> { - split_at_pos(data, 4).map(|(num, data)| (u16::from_be_bytes(num.try_into().unwrap()), data)) + split_at_pos(data, 2).map(|(num, data)| (u16::from_be_bytes(num.try_into().unwrap()), data)) } -fn load_extensions(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> Extensions { +fn load_extensions(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> (Extensions, &[u8]) { extension::Iter::new_without_checksum(beginning_of_extensions, object_hash) - .map(|extensions| { + .map(|mut extensions| { let mut ext = Extensions::default(); - for (signature, ext_data) in extensions { + for (signature, ext_data) in extensions.by_ref() { match signature { extension::tree::SIGNATURE => { ext.cache_tree = extension::tree::decode(ext_data, object_hash); @@ -224,9 +315,9 @@ fn load_extensions(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) _unknown => {} // skip unknown extensions, too } } - ext + (ext, extensions.data) }) - .unwrap_or_default() + .unwrap_or_else(|| (Extensions::default(), beginning_of_extensions)) } #[derive(Default)] diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index e06bc9ee662..172f85b79ff 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -125,5 +125,5 @@ mod iter { } pub struct Iter<'a> { - data: &'a [u8], + pub data: &'a [u8], } diff --git a/git-index/src/file.rs b/git-index/src/file.rs index b8aa6a84ff5..a4bb2be5e7b 100644 --- a/git-index/src/file.rs +++ b/git-index/src/file.rs @@ -58,10 +58,8 @@ pub mod init { (data, filetime::FileTime::from_last_modification_time(&file.metadata()?)) }; - Ok(File { - state: State::from_bytes(&data, mtime, object_hash)?, - path, - }) + let (state, checksum) = State::from_bytes(&data, mtime, object_hash)?; + Ok(File { state, path, checksum }) } } } diff --git a/git-index/src/lib.rs 
b/git-index/src/lib.rs index 0800acd71da..49631495933 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -19,30 +19,6 @@ mod access { } } -pub mod init { - use filetime::FileTime; - - use crate::{State, Version}; - - impl State { - /// Returns an empty state. - /// TODO: figure out if it needs to know some configuration, and if this would actually be used somewhere - fn new() -> Self { - State { - timestamp: FileTime::from_system_time(std::time::SystemTime::UNIX_EPOCH), - version: Version::V3, - cache_tree: None, - } - } - } - - impl Default for State { - fn default() -> Self { - State::new() - } - } -} - pub mod decode; /// All known versions of a git index file. @@ -56,6 +32,12 @@ pub enum Version { } pub mod entry { + pub(crate) mod mode { + const S_IFDIR: u32 = 0040000; + pub fn is_sparse(mode: u32) -> bool { + mode == S_IFDIR + } + } pub(crate) mod flags { pub const EXTENDED: u32 = 0x4000; pub const INTENT_TO_ADD: u32 = 1 << 29; @@ -92,6 +74,8 @@ pub struct Entry { pub struct File { pub state: State, pub path: PathBuf, + /// The checksum of all bytes prior to the checksum itself. + pub checksum: git_hash::ObjectId, } /// An in-memory cache of a fully parsed git index file. @@ -105,7 +89,12 @@ pub struct State { /// same timestamp as this as potentially changed, checking more thoroughly if a change actually happened. timestamp: FileTime, version: Version, - pub cache_tree: Option, + cache_tree: Option, + entries: Vec, + /// A memory area keeping all index paths, in full length, independently of the index version. 
+ path_backing: Vec, + /// True if one entry in the index has a special marker mode + is_sparse: bool, } pub(crate) mod util { diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index a5ac8629669..f125c5310fe 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -6,14 +6,12 @@ mod init { } #[test] - #[ignore] fn read_v2_with_single_entry_tree_and_eoie_ext() { let file = file("v2"); assert_eq!(file.version(), Version::V2); } #[test] - #[ignore] fn read_v2_with_multiple_entries_without_eoie_ext() { let file = file("v2_more_files"); assert_eq!(file.version(), Version::V2); From da556b0a64ac9ca8eaee62cab163789b55903b3d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 11:32:33 +0800 Subject: [PATCH 37/57] Use correct post-header slice when parsing entries (#293) Now it works more, but for some reason we don't see the trailer checksum. It seems extensions consume too much. --- git-index/src/decode.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index c9f9647c5e4..099bfd7f7fa 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -97,13 +97,27 @@ impl State { Some(offset) => { let (entries_res, (ext, data)) = git_features::parallel::join( // TODO load all extensions in thread, then get IEOT, then possibly multi-threaded entry parsing - || load_entries(data, num_entries, path_backing_buffer_size, object_hash, version), + || { + load_entries( + post_header_data, + num_entries, + path_backing_buffer_size, + object_hash, + version, + ) + }, || load_extensions(&data[offset..], object_hash), ); (entries_res?.0, ext, data) } None => { - let (entries, data) = load_entries(data, num_entries, path_backing_buffer_size, object_hash, version)?; + let (entries, data) = load_entries( + post_header_data, + num_entries, + path_backing_buffer_size, + object_hash, + version, + )?; let (ext, data) = load_extensions(data, 
object_hash); (entries, ext, data) } From 77a062cdaff1bdf80556301f1e1aa41002af9cef Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 11:40:09 +0800 Subject: [PATCH 38/57] Now with counting of consumed bytes in extensions (#293) This leads to the first seemingly correct parsing of simple index files. --- git-index/src/decode.rs | 6 +++--- git-index/src/extension/mod.rs | 9 ++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index 099bfd7f7fa..440ef828e2c 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -318,9 +318,9 @@ fn read_u16(data: &[u8]) -> Option<(u16, &[u8])> { fn load_extensions(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> (Extensions, &[u8]) { extension::Iter::new_without_checksum(beginning_of_extensions, object_hash) - .map(|mut extensions| { + .map(|mut ext_iter| { let mut ext = Extensions::default(); - for (signature, ext_data) in extensions.by_ref() { + for (signature, ext_data) in ext_iter.by_ref() { match signature { extension::tree::SIGNATURE => { ext.cache_tree = extension::tree::decode(ext_data, object_hash); @@ -329,7 +329,7 @@ fn load_extensions(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) _unknown => {} // skip unknown extensions, too } } - (ext, extensions.data) + (ext, &beginning_of_extensions[ext_iter.consumed..]) }) .unwrap_or_else(|| (Extensions::default(), beginning_of_extensions)) } diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index 172f85b79ff..1ab877ef64d 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -81,6 +81,7 @@ mod iter { pub fn new(data_at_beginning_of_extensions_and_truncated: &'a [u8]) -> Self { Iter { data: data_at_beginning_of_extensions_and_truncated, + consumed: 0, } } @@ -93,6 +94,7 @@ mod iter { .checked_sub(object_hash.len_in_bytes())?; Iter { data: &data_at_beginning_of_extensions[..end], + consumed: 0, } 
.into() } @@ -108,11 +110,15 @@ mod iter { let (signature, data) = self.data.split_at(4); let (size, data) = data.split_at(4); + self.data = data; + self.consumed += 4 + 4; + let size = from_be_u32(size) as usize; match data.get(..size) { Some(ext_data) => { self.data = &data[size..]; + self.consumed += size; Some((signature.try_into().unwrap(), ext_data)) } None => { @@ -125,5 +131,6 @@ mod iter { } pub struct Iter<'a> { - pub data: &'a [u8], + data: &'a [u8], + pub consumed: usize, } From f865ef6c626c9db39a09416333b6465fdd12c734 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 11:51:41 +0800 Subject: [PATCH 39/57] The first test to validate an entry (#293) --- Cargo.lock | 1 + git-index/Cargo.toml | 1 + git-index/src/decode.rs | 1 + git-index/src/entry.rs | 44 +++++++++++++++++++++++++++++++++++++ git-index/src/lib.rs | 42 ++++++++--------------------------- git-index/tests/file/mod.rs | 7 ++++++ git-index/tests/index.rs | 2 +- 7 files changed, 64 insertions(+), 34 deletions(-) create mode 100644 git-index/src/entry.rs diff --git a/Cargo.lock b/Cargo.lock index fede8097e30..c084db2057d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1237,6 +1237,7 @@ name = "git-index" version = "0.0.0" dependencies = [ "atoi", + "bstr", "filetime", "git-features 0.18.0", "git-hash 0.8.0", diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index ce7a54d2a8a..29f4ad6f4a3 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -23,6 +23,7 @@ git-hash = { version ="^0.8.0", path = "../git-hash" } quick-error = "2.0.0" memmap2 = "0.5.0" filetime = "0.2.15" +bstr = { version = "0.2.13", default-features = false, features = ["std"]} serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } smallvec = "1.7.0" diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index 440ef828e2c..90a69f98777 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -296,6 +296,7 @@ fn decode_entry<'a>( }, id: 
git_hash::ObjectId::from(hash), flags: flags & !entry::mask::PATH_LEN, + path, }, data, )) diff --git a/git-index/src/entry.rs b/git-index/src/entry.rs new file mode 100644 index 00000000000..109cf16ea6f --- /dev/null +++ b/git-index/src/entry.rs @@ -0,0 +1,44 @@ +pub(crate) mod mode { + const S_IFDIR: u32 = 0040000; + pub fn is_sparse(mode: u32) -> bool { + mode == S_IFDIR + } +} + +pub(crate) mod flags { + pub const EXTENDED: u32 = 0x4000; + pub const INTENT_TO_ADD: u32 = 1 << 29; + pub const SKIP_WORKTREE: u32 = 1 << 30; +} + +pub(crate) mod mask { + pub const PATH_LEN: u32 = 0x0fff; +} + +pub struct Time { + pub secs: u32, + pub nsecs: u32, +} + +pub struct Stat { + pub mtime: Time, + pub ctime: Time, + pub dev: u32, + pub ino: u32, + pub mode: u32, + pub uid: u32, + pub gid: u32, + /// The size of bytes on disk. Capped to u32 so files bigger than that will need thorough checking (and hopefully never make it) + pub size: u32, +} + +mod access { + use crate::{Entry, State}; + use bstr::{BStr, ByteSlice}; + + impl Entry { + pub fn path<'a>(&self, state: &'a State) -> &'a BStr { + (&state.path_backing[self.path.clone()]).as_bstr() + } + } +} diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index 49631495933..8af32700b1d 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -1,6 +1,7 @@ #![deny(unsafe_code, missing_docs, rust_2018_idioms)] #![allow(missing_docs, unused)] +use std::ops::Range; use std::path::PathBuf; use filetime::FileTime; @@ -9,13 +10,19 @@ pub mod file; pub(crate) mod extension; +pub mod entry; + mod access { - use crate::{State, Version}; + use crate::{Entry, State, Version}; impl State { pub fn version(&self) -> Version { self.version } + + pub fn entries(&self) -> &[Entry] { + &self.entries + } } } @@ -31,43 +38,12 @@ pub enum Version { V4 = 4, } -pub mod entry { - pub(crate) mod mode { - const S_IFDIR: u32 = 0040000; - pub fn is_sparse(mode: u32) -> bool { - mode == S_IFDIR - } - } - pub(crate) mod flags { - pub const 
EXTENDED: u32 = 0x4000; - pub const INTENT_TO_ADD: u32 = 1 << 29; - pub const SKIP_WORKTREE: u32 = 1 << 30; - } - pub(crate) mod mask { - pub const PATH_LEN: u32 = 0x0fff; - } - pub struct Time { - pub secs: u32, - pub nsecs: u32, - } - pub struct Stat { - pub mtime: Time, - pub ctime: Time, - pub dev: u32, - pub ino: u32, - pub mode: u32, - pub uid: u32, - pub gid: u32, - /// The size of bytes on disk. Capped to u32 so files bigger than that will need thorough checking (and hopefully never make it) - pub size: u32, - } -} - /// An entry in the index, identifying a non-tree item on disk. pub struct Entry { pub stat: entry::Stat, pub id: git_hash::ObjectId, pub flags: u32, + path: Range, } /// An index file whose state was read from a file on disk. diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index f125c5310fe..2543e9c6e19 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -1,5 +1,6 @@ mod init { use git_index::Version; + use git_testtools::hex_to_id; fn file(name: &str) -> git_index::File { git_index::File::at(crate::index_fixture_path(name), git_hash::Kind::Sha1).unwrap() @@ -9,6 +10,12 @@ mod init { fn read_v2_with_single_entry_tree_and_eoie_ext() { let file = file("v2"); assert_eq!(file.version(), Version::V2); + + assert_eq!(file.entries().len(), 1); + + let entry = &file.entries()[0]; + assert_eq!(entry.id, hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391")); + assert_eq!(entry.path(&file.state), "a"); } #[test] diff --git a/git-index/tests/index.rs b/git-index/tests/index.rs index f04a997c81c..d32bfa60c2b 100644 --- a/git-index/tests/index.rs +++ b/git-index/tests/index.rs @@ -10,7 +10,7 @@ pub fn index_fixture_path(name: &str) -> PathBuf { #[test] fn size_of_entry() { - assert_eq!(std::mem::size_of::(), 64); + assert_eq!(std::mem::size_of::(), 80); // the reason we have our own time is half the size. 
assert_eq!(std::mem::size_of::(), 8); From f47703256fe6a5c68ed3af6705bcdf01262500d6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 11:52:31 +0800 Subject: [PATCH 40/57] thanks clippy --- git-index/src/entry.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/git-index/src/entry.rs b/git-index/src/entry.rs index 109cf16ea6f..4768c859c8b 100644 --- a/git-index/src/entry.rs +++ b/git-index/src/entry.rs @@ -1,5 +1,5 @@ pub(crate) mod mode { - const S_IFDIR: u32 = 0040000; + const S_IFDIR: u32 = 0o040000; pub fn is_sparse(mode: u32) -> bool { mode == S_IFDIR } From 273853f1614a0106c60d3d73c3bf72fb57b405e8 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 16:15:31 +0800 Subject: [PATCH 41/57] more thorough tests for more complex repo with more entries (#293) --- git-index/src/decode.rs | 12 +++++++----- git-index/src/entry.rs | 3 ++- git-index/src/extension/mod.rs | 3 ++- git-index/src/extension/tree.rs | 7 +++++-- git-index/src/file.rs | 3 ++- git-index/src/lib.rs | 3 +-- git-index/tests/file/mod.rs | 9 ++++++++- .../tests/fixtures/make_index/v4_more_files_IEOT.sh | 13 +++++++++++++ 8 files changed, 40 insertions(+), 13 deletions(-) create mode 100644 git-index/tests/fixtures/make_index/v4_more_files_IEOT.sh diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index 90a69f98777..ca71f70d745 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -1,10 +1,13 @@ -use crate::{entry, extension, Entry, State, Version}; use filetime::FileTime; use git_hash::Kind; +use crate::{entry, extension, Entry, State, Version}; + pub mod header { pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entries */; + use crate::{util::from_be_u32, Version}; + mod error { use quick_error::quick_error; @@ -20,7 +23,6 @@ pub mod header { } } } - use crate::{util::from_be_u32, Version}; pub use error::Error; pub(crate) fn decode(data: &[u8], object_hash: git_hash::Kind) -> 
Result<(crate::Version, u32, &[u8]), Error> { @@ -74,9 +76,10 @@ mod error { } } } -use crate::util::{from_be_u32, split_at_byte_exclusive, split_at_pos}; pub use error::Error; +use crate::util::{from_be_u32, split_at_byte_exclusive, split_at_pos}; + impl State { pub fn from_bytes( data: &[u8], @@ -153,8 +156,7 @@ impl State { } mod load_entries { - use crate::decode::header; - use crate::{Entry, Version}; + use crate::{decode::header, Entry, Version}; pub struct Outcome { pub entries: Vec, diff --git a/git-index/src/entry.rs b/git-index/src/entry.rs index 4768c859c8b..741b2bf56d3 100644 --- a/git-index/src/entry.rs +++ b/git-index/src/entry.rs @@ -33,9 +33,10 @@ pub struct Stat { } mod access { - use crate::{Entry, State}; use bstr::{BStr, ByteSlice}; + use crate::{Entry, State}; + impl Entry { pub fn path<'a>(&self, state: &'a State) -> &'a BStr { (&state.path_backing[self.path.clone()]).as_bstr() diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index 1ab877ef64d..ac0bd4703a8 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -1,6 +1,7 @@ -use crate::{util::from_be_u32, Version}; use smallvec::SmallVec; +use crate::{util::from_be_u32, Version}; + const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; pub type Signature = [u8; 4]; diff --git a/git-index/src/extension/tree.rs b/git-index/src/extension/tree.rs index a1e39ebc550..f7d97fbefe2 100644 --- a/git-index/src/extension/tree.rs +++ b/git-index/src/extension/tree.rs @@ -1,7 +1,10 @@ -use crate::extension::{Signature, Tree}; -use crate::util::split_at_byte_exclusive; use git_hash::ObjectId; +use crate::{ + extension::{Signature, Tree}, + util::split_at_byte_exclusive, +}; + pub const SIGNATURE: Signature = *b"TREE"; pub struct NodeId { diff --git a/git-index/src/file.rs b/git-index/src/file.rs index a4bb2be5e7b..f4570e7510c 100644 --- a/git-index/src/file.rs +++ b/git-index/src/file.rs @@ -1,7 +1,8 @@ mod impls { - use crate::{File, State}; use 
std::ops::{Deref, DerefMut}; + use crate::{File, State}; + impl Deref for File { type Target = State; diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index 8af32700b1d..c27545fdee6 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -1,8 +1,7 @@ #![deny(unsafe_code, missing_docs, rust_2018_idioms)] #![allow(missing_docs, unused)] -use std::ops::Range; -use std::path::PathBuf; +use std::{ops::Range, path::PathBuf}; use filetime::FileTime; diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index 2543e9c6e19..afec8804c98 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -22,6 +22,13 @@ mod init { fn read_v2_with_multiple_entries_without_eoie_ext() { let file = file("v2_more_files"); assert_eq!(file.version(), Version::V2); + + assert_eq!(file.entries().len(), 6); + for (idx, path) in ["a", "b", "c", "d/a", "d/b", "d/c"].into_iter().enumerate() { + let e = &file.entries()[idx]; + assert_eq!(e.path(&file), path); + assert_eq!(e.id, hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391")) + } } #[test] @@ -30,5 +37,5 @@ mod init { #[test] #[ignore] - fn read_v4_with_delta_paths() {} + fn read_v4_with_delta_paths_and_ieot_ext() {} } diff --git a/git-index/tests/fixtures/make_index/v4_more_files_IEOT.sh b/git-index/tests/fixtures/make_index/v4_more_files_IEOT.sh new file mode 100644 index 00000000000..5a61cb47de0 --- /dev/null +++ b/git-index/tests/fixtures/make_index/v4_more_files_IEOT.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu -o pipefail + +GIT_INDEX_VERSION=4 git init -q +git config commit.gpgsign false +git config index.threads 2 + +touch a b c +mkdir d +(cd d && touch a b c && mkdir last && cd last && touch 123 35 6) + +git add . +git commit -m "empty" From b8400ed80543d67a5895c975ba9b1fc28427411c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 17:01:37 +0800 Subject: [PATCH 42/57] feat: decoding of variable int numbers (#293). 
It's here only so that we can share the code across crates, for now without any feature toggles. --- git-features/src/decode.rs | 38 ++++++++++++++++++++++++++++++++++++++ git-features/src/lib.rs | 2 ++ 2 files changed, 40 insertions(+) create mode 100644 git-features/src/decode.rs diff --git a/git-features/src/decode.rs b/git-features/src/decode.rs new file mode 100644 index 00000000000..0df38710ddb --- /dev/null +++ b/git-features/src/decode.rs @@ -0,0 +1,38 @@ +use std::io::Read; + +/// Decode variable int numbers from a `Read` implementation. +/// +/// Note: currently overflow checks are only done in debug mode. +#[inline] +pub fn leb64_from_read(mut r: impl Read) -> Result<(u64, usize), std::io::Error> { + let mut b = [0u8; 1]; + let mut i = 0; + r.read_exact(&mut b)?; + i += 1; + let mut value = b[0] as u64 & 0x7f; + while b[0] & 0x80 != 0 { + r.read_exact(&mut b)?; + i += 1; + debug_assert!(i <= 10, "Would overflow value at 11th iteration"); + value += 1; + value = (value << 7) + (b[0] as u64 & 0x7f) + } + Ok((value, i)) +} + +/// Decode variable int numbers. 
+#[inline] +pub fn leb64(d: &[u8]) -> (u64, usize) { + let mut i = 0; + let mut c = d[i]; + i += 1; + let mut value = c as u64 & 0x7f; + while c & 0x80 != 0 { + c = d[i]; + i += 1; + debug_assert!(i <= 10, "Would overflow value at 11th iteration"); + value += 1; + value = (value << 7) + (c as u64 & 0x7f) + } + (value, i) +} diff --git a/git-features/src/lib.rs b/git-features/src/lib.rs index 2321b72f12d..ebeef93bbe8 100644 --- a/git-features/src/lib.rs +++ b/git-features/src/lib.rs @@ -11,6 +11,8 @@ /// pub mod cache; +/// +pub mod decode; pub mod fs; pub mod hash; pub mod interrupt; From 52e3c6f6f4cd1bf677c9189fb59db16173954669 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 17:01:47 +0800 Subject: [PATCH 43/57] Adapt to changes in git-features: use var-int decoding from there (#293) --- git-pack/src/data/entry/decode.rs | 36 +++---------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/git-pack/src/data/entry/decode.rs b/git-pack/src/data/entry/decode.rs index e01b2cf50ea..22bf362d19d 100644 --- a/git-pack/src/data/entry/decode.rs +++ b/git-pack/src/data/entry/decode.rs @@ -1,3 +1,4 @@ +use git_features::decode::{leb64, leb64_from_read}; use std::io; use super::{BLOB, COMMIT, OFS_DELTA, REF_DELTA, TAG, TREE}; @@ -16,7 +17,7 @@ impl data::Entry { use crate::data::entry::Header::*; let object = match type_id { OFS_DELTA => { - let (distance, leb_bytes) = leb64decode(&d[consumed..]); + let (distance, leb_bytes) = leb64(&d[consumed..]); let delta = OfsDelta { base_distance: distance, }; @@ -54,7 +55,7 @@ impl data::Entry { use crate::data::entry::Header::*; let object = match type_id { OFS_DELTA => { - let (distance, leb_bytes) = streaming_leb64decode(&mut r)?; + let (distance, leb_bytes) = leb64_from_read(&mut r)?; let delta = OfsDelta { base_distance: distance, }; @@ -85,37 +86,6 @@ impl data::Entry { } } -#[inline] -fn streaming_leb64decode(mut r: impl io::Read) -> Result<(u64, usize), io::Error> { - let mut b 
= [0u8; 1]; - let mut i = 0; - r.read_exact(&mut b)?; - i += 1; - let mut value = b[0] as u64 & 0x7f; - while b[0] & 0x80 != 0 { - r.read_exact(&mut b)?; - i += 1; - value += 1; - value = (value << 7) + (b[0] as u64 & 0x7f) - } - Ok((value, i)) -} - -#[inline] -fn leb64decode(d: &[u8]) -> (u64, usize) { - let mut i = 0; - let mut c = d[i]; - i += 1; - let mut value = c as u64 & 0x7f; - while c & 0x80 != 0 { - c = d[i]; - i += 1; - value += 1; - value = (value << 7) + (c as u64 & 0x7f) - } - (value, i) -} - #[inline] fn streaming_parse_header_info(mut read: impl io::Read) -> Result<(u8, u64, usize), io::Error> { let mut byte = [0u8; 1]; From 7558844b40b6c9af5038fea6b8a4e81583c46bde Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 17:10:58 +0800 Subject: [PATCH 44/57] Assure we are right about the leb64 buffer needed for a 64 bit int (#293) --- git-pack/src/data/entry/header.rs | 47 ++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/git-pack/src/data/entry/header.rs b/git-pack/src/data/entry/header.rs index df25a500e05..5856086de2c 100644 --- a/git-pack/src/data/entry/header.rs +++ b/git-pack/src/data/entry/header.rs @@ -102,21 +102,11 @@ impl Header { out.write_all(oid.as_slice())?; written += oid.as_slice().len(); } - OfsDelta { mut base_distance } => { + OfsDelta { base_distance } => { let mut buf = [0u8; 10]; - let mut bytes_written = 1; - buf[buf.len() - 1] = base_distance as u8 & 0b0111_1111; - for out in buf.iter_mut().rev().skip(1) { - base_distance >>= 7; - if base_distance == 0 { - break; - } - base_distance -= 1; - *out = 0b1000_0000 | (base_distance as u8 & 0b0111_1111); - bytes_written += 1; - } - out.write_all(&buf[buf.len() - bytes_written..])?; - written += bytes_written; + let buf = leb64_encode(*base_distance, &mut buf); + out.write_all(buf)?; + written += buf.len(); } Blob | Tree | Commit | Tag => {} } @@ -129,3 +119,32 @@ impl Header { .expect("io::sink() to never fail") } } + 
+#[inline] +fn leb64_encode(mut n: u64, buf: &mut [u8; 10]) -> &[u8] { + let mut bytes_written = 1; + buf[buf.len() - 1] = n as u8 & 0b0111_1111; + for out in buf.iter_mut().rev().skip(1) { + n >>= 7; + if n == 0 { + break; + } + n -= 1; + *out = 0b1000_0000 | (n as u8 & 0b0111_1111); + bytes_written += 1; + } + debug_assert_eq!(n, 0, "BUG: buffer must be large enough to hold a 64 bit integer"); + &buf[buf.len() - bytes_written..] +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn leb64_encode_max_int() { + let mut buf = [0u8; 10]; + let buf = leb64_encode(u64::MAX, &mut buf); + assert_eq!(buf.len(), 10, "10 bytes should be used when 64bits are encoded"); + } +} From 06640e3f98f25e9502db7ac68e1967d9fd25e8b2 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 18:13:59 +0800 Subject: [PATCH 45/57] parse V4 delta-paths (#293) Unfortunately we are a little more inefficient there as we have to copy the shared portion into a buffer before we can use these bytes to extend the backing storage with. Fair enough, it's most definitely not measurable. --- README.md | 2 +- git-index/src/decode.rs | 71 +++++++++++++------ git-index/tests/file/mod.rs | 27 ++++++- git-index/tests/fixtures/make_index/v2.sh | 3 +- .../fixtures/make_index/v2_more_files.sh | 3 +- .../fixtures/make_index/v4_more_files_IEOT.sh | 6 +- 6 files changed, 84 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 56a72eccddc..d268f20120d 100644 --- a/README.md +++ b/README.md @@ -240,7 +240,7 @@ Provide a CLI to for the most basic user journey: * [ ] `gix tool open-remote` open the URL of the remote, possibly after applying known transformations to go from `ssh` to `https`. * [ ] Open up SQL for git using [sqlite virtual tables](https://github.com/rusqlite/rusqlite/blob/master/tests/vtab.rs). Check out gitqlite - as well. What would an MVP look like? Maybe even something that could ship with gitoxide. + as well. What would an MVP look like? 
Maybe even something that could ship with gitoxide. See [this go implementation as example](https://github.com/filhodanuvem/gitql). * [ ] A truly awesome history rewriter which makes it easy to understand what happened while avoiding all pitfalls. Think BFG, but more awesome, if that's possible. * [ ] `git-tui` should learn a lot from [fossil-scm] regarding the presentation of data. Maybe [this](https://github.com/Lutetium-Vanadium/requestty/) can be used for prompts. Probably [magit] has a lot to offer, too. diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs index ca71f70d745..e17d60143f4 100644 --- a/git-index/src/decode.rs +++ b/git-index/src/decode.rs @@ -1,5 +1,7 @@ +use bstr::{BStr, ByteSlice}; use filetime::FileTime; use git_hash::Kind; +use std::ops::Range; use crate::{entry, extension, Entry, State, Version}; @@ -157,6 +159,8 @@ impl State { mod load_entries { use crate::{decode::header, Entry, Version}; + /// a guess directly from git sources + pub const AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES: usize = 80; pub struct Outcome { pub entries: Vec, @@ -185,7 +189,7 @@ mod load_entries { .saturating_sub(num_entries as usize * on_disk_entry_sans_path(object_hash)) .saturating_sub(header::SIZE) } - Version::V4 => num_entries as usize * 80, /* a guess directly from git sources */ + Version::V4 => num_entries as usize * AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES, } } } @@ -199,11 +203,22 @@ fn load_entries( version: Version, ) -> Result<(load_entries::Outcome, &[u8]), Error> { let mut path_backing = Vec::::with_capacity(path_backing_capacity); - let mut entries = Vec::with_capacity(num_entries as usize); + let mut entries = Vec::::with_capacity(num_entries as usize); let mut is_sparse = false; + let has_delta_paths = version == Version::V4; + let mut prev_path = None; + let mut delta_buf = Vec::::with_capacity(load_entries::AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES); + for idx in 0..num_entries { - let (entry, remaining) = - decode_entry(data, &mut path_backing, 
object_hash.len_in_bytes(), version).ok_or(Error::Entry(idx))?; + let (entry, remaining) = decode_entry( + data, + &mut path_backing, + object_hash.len_in_bytes(), + has_delta_paths, + prev_path, + ) + .ok_or(Error::Entry(idx))?; + data = remaining; if entry::mode::is_sparse(entry.stat.mode) { is_sparse = true; @@ -212,6 +227,7 @@ fn load_entries( // also don't yet handle but probably could, maybe even smartly with the collection. // For now it's unclear to me how they access the index, they could iterate quickly, and have fast access by path. entries.push(entry); + prev_path = entries.last().map(|e| (e.path.clone(), &mut delta_buf)); } Ok(( @@ -224,11 +240,13 @@ fn load_entries( )) } +/// Note that `prev_path` is only useful if the version is V4 fn decode_entry<'a>( data: &'a [u8], path_backing: &mut Vec, hash_len: usize, - version: Version, + has_delta_paths: bool, + prev_path_and_buf: Option<(Range, &mut Vec)>, ) -> Option<(Entry, &'a [u8])> { let (ctime_secs, data) = read_u32(data)?; let (ctime_nsecs, data) = read_u32(data)?; @@ -258,25 +276,36 @@ fn decode_entry<'a>( (flags, data) }; - let (path, data) = match version { - Version::V2 | Version::V3 => { - let (path, data) = if (flags & entry::mask::PATH_LEN) == entry::mask::PATH_LEN { - split_at_byte_exclusive(data, 0)? - } else { - let path_len = (flags & entry::mask::PATH_LEN) as usize; - split_at_pos(data, path_len)? 
- }; - - (path, skip_padding(data)) + let start = path_backing.len(); + let data = if has_delta_paths { + let (strip_len, consumed) = git_features::decode::leb64(data); + let data = &data[consumed..]; + if let Some((prev_path, buf)) = prev_path_and_buf { + let end = prev_path.end.checked_sub(strip_len.try_into().ok()?)?; + let copy_len = end.checked_sub(prev_path.start)?; + if copy_len > 0 { + buf.resize(copy_len, 0); + buf.copy_from_slice(&path_backing[prev_path.start..end]); + path_backing.extend_from_slice(buf); + } } - Version::V4 => todo!("handle delta-paths"), - }; - let path = { - let start = path_backing.len(); + let (path, data) = split_at_byte_exclusive(data, 0)?; + path_backing.extend_from_slice(path); + + data + } else { + let (path, data) = if (flags & entry::mask::PATH_LEN) == entry::mask::PATH_LEN { + split_at_byte_exclusive(data, 0)? + } else { + let path_len = (flags & entry::mask::PATH_LEN) as usize; + split_at_pos(data, path_len)? + }; + path_backing.extend_from_slice(path); - start..path_backing.len() + skip_padding(data) }; + let path_range = start..path_backing.len(); Some(( Entry { @@ -298,7 +327,7 @@ fn decode_entry<'a>( }, id: git_hash::ObjectId::from(hash), flags: flags & !entry::mask::PATH_LEN, - path, + path: path_range, }, data, )) diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index afec8804c98..faff7cdef65 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -36,6 +36,29 @@ mod init { fn read_without_any_extension() {} #[test] - #[ignore] - fn read_v4_with_delta_paths_and_ieot_ext() {} + fn read_v4_with_delta_paths_and_ieot_ext() { + let file = file("v4_more_files_IEOT"); + assert_eq!(file.version(), Version::V4); + + assert_eq!(file.entries().len(), 10); + for (idx, path) in [ + "a", + "b", + "c", + "d/a", + "d/b", + "d/c", + "d/last/123", + "d/last/34", + "d/last/6", + "x", + ] + .into_iter() + .enumerate() + { + let e = &file.entries()[idx]; + assert_eq!(e.path(&file), path); + 
assert_eq!(e.id, hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391")) + } + } } diff --git a/git-index/tests/fixtures/make_index/v2.sh b/git-index/tests/fixtures/make_index/v2.sh index 20b16c3b587..a7ab127a393 100644 --- a/git-index/tests/fixtures/make_index/v2.sh +++ b/git-index/tests/fixtures/make_index/v2.sh @@ -1,7 +1,8 @@ #!/bin/bash set -eu -o pipefail -GIT_INDEX_VERSION=2 git init -q +export GIT_INDEX_VERSION=2 +git init -q git config commit.gpgsign false git config index.threads 2 diff --git a/git-index/tests/fixtures/make_index/v2_more_files.sh b/git-index/tests/fixtures/make_index/v2_more_files.sh index a43ec62203c..83a3583e6d0 100644 --- a/git-index/tests/fixtures/make_index/v2_more_files.sh +++ b/git-index/tests/fixtures/make_index/v2_more_files.sh @@ -1,7 +1,8 @@ #!/bin/bash set -eu -o pipefail -GIT_INDEX_VERSION=2 git init -q +export GIT_INDEX_VERSION=2; +git init -q git config commit.gpgsign false git config index.threads 1 diff --git a/git-index/tests/fixtures/make_index/v4_more_files_IEOT.sh b/git-index/tests/fixtures/make_index/v4_more_files_IEOT.sh index 5a61cb47de0..9dff63b5b4d 100644 --- a/git-index/tests/fixtures/make_index/v4_more_files_IEOT.sh +++ b/git-index/tests/fixtures/make_index/v4_more_files_IEOT.sh @@ -1,13 +1,15 @@ #!/bin/bash set -eu -o pipefail -GIT_INDEX_VERSION=4 git init -q +export GIT_INDEX_VERSION=4 +git init -q git config commit.gpgsign false git config index.threads 2 touch a b c mkdir d -(cd d && touch a b c && mkdir last && cd last && touch 123 35 6) +(cd d && touch a b c && mkdir last && cd last && touch 123 34 6) +touch x git add . 
git commit -m "empty" From 6f04f8b8276de9c6b649642fb7c95eb5ffad77e4 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 19:51:46 +0800 Subject: [PATCH 46/57] refactor (#293) --- git-index/src/decode.rs | 372 ------------------ git-index/src/decode/entries.rs | 195 +++++++++ git-index/src/decode/header.rs | 49 +++ git-index/src/decode/mod.rs | 109 +++++ git-index/src/extension/decode.rs | 32 ++ git-index/src/extension/end_of_index_entry.rs | 47 +++ git-index/src/extension/iter.rs | 53 +++ git-index/src/extension/mod.rs | 114 +----- 8 files changed, 489 insertions(+), 482 deletions(-) delete mode 100644 git-index/src/decode.rs create mode 100644 git-index/src/decode/entries.rs create mode 100644 git-index/src/decode/header.rs create mode 100644 git-index/src/decode/mod.rs create mode 100644 git-index/src/extension/decode.rs create mode 100644 git-index/src/extension/end_of_index_entry.rs create mode 100644 git-index/src/extension/iter.rs diff --git a/git-index/src/decode.rs b/git-index/src/decode.rs deleted file mode 100644 index e17d60143f4..00000000000 --- a/git-index/src/decode.rs +++ /dev/null @@ -1,372 +0,0 @@ -use bstr::{BStr, ByteSlice}; -use filetime::FileTime; -use git_hash::Kind; -use std::ops::Range; - -use crate::{entry, extension, Entry, State, Version}; - -pub mod header { - pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entries */; - - use crate::{util::from_be_u32, Version}; - - mod error { - use quick_error::quick_error; - - quick_error! 
{ - #[derive(Debug)] - pub enum Error { - Corrupt(message: &'static str) { - display("{}", message) - } - UnsupportedVersion(version: u32) { - display("Index version {} is not supported", version) - } - } - } - } - pub use error::Error; - - pub(crate) fn decode(data: &[u8], object_hash: git_hash::Kind) -> Result<(crate::Version, u32, &[u8]), Error> { - if data.len() < (3 * 4) + object_hash.len_in_bytes() { - return Err(Error::Corrupt( - "File is too small even for header with zero entries and smallest hash", - )); - } - - const SIGNATURE: &[u8] = b"DIRC"; - let (signature, data) = data.split_at(4); - if signature != SIGNATURE { - return Err(Error::Corrupt( - "Signature mismatch - this doesn't claim to be a header file", - )); - } - - let (version, data) = data.split_at(4); - let version = match from_be_u32(version) { - 2 => Version::V2, - 3 => Version::V3, - 4 => Version::V4, - unknown => return Err(Error::UnsupportedVersion(unknown)), - }; - let (entries, data) = data.split_at(4); - let entries = from_be_u32(entries); - - Ok((version, entries, data)) - } -} - -mod error { - use quick_error::quick_error; - - use crate::decode; - - quick_error! 
{ - #[derive(Debug)] - pub enum Error { - Header(err: decode::header::Error) { - display("The header could not be decoded") - source(err) - from() - } - Entry(index: u32) { - display("Could not parse entry at index {}", index) - } - UnexpectedTrailerLength { expected: usize, actual: usize } { - display("Index trailer should have been {} bytes long, but was {}", expected, actual) - } - } - } -} -pub use error::Error; - -use crate::util::{from_be_u32, split_at_byte_exclusive, split_at_pos}; - -impl State { - pub fn from_bytes( - data: &[u8], - timestamp: FileTime, - object_hash: git_hash::Kind, - ) -> Result<(Self, git_hash::ObjectId), Error> { - let (version, num_entries, post_header_data) = header::decode(data, object_hash)?; - let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash); - - let path_backing_buffer_size = load_entries::estimate_path_storage_requirements_in_bytes( - num_entries, - data.len(), - start_of_extensions, - object_hash, - version, - ); - let (entries, ext, data) = match start_of_extensions { - Some(offset) => { - let (entries_res, (ext, data)) = git_features::parallel::join( - // TODO load all extensions in thread, then get IEOT, then possibly multi-threaded entry parsing - || { - load_entries( - post_header_data, - num_entries, - path_backing_buffer_size, - object_hash, - version, - ) - }, - || load_extensions(&data[offset..], object_hash), - ); - (entries_res?.0, ext, data) - } - None => { - let (entries, data) = load_entries( - post_header_data, - num_entries, - path_backing_buffer_size, - object_hash, - version, - )?; - let (ext, data) = load_extensions(data, object_hash); - (entries, ext, data) - } - }; - - if data.len() != object_hash.len_in_bytes() { - return Err(Error::UnexpectedTrailerLength { - expected: object_hash.len_in_bytes(), - actual: data.len(), - }); - } - - let checksum = git_hash::ObjectId::from(data); - let load_entries::Outcome { - entries, - path_backing, - is_sparse, - } = entries; - let 
Extensions { cache_tree } = ext; - - Ok(( - State { - timestamp, - version, - cache_tree, - entries, - path_backing, - is_sparse, - }, - checksum, - )) - } -} - -mod load_entries { - use crate::{decode::header, Entry, Version}; - /// a guess directly from git sources - pub const AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES: usize = 80; - - pub struct Outcome { - pub entries: Vec, - pub path_backing: Vec, - pub is_sparse: bool, - } - - pub fn estimate_path_storage_requirements_in_bytes( - num_entries: u32, - on_disk_size: usize, - offset_to_extensions: Option, - object_hash: git_hash::Kind, - version: Version, - ) -> usize { - const fn on_disk_entry_sans_path(object_hash: git_hash::Kind) -> usize { - 8 + // ctime - 8 + // mtime - (4 * 6) + // various stat fields - 2 + // flag, ignore extended flag as we'd rather overallocate a bit - object_hash.len_in_bytes() - }; - match version { - Version::V3 | Version::V2 => { - let size_of_entries_block = offset_to_extensions.unwrap_or(on_disk_size); - size_of_entries_block - .saturating_sub(num_entries as usize * on_disk_entry_sans_path(object_hash)) - .saturating_sub(header::SIZE) - } - Version::V4 => num_entries as usize * AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES, - } - } -} - -/// Note that `data` must point to the beginning of the entries, right past the header. 
-fn load_entries( - mut data: &[u8], - num_entries: u32, - path_backing_capacity: usize, - object_hash: git_hash::Kind, - version: Version, -) -> Result<(load_entries::Outcome, &[u8]), Error> { - let mut path_backing = Vec::::with_capacity(path_backing_capacity); - let mut entries = Vec::::with_capacity(num_entries as usize); - let mut is_sparse = false; - let has_delta_paths = version == Version::V4; - let mut prev_path = None; - let mut delta_buf = Vec::::with_capacity(load_entries::AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES); - - for idx in 0..num_entries { - let (entry, remaining) = decode_entry( - data, - &mut path_backing, - object_hash.len_in_bytes(), - has_delta_paths, - prev_path, - ) - .ok_or(Error::Entry(idx))?; - - data = remaining; - if entry::mode::is_sparse(entry.stat.mode) { - is_sparse = true; - } - // TODO: entries are actually in an intrusive collection, with path as key. Could be set for us. This affects 'ignore_case' which we - // also don't yet handle but probably could, maybe even smartly with the collection. - // For now it's unclear to me how they access the index, they could iterate quickly, and have fast access by path. 
- entries.push(entry); - prev_path = entries.last().map(|e| (e.path.clone(), &mut delta_buf)); - } - - Ok(( - load_entries::Outcome { - entries, - path_backing, - is_sparse, - }, - data, - )) -} - -/// Note that `prev_path` is only useful if the version is V4 -fn decode_entry<'a>( - data: &'a [u8], - path_backing: &mut Vec, - hash_len: usize, - has_delta_paths: bool, - prev_path_and_buf: Option<(Range, &mut Vec)>, -) -> Option<(Entry, &'a [u8])> { - let (ctime_secs, data) = read_u32(data)?; - let (ctime_nsecs, data) = read_u32(data)?; - let (mtime_secs, data) = read_u32(data)?; - let (mtime_nsecs, data) = read_u32(data)?; - let (dev, data) = read_u32(data)?; - let (ino, data) = read_u32(data)?; - let (mode, data) = read_u32(data)?; - let (uid, data) = read_u32(data)?; - let (gid, data) = read_u32(data)?; - let (size, data) = read_u32(data)?; - let (hash, data) = split_at_pos(data, hash_len)?; - let (flags, data) = read_u16(data)?; - let flags = flags as u32; - let (flags, data) = if flags & entry::flags::EXTENDED == entry::flags::EXTENDED { - let (mut extended_flags, data) = read_u16(data)?; - let extended_flags: u32 = (extended_flags as u32) << 16; - const ALL_KNOWN_EXTENDED_FLAGS: u32 = entry::flags::INTENT_TO_ADD | entry::flags::SKIP_WORKTREE; - assert_eq!( - extended_flags & !ALL_KNOWN_EXTENDED_FLAGS, - 0, - "BUG: encountered unknown extended bitflags in {:b}", - extended_flags - ); - (flags | extended_flags, data) - } else { - (flags, data) - }; - - let start = path_backing.len(); - let data = if has_delta_paths { - let (strip_len, consumed) = git_features::decode::leb64(data); - let data = &data[consumed..]; - if let Some((prev_path, buf)) = prev_path_and_buf { - let end = prev_path.end.checked_sub(strip_len.try_into().ok()?)?; - let copy_len = end.checked_sub(prev_path.start)?; - if copy_len > 0 { - buf.resize(copy_len, 0); - buf.copy_from_slice(&path_backing[prev_path.start..end]); - path_backing.extend_from_slice(buf); - } - } - - let (path, data) = 
split_at_byte_exclusive(data, 0)?; - path_backing.extend_from_slice(path); - - data - } else { - let (path, data) = if (flags & entry::mask::PATH_LEN) == entry::mask::PATH_LEN { - split_at_byte_exclusive(data, 0)? - } else { - let path_len = (flags & entry::mask::PATH_LEN) as usize; - split_at_pos(data, path_len)? - }; - - path_backing.extend_from_slice(path); - skip_padding(data) - }; - let path_range = start..path_backing.len(); - - Some(( - Entry { - stat: entry::Stat { - ctime: entry::Time { - secs: ctime_secs, - nsecs: ctime_nsecs, - }, - mtime: entry::Time { - secs: mtime_secs, - nsecs: mtime_nsecs, - }, - dev, - ino, - mode, - uid, - gid, - size, - }, - id: git_hash::ObjectId::from(hash), - flags: flags & !entry::mask::PATH_LEN, - path: path_range, - }, - data, - )) -} - -#[inline] -fn skip_padding(data: &[u8]) -> &[u8] { - let skip = data.iter().take_while(|b| **b == 0).count(); - &data[skip..] -} - -#[inline] -fn read_u32(data: &[u8]) -> Option<(u32, &[u8])> { - split_at_pos(data, 4).map(|(num, data)| (u32::from_be_bytes(num.try_into().unwrap()), data)) -} -#[inline] -fn read_u16(data: &[u8]) -> Option<(u16, &[u8])> { - split_at_pos(data, 2).map(|(num, data)| (u16::from_be_bytes(num.try_into().unwrap()), data)) -} - -fn load_extensions(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> (Extensions, &[u8]) { - extension::Iter::new_without_checksum(beginning_of_extensions, object_hash) - .map(|mut ext_iter| { - let mut ext = Extensions::default(); - for (signature, ext_data) in ext_iter.by_ref() { - match signature { - extension::tree::SIGNATURE => { - ext.cache_tree = extension::tree::decode(ext_data, object_hash); - } - extension::end_of_index_entry::SIGNATURE => {} // skip already done - _unknown => {} // skip unknown extensions, too - } - } - (ext, &beginning_of_extensions[ext_iter.consumed..]) - }) - .unwrap_or_else(|| (Extensions::default(), beginning_of_extensions)) -} - -#[derive(Default)] -struct Extensions { - cache_tree: Option, -} 
diff --git a/git-index/src/decode/entries.rs b/git-index/src/decode/entries.rs new file mode 100644 index 00000000000..fac92be0669 --- /dev/null +++ b/git-index/src/decode/entries.rs @@ -0,0 +1,195 @@ +use crate::util::{split_at_byte_exclusive, split_at_pos}; +use crate::{ + decode::{header, Error}, + entry, Entry, Version, +}; +use std::ops::Range; + +/// a guess directly from git sources +pub const AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES: usize = 80; + +pub struct Outcome { + pub entries: Vec, + pub path_backing: Vec, + pub is_sparse: bool, +} + +pub fn estimate_path_storage_requirements_in_bytes( + num_entries: u32, + on_disk_size: usize, + offset_to_extensions: Option, + object_hash: git_hash::Kind, + version: Version, +) -> usize { + const fn on_disk_entry_sans_path(object_hash: git_hash::Kind) -> usize { + 8 + // ctime + 8 + // mtime + (4 * 6) + // various stat fields + 2 + // flag, ignore extended flag as we'd rather overallocate a bit + object_hash.len_in_bytes() + }; + match version { + Version::V3 | Version::V2 => { + let size_of_entries_block = offset_to_extensions.unwrap_or(on_disk_size); + size_of_entries_block + .saturating_sub(num_entries as usize * on_disk_entry_sans_path(object_hash)) + .saturating_sub(header::SIZE) + } + Version::V4 => num_entries as usize * AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES, + } +} + +/// Note that `data` must point to the beginning of the entries, right past the header. 
+pub fn load_all( + mut data: &[u8], + num_entries: u32, + path_backing_capacity: usize, + object_hash: git_hash::Kind, + version: Version, +) -> Result<(Outcome, &[u8]), Error> { + let mut path_backing = Vec::::with_capacity(path_backing_capacity); + let mut entries = Vec::::with_capacity(num_entries as usize); + let mut is_sparse = false; + let has_delta_paths = version == Version::V4; + let mut prev_path = None; + let mut delta_buf = Vec::::with_capacity(AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES); + + for idx in 0..num_entries { + let (entry, remaining) = load_one( + data, + &mut path_backing, + object_hash.len_in_bytes(), + has_delta_paths, + prev_path, + ) + .ok_or(Error::Entry(idx))?; + + data = remaining; + if entry::mode::is_sparse(entry.stat.mode) { + is_sparse = true; + } + // TODO: entries are actually in an intrusive collection, with path as key. Could be set for us. This affects 'ignore_case' which we + // also don't yet handle but probably could, maybe even smartly with the collection. + // For now it's unclear to me how they access the index, they could iterate quickly, and have fast access by path. 
+ entries.push(entry); + prev_path = entries.last().map(|e| (e.path.clone(), &mut delta_buf)); + } + + Ok(( + Outcome { + entries, + path_backing, + is_sparse, + }, + data, + )) +} + +/// Note that `prev_path` is only useful if the version is V4 +fn load_one<'a>( + data: &'a [u8], + path_backing: &mut Vec, + hash_len: usize, + has_delta_paths: bool, + prev_path_and_buf: Option<(Range, &mut Vec)>, +) -> Option<(Entry, &'a [u8])> { + let (ctime_secs, data) = read_u32(data)?; + let (ctime_nsecs, data) = read_u32(data)?; + let (mtime_secs, data) = read_u32(data)?; + let (mtime_nsecs, data) = read_u32(data)?; + let (dev, data) = read_u32(data)?; + let (ino, data) = read_u32(data)?; + let (mode, data) = read_u32(data)?; + let (uid, data) = read_u32(data)?; + let (gid, data) = read_u32(data)?; + let (size, data) = read_u32(data)?; + let (hash, data) = split_at_pos(data, hash_len)?; + let (flags, data) = read_u16(data)?; + let flags = flags as u32; + let (flags, data) = if flags & entry::flags::EXTENDED == entry::flags::EXTENDED { + let (mut extended_flags, data) = read_u16(data)?; + let extended_flags: u32 = (extended_flags as u32) << 16; + const ALL_KNOWN_EXTENDED_FLAGS: u32 = entry::flags::INTENT_TO_ADD | entry::flags::SKIP_WORKTREE; + assert_eq!( + extended_flags & !ALL_KNOWN_EXTENDED_FLAGS, + 0, + "BUG: encountered unknown extended bitflags in {:b}", + extended_flags + ); + (flags | extended_flags, data) + } else { + (flags, data) + }; + + let start = path_backing.len(); + let data = if has_delta_paths { + let (strip_len, consumed) = git_features::decode::leb64(data); + let data = &data[consumed..]; + if let Some((prev_path, buf)) = prev_path_and_buf { + let end = prev_path.end.checked_sub(strip_len.try_into().ok()?)?; + let copy_len = end.checked_sub(prev_path.start)?; + if copy_len > 0 { + buf.resize(copy_len, 0); + buf.copy_from_slice(&path_backing[prev_path.start..end]); + path_backing.extend_from_slice(buf); + } + } + + let (path, data) = 
split_at_byte_exclusive(data, 0)?; + path_backing.extend_from_slice(path); + + data + } else { + let (path, data) = if (flags & entry::mask::PATH_LEN) == entry::mask::PATH_LEN { + split_at_byte_exclusive(data, 0)? + } else { + let path_len = (flags & entry::mask::PATH_LEN) as usize; + split_at_pos(data, path_len)? + }; + + path_backing.extend_from_slice(path); + skip_padding(data) + }; + let path_range = start..path_backing.len(); + + Some(( + Entry { + stat: entry::Stat { + ctime: entry::Time { + secs: ctime_secs, + nsecs: ctime_nsecs, + }, + mtime: entry::Time { + secs: mtime_secs, + nsecs: mtime_nsecs, + }, + dev, + ino, + mode, + uid, + gid, + size, + }, + id: git_hash::ObjectId::from(hash), + flags: flags & !entry::mask::PATH_LEN, + path: path_range, + }, + data, + )) +} + +#[inline] +fn skip_padding(data: &[u8]) -> &[u8] { + let skip = data.iter().take_while(|b| **b == 0).count(); + &data[skip..] +} + +#[inline] +fn read_u32(data: &[u8]) -> Option<(u32, &[u8])> { + split_at_pos(data, 4).map(|(num, data)| (u32::from_be_bytes(num.try_into().unwrap()), data)) +} + +#[inline] +fn read_u16(data: &[u8]) -> Option<(u16, &[u8])> { + split_at_pos(data, 2).map(|(num, data)| (u16::from_be_bytes(num.try_into().unwrap()), data)) +} diff --git a/git-index/src/decode/header.rs b/git-index/src/decode/header.rs new file mode 100644 index 00000000000..097807dd511 --- /dev/null +++ b/git-index/src/decode/header.rs @@ -0,0 +1,49 @@ +pub(crate) const SIZE: usize = 4 /*signature*/ + 4 /*version*/ + 4 /* num entries */; + +use crate::{util::from_be_u32, Version}; + +mod error { + use quick_error::quick_error; + + quick_error! 
{ + #[derive(Debug)] + pub enum Error { + Corrupt(message: &'static str) { + display("{}", message) + } + UnsupportedVersion(version: u32) { + display("Index version {} is not supported", version) + } + } + } +} + +pub use error::Error; + +pub(crate) fn decode(data: &[u8], object_hash: git_hash::Kind) -> Result<(crate::Version, u32, &[u8]), Error> { + if data.len() < (3 * 4) + object_hash.len_in_bytes() { + return Err(Error::Corrupt( + "File is too small even for header with zero entries and smallest hash", + )); + } + + const SIGNATURE: &[u8] = b"DIRC"; + let (signature, data) = data.split_at(4); + if signature != SIGNATURE { + return Err(Error::Corrupt( + "Signature mismatch - this doesn't claim to be a header file", + )); + } + + let (version, data) = data.split_at(4); + let version = match from_be_u32(version) { + 2 => Version::V2, + 3 => Version::V3, + 4 => Version::V4, + unknown => return Err(Error::UnsupportedVersion(unknown)), + }; + let (entries, data) = data.split_at(4); + let entries = from_be_u32(entries); + + Ok((version, entries, data)) +} diff --git a/git-index/src/decode/mod.rs b/git-index/src/decode/mod.rs new file mode 100644 index 00000000000..1009e41e51b --- /dev/null +++ b/git-index/src/decode/mod.rs @@ -0,0 +1,109 @@ +use bstr::{BStr, ByteSlice}; +use filetime::FileTime; +use git_hash::Kind; +use std::ops::Range; + +use crate::util::{from_be_u32, split_at_byte_exclusive, split_at_pos}; +use crate::{entry, extension, Entry, State, Version}; + +mod entries; +pub mod header; + +mod error { + use quick_error::quick_error; + + use crate::decode; + + quick_error! 
{ + #[derive(Debug)] + pub enum Error { + Header(err: decode::header::Error) { + display("The header could not be decoded") + source(err) + from() + } + Entry(index: u32) { + display("Could not parse entry at index {}", index) + } + UnexpectedTrailerLength { expected: usize, actual: usize } { + display("Index trailer should have been {} bytes long, but was {}", expected, actual) + } + } + } +} +pub use error::Error; + +impl State { + pub fn from_bytes( + data: &[u8], + timestamp: FileTime, + object_hash: git_hash::Kind, + ) -> Result<(Self, git_hash::ObjectId), Error> { + let (version, num_entries, post_header_data) = header::decode(data, object_hash)?; + let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash); + + let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes( + num_entries, + data.len(), + start_of_extensions, + object_hash, + version, + ); + let (entries, ext, data) = match start_of_extensions { + Some(offset) => { + let (entries_res, (ext, data)) = git_features::parallel::join( + // TODO load all extensions in thread, then get IEOT, then possibly multi-threaded entry parsing + || { + entries::load_all( + post_header_data, + num_entries, + path_backing_buffer_size, + object_hash, + version, + ) + }, + || extension::decode::all(&data[offset..], object_hash), + ); + (entries_res?.0, ext, data) + } + None => { + let (entries, data) = entries::load_all( + post_header_data, + num_entries, + path_backing_buffer_size, + object_hash, + version, + )?; + let (ext, data) = extension::decode::all(data, object_hash); + (entries, ext, data) + } + }; + + if data.len() != object_hash.len_in_bytes() { + return Err(Error::UnexpectedTrailerLength { + expected: object_hash.len_in_bytes(), + actual: data.len(), + }); + } + + let checksum = git_hash::ObjectId::from(data); + let entries::Outcome { + entries, + path_backing, + is_sparse, + } = entries; + let extension::decode::Outcome { cache_tree } = ext; + + Ok(( + 
State { + timestamp, + version, + cache_tree, + entries, + path_backing, + is_sparse, + }, + checksum, + )) + } +} diff --git a/git-index/src/extension/decode.rs b/git-index/src/extension/decode.rs new file mode 100644 index 00000000000..90e7f953332 --- /dev/null +++ b/git-index/src/extension/decode.rs @@ -0,0 +1,32 @@ +use crate::extension; +use crate::extension::Signature; +use crate::util::from_be_u32; + +pub fn header(data: &[u8]) -> (Signature, u32, &[u8]) { + let (signature, data) = data.split_at(4); + let (size, data) = data.split_at(4); + (signature.try_into().unwrap(), from_be_u32(size), data) +} + +pub fn all(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> (Outcome, &[u8]) { + extension::Iter::new_without_checksum(beginning_of_extensions, object_hash) + .map(|mut ext_iter| { + let mut ext = Outcome::default(); + for (signature, ext_data) in ext_iter.by_ref() { + match signature { + extension::tree::SIGNATURE => { + ext.cache_tree = extension::tree::decode(ext_data, object_hash); + } + extension::end_of_index_entry::SIGNATURE => {} // skip already done + _unknown => {} // skip unknown extensions, too + } + } + (ext, &beginning_of_extensions[ext_iter.consumed..]) + }) + .unwrap_or_else(|| (Outcome::default(), beginning_of_extensions)) +} + +#[derive(Default)] +pub struct Outcome { + pub cache_tree: Option, +} diff --git a/git-index/src/extension/end_of_index_entry.rs b/git-index/src/extension/end_of_index_entry.rs new file mode 100644 index 00000000000..c44d15b295c --- /dev/null +++ b/git-index/src/extension/end_of_index_entry.rs @@ -0,0 +1,47 @@ +use crate::{decode::header, extension, extension::Signature, util::from_be_u32}; + +pub const SIGNATURE: Signature = *b"EOIE"; +pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); +pub const SIZE_WITH_HEADER: usize = crate::extension::MIN_SIZE + SIZE; + +pub fn decode(data: &[u8], object_hash: git_hash::Kind) -> Option { + let hash_len = 
object_hash.len_in_bytes(); + if data.len() < SIZE_WITH_HEADER + hash_len { + return None; + } + + let start_of_eoie = data.len() - SIZE_WITH_HEADER - hash_len; + let ext_data = &data[start_of_eoie..data.len() - hash_len]; + + let (signature, ext_size, ext_data) = extension::decode::header(ext_data); + if signature != SIGNATURE || ext_size as usize != SIZE { + return None; + } + + let (offset, checksum) = ext_data.split_at(4); + let offset = from_be_u32(offset) as usize; + if offset < header::SIZE || offset > start_of_eoie || checksum.len() != git_hash::Kind::Sha1.len_in_bytes() { + return None; + } + + let mut hasher = git_features::hash::hasher(git_hash::Kind::Sha1); + let mut last_chunk = None; + for (signature, chunk) in extension::Iter::new(&data[offset..data.len() - SIZE_WITH_HEADER - hash_len]) { + hasher.update(&signature); + hasher.update(&(chunk.len() as u32).to_be_bytes()); + last_chunk = Some(chunk); + } + + if hasher.digest() != checksum { + return None; + } + // The last-to-this chunk ends where ours starts + if last_chunk + .map(|s| s.as_ptr_range().end != (&data[start_of_eoie]) as *const _) + .unwrap_or(true) + { + return None; + } + + Some(offset) +} diff --git a/git-index/src/extension/iter.rs b/git-index/src/extension/iter.rs new file mode 100644 index 00000000000..5e3d2dd136a --- /dev/null +++ b/git-index/src/extension/iter.rs @@ -0,0 +1,53 @@ +use crate::{extension, extension::Iter, util::from_be_u32}; + +impl<'a> Iter<'a> { + pub fn new(data_at_beginning_of_extensions_and_truncated: &'a [u8]) -> Self { + Iter { + data: data_at_beginning_of_extensions_and_truncated, + consumed: 0, + } + } + + pub fn new_without_checksum( + data_at_beginning_of_extensions: &'a [u8], + object_hash: git_hash::Kind, + ) -> Option { + let end = data_at_beginning_of_extensions + .len() + .checked_sub(object_hash.len_in_bytes())?; + Iter { + data: &data_at_beginning_of_extensions[..end], + consumed: 0, + } + .into() + } +} + +impl<'a> Iterator for Iter<'a> { + type 
Item = (extension::Signature, &'a [u8]); + + fn next(&mut self) -> Option { + if self.data.len() < 4 + 4 { + return None; + } + + let (signature, data) = self.data.split_at(4); + let (size, data) = data.split_at(4); + self.data = data; + self.consumed += 4 + 4; + + let size = from_be_u32(size) as usize; + + match data.get(..size) { + Some(ext_data) => { + self.data = &data[size..]; + self.consumed += size; + Some((signature.try_into().unwrap(), ext_data)) + } + None => { + self.data = &[]; + None + } + } + } +} diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index ac0bd4703a8..7d0aeafb322 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -6,12 +6,6 @@ const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; pub type Signature = [u8; 4]; -fn decode_header(data: &[u8]) -> (Signature, u32, &[u8]) { - let (signature, data) = data.split_at(4); - let (size, data) = data.split_at(4); - (signature.try_into().unwrap(), from_be_u32(size), data) -} - /// A structure to associate object ids of a tree with sections in the index entries list. 
/// /// It allows to more quickly build trees by avoiding as it can quickly re-use portions of the index and its associated tree ids @@ -23,113 +17,13 @@ pub struct Tree { children: Vec, } -pub(crate) mod tree; - -pub(crate) mod end_of_index_entry { - use crate::{decode::header, extension, extension::Signature, util::from_be_u32}; - - pub const SIGNATURE: Signature = *b"EOIE"; - pub const SIZE: usize = 4 /* offset to extensions */ + git_hash::Kind::Sha1.len_in_bytes(); - pub const SIZE_WITH_HEADER: usize = crate::extension::MIN_SIZE + SIZE; - - pub fn decode(data: &[u8], object_hash: git_hash::Kind) -> Option { - let hash_len = object_hash.len_in_bytes(); - if data.len() < SIZE_WITH_HEADER + hash_len { - return None; - } - - let start_of_eoie = data.len() - SIZE_WITH_HEADER - hash_len; - let ext_data = &data[start_of_eoie..data.len() - hash_len]; - - let (signature, ext_size, ext_data) = extension::decode_header(ext_data); - if signature != SIGNATURE || ext_size as usize != SIZE { - return None; - } - - let (offset, checksum) = ext_data.split_at(4); - let offset = from_be_u32(offset) as usize; - if offset < header::SIZE || offset > start_of_eoie || checksum.len() != git_hash::Kind::Sha1.len_in_bytes() { - return None; - } - - let mut hasher = git_features::hash::hasher(git_hash::Kind::Sha1); - let mut last_chunk = None; - for (signature, chunk) in extension::Iter::new(&data[offset..data.len() - SIZE_WITH_HEADER - hash_len]) { - hasher.update(&signature); - hasher.update(&(chunk.len() as u32).to_be_bytes()); - last_chunk = Some(chunk); - } - - if hasher.digest() != checksum { - return None; - } - // The last-to-this chunk ends where ours starts - if last_chunk - .map(|s| s.as_ptr_range().end != (&data[start_of_eoie]) as *const _) - .unwrap_or(true) - { - return None; - } - - Some(offset) - } -} - -mod iter { - use crate::{extension, extension::Iter, util::from_be_u32}; - - impl<'a> Iter<'a> { - pub fn new(data_at_beginning_of_extensions_and_truncated: &'a [u8]) -> 
Self { - Iter { - data: data_at_beginning_of_extensions_and_truncated, - consumed: 0, - } - } +mod iter; - pub fn new_without_checksum( - data_at_beginning_of_extensions: &'a [u8], - object_hash: git_hash::Kind, - ) -> Option { - let end = data_at_beginning_of_extensions - .len() - .checked_sub(object_hash.len_in_bytes())?; - Iter { - data: &data_at_beginning_of_extensions[..end], - consumed: 0, - } - .into() - } - } +pub(crate) mod decode; - impl<'a> Iterator for Iter<'a> { - type Item = (extension::Signature, &'a [u8]); - - fn next(&mut self) -> Option { - if self.data.len() < 4 + 4 { - return None; - } - - let (signature, data) = self.data.split_at(4); - let (size, data) = data.split_at(4); - self.data = data; - self.consumed += 4 + 4; - - let size = from_be_u32(size) as usize; +pub(crate) mod tree; - match data.get(..size) { - Some(ext_data) => { - self.data = &data[size..]; - self.consumed += size; - Some((signature.try_into().unwrap(), ext_data)) - } - None => { - self.data = &[]; - None - } - } - } - } -} +pub(crate) mod end_of_index_entry; pub struct Iter<'a> { data: &'a [u8], From 35bdee4bf77787bcbe6c3dd715a677e2e46a8ad1 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 20:38:17 +0800 Subject: [PATCH 47/57] Basic IEOT parsing (#293) Now for actually using it, that needs some work. 
--- git-index/src/decode/entries.rs | 13 +++----- git-index/src/decode/mod.rs | 44 +++++++++++++++++--------- git-index/src/extension/decode.rs | 5 ++- git-index/src/extension/mod.rs | 52 +++++++++++++++++++++++++++++-- git-index/src/lib.rs | 5 +++ git-index/tests/file/mod.rs | 1 + git-pack/src/data/entry/decode.rs | 3 +- 7 files changed, 93 insertions(+), 30 deletions(-) diff --git a/git-index/src/decode/entries.rs b/git-index/src/decode/entries.rs index fac92be0669..8f04c3afea7 100644 --- a/git-index/src/decode/entries.rs +++ b/git-index/src/decode/entries.rs @@ -1,9 +1,11 @@ -use crate::util::{split_at_byte_exclusive, split_at_pos}; +use std::ops::Range; + use crate::{ decode::{header, Error}, - entry, Entry, Version, + entry, + util::{read_u32, split_at_byte_exclusive, split_at_pos}, + Entry, Version, }; -use std::ops::Range; /// a guess directly from git sources pub const AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES: usize = 80; @@ -184,11 +186,6 @@ fn skip_padding(data: &[u8]) -> &[u8] { &data[skip..] 
} -#[inline] -fn read_u32(data: &[u8]) -> Option<(u32, &[u8])> { - split_at_pos(data, 4).map(|(num, data)| (u32::from_be_bytes(num.try_into().unwrap()), data)) -} - #[inline] fn read_u16(data: &[u8]) -> Option<(u16, &[u8])> { split_at_pos(data, 2).map(|(num, data)| (u16::from_be_bytes(num.try_into().unwrap()), data)) diff --git a/git-index/src/decode/mod.rs b/git-index/src/decode/mod.rs index 1009e41e51b..1e5dccc573e 100644 --- a/git-index/src/decode/mod.rs +++ b/git-index/src/decode/mod.rs @@ -1,10 +1,14 @@ +use std::ops::Range; + use bstr::{BStr, ByteSlice}; use filetime::FileTime; use git_hash::Kind; -use std::ops::Range; -use crate::util::{from_be_u32, split_at_byte_exclusive, split_at_pos}; -use crate::{entry, extension, Entry, State, Version}; +use crate::{ + entry, extension, + util::{from_be_u32, split_at_byte_exclusive, split_at_pos}, + Entry, State, Version, +}; mod entries; pub mod header; @@ -51,19 +55,29 @@ impl State { ); let (entries, ext, data) = match start_of_extensions { Some(offset) => { - let (entries_res, (ext, data)) = git_features::parallel::join( - // TODO load all extensions in thread, then get IEOT, then possibly multi-threaded entry parsing - || { - entries::load_all( - post_header_data, - num_entries, - path_backing_buffer_size, - object_hash, - version, + let start_of_extensions = &data[offset..]; + let index_offsets_table = extension::index_entry_offset_table::find(start_of_extensions, object_hash); + let (entries_res, (ext, data)) = match index_offsets_table { + Some(entry_offsets) => { + dbg!(entry_offsets); + todo!("threaded entry loading if its worth it") + } + None => { + git_features::parallel::join( + // TODO load all extensions in scoped, then get IEOT, then possibly multi-threaded entry parsing + || { + entries::load_all( + post_header_data, + num_entries, + path_backing_buffer_size, + object_hash, + version, + ) + }, + || extension::decode::all(start_of_extensions, object_hash), ) - }, - || 
extension::decode::all(&data[offset..], object_hash), - ); + } + }; (entries_res?.0, ext, data) } None => { diff --git a/git-index/src/extension/decode.rs b/git-index/src/extension/decode.rs index 90e7f953332..a9ea372b506 100644 --- a/git-index/src/extension/decode.rs +++ b/git-index/src/extension/decode.rs @@ -1,6 +1,4 @@ -use crate::extension; -use crate::extension::Signature; -use crate::util::from_be_u32; +use crate::{extension, extension::Signature, util::from_be_u32}; pub fn header(data: &[u8]) -> (Signature, u32, &[u8]) { let (signature, data) = data.split_at(4); @@ -18,6 +16,7 @@ pub fn all(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> (Outc ext.cache_tree = extension::tree::decode(ext_data, object_hash); } extension::end_of_index_entry::SIGNATURE => {} // skip already done + extension::index_entry_offset_table::SIGNATURE => {} // not relevant/obtained already _unknown => {} // skip unknown extensions, too } } diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index 7d0aeafb322..179bebfc16f 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -6,6 +6,11 @@ const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; pub type Signature = [u8; 4]; +pub struct Iter<'a> { + data: &'a [u8], + pub consumed: usize, +} + /// A structure to associate object ids of a tree with sections in the index entries list. 
/// /// It allows to more quickly build trees by avoiding as it can quickly re-use portions of the index and its associated tree ids @@ -25,7 +30,48 @@ pub(crate) mod tree; pub(crate) mod end_of_index_entry; -pub struct Iter<'a> { - data: &'a [u8], - pub consumed: usize, +pub(crate) mod index_entry_offset_table { + use crate::{extension, extension::Signature, util::read_u32}; + + #[derive(Debug)] + pub struct Offset { + pub from_beginning_of_file: u32, + pub num_entries: u32, + } + + pub const SIGNATURE: Signature = *b"IEOT"; + + pub fn decode(data: &[u8]) -> Option> { + let (version, mut data) = read_u32(data)?; + match version { + 1 => {} + _unknown => return None, + } + + let entry_size = (4 + 4); + let num_offsets = data.len() / entry_size; + if num_offsets == 0 || data.len() % entry_size != 0 { + return None; + } + + let mut out = Vec::with_capacity(entry_size); + for _ in 0..num_offsets { + let (offset, chunk) = read_u32(data)?; + let (num_entries, chunk) = read_u32(chunk)?; + out.push(Offset { + from_beginning_of_file: offset, + num_entries, + }); + data = chunk; + } + debug_assert!(data.is_empty()); + + out.into() + } + + pub fn find(extensions: &[u8], object_hash: git_hash::Kind) -> Option> { + extension::Iter::new_without_checksum(extensions, object_hash)? 
+ .find_map(|(sig, ext_data)| (sig == SIGNATURE).then(|| ext_data)) + .and_then(decode) + } } diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index c27545fdee6..7fa92622900 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -73,6 +73,11 @@ pub struct State { } pub(crate) mod util { + #[inline] + pub fn read_u32(data: &[u8]) -> Option<(u32, &[u8])> { + split_at_pos(data, 4).map(|(num, data)| (u32::from_be_bytes(num.try_into().unwrap()), data)) + } + #[inline] pub fn from_be_u32(b: &[u8]) -> u32 { u32::from_be_bytes(b.try_into().unwrap()) diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index faff7cdef65..237e1e59154 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -36,6 +36,7 @@ mod init { fn read_without_any_extension() {} #[test] + #[ignore] fn read_v4_with_delta_paths_and_ieot_ext() { let file = file("v4_more_files_IEOT"); assert_eq!(file.version(), Version::V4); diff --git a/git-pack/src/data/entry/decode.rs b/git-pack/src/data/entry/decode.rs index 22bf362d19d..646d5ffea18 100644 --- a/git-pack/src/data/entry/decode.rs +++ b/git-pack/src/data/entry/decode.rs @@ -1,6 +1,7 @@ -use git_features::decode::{leb64, leb64_from_read}; use std::io; +use git_features::decode::{leb64, leb64_from_read}; + use super::{BLOB, COMMIT, OFS_DELTA, REF_DELTA, TAG, TREE}; use crate::data; From 99d7224baa04c199a7eb7aa2675b39657b0aef6a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 20:55:12 +0800 Subject: [PATCH 48/57] cleanup (#293) --- git-index/src/decode/entries.rs | 4 ++-- git-index/src/decode/mod.rs | 10 +--------- git-index/src/extension/mod.rs | 4 +--- git-index/src/lib.rs | 2 +- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/git-index/src/decode/entries.rs b/git-index/src/decode/entries.rs index 8f04c3afea7..ec8f07728fd 100644 --- a/git-index/src/decode/entries.rs +++ b/git-index/src/decode/entries.rs @@ -29,7 +29,7 @@ pub fn 
estimate_path_storage_requirements_in_bytes( (4 * 6) + // various stat fields 2 + // flag, ignore extended flag as we'd rather overallocate a bit object_hash.len_in_bytes() - }; + } match version { Version::V3 | Version::V2 => { let size_of_entries_block = offset_to_extensions.unwrap_or(on_disk_size); @@ -109,7 +109,7 @@ fn load_one<'a>( let (flags, data) = read_u16(data)?; let flags = flags as u32; let (flags, data) = if flags & entry::flags::EXTENDED == entry::flags::EXTENDED { - let (mut extended_flags, data) = read_u16(data)?; + let (extended_flags, data) = read_u16(data)?; let extended_flags: u32 = (extended_flags as u32) << 16; const ALL_KNOWN_EXTENDED_FLAGS: u32 = entry::flags::INTENT_TO_ADD | entry::flags::SKIP_WORKTREE; assert_eq!( diff --git a/git-index/src/decode/mod.rs b/git-index/src/decode/mod.rs index 1e5dccc573e..6de2f95720b 100644 --- a/git-index/src/decode/mod.rs +++ b/git-index/src/decode/mod.rs @@ -1,14 +1,6 @@ -use std::ops::Range; - -use bstr::{BStr, ByteSlice}; use filetime::FileTime; -use git_hash::Kind; -use crate::{ - entry, extension, - util::{from_be_u32, split_at_byte_exclusive, split_at_pos}, - Entry, State, Version, -}; +use crate::{extension, State}; mod entries; pub mod header; diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index 179bebfc16f..260758198a2 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -1,7 +1,5 @@ use smallvec::SmallVec; -use crate::{util::from_be_u32, Version}; - const MIN_SIZE: usize = 4 /* signature */ + 4 /* size */; pub type Signature = [u8; 4]; @@ -48,7 +46,7 @@ pub(crate) mod index_entry_offset_table { _unknown => return None, } - let entry_size = (4 + 4); + let entry_size = 4 + 4; let num_offsets = data.len() / entry_size; if num_offsets == 0 || data.len() % entry_size != 0 { return None; diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index 7fa92622900..ef3d0aaa214 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -1,5 
+1,5 @@ #![deny(unsafe_code, missing_docs, rust_2018_idioms)] -#![allow(missing_docs, unused)] +#![allow(missing_docs, dead_code)] use std::{ops::Range, path::PathBuf}; From 30de988f6a97177fcb32ffce37f4c80f46306a20 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 21:18:13 +0800 Subject: [PATCH 49/57] prepare decode options for better control of threads (#293) --- git-index/src/decode/mod.rs | 19 ++++++++++++++++++- git-index/src/file.rs | 6 +++--- git-index/src/lib.rs | 4 +++- git-index/tests/file/mod.rs | 2 +- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/git-index/src/decode/mod.rs b/git-index/src/decode/mod.rs index 6de2f95720b..51a040cbe28 100644 --- a/git-index/src/decode/mod.rs +++ b/git-index/src/decode/mod.rs @@ -29,11 +29,28 @@ mod error { } pub use error::Error; +#[derive(Default)] +pub struct Options { + pub object_hash: git_hash::Kind, + /// If Some(_), we are allowed to use more than one thread. If Some(N), use no more than N threads. If Some(0)|None, use as many threads + /// as there are physical cores. + /// + /// This applies to loading extensions in parallel to entries if the common EOIE extension is available. + /// It also allows to use multiple threads for loading entries if the IEOT extension is present. + pub num_threads: Option, + /// The minimum size in bytes to load extensions in their own thread, assuming there is enough `num_threads` available. 
+ pub min_extension_block_in_bytes_for_threading: usize, +} + impl State { pub fn from_bytes( data: &[u8], timestamp: FileTime, - object_hash: git_hash::Kind, + Options { + object_hash, + num_threads: _, + min_extension_block_in_bytes_for_threading: _, + }: Options, ) -> Result<(Self, git_hash::ObjectId), Error> { let (version, num_entries, post_header_data) = header::decode(data, object_hash)?; let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash); diff --git a/git-index/src/file.rs b/git-index/src/file.rs index f4570e7510c..6f087752a17 100644 --- a/git-index/src/file.rs +++ b/git-index/src/file.rs @@ -25,7 +25,7 @@ pub mod init { use memmap2::Mmap; - use crate::{extension, File, State}; + use crate::{decode, extension, File, State}; mod error { use quick_error::quick_error; @@ -49,7 +49,7 @@ pub mod init { pub use error::Error; impl File { - pub fn at(path: impl Into, object_hash: git_hash::Kind) -> Result { + pub fn at(path: impl Into, options: decode::Options) -> Result { let path = path.into(); let (data, mtime) = { // SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file. @@ -59,7 +59,7 @@ pub mod init { (data, filetime::FileTime::from_last_modification_time(&file.metadata()?)) }; - let (state, checksum) = State::from_bytes(&data, mtime, object_hash)?; + let (state, checksum) = State::from_bytes(&data, mtime, options)?; Ok(File { state, path, checksum }) } } diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index ef3d0aaa214..f8793b489ed 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -30,10 +30,12 @@ pub mod decode; /// All known versions of a git index file. #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] -#[allow(missing_docs)] pub enum Version { + /// Supports entries and various extensions. 
V2 = 2, + /// Adds support for additional flags for each entry. V3 = 3, + /// Supports deltified entry paths. V4 = 4, } diff --git a/git-index/tests/file/mod.rs b/git-index/tests/file/mod.rs index 237e1e59154..27907c21b40 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/file/mod.rs @@ -3,7 +3,7 @@ mod init { use git_testtools::hex_to_id; fn file(name: &str) -> git_index::File { - git_index::File::at(crate::index_fixture_path(name), git_hash::Kind::Sha1).unwrap() + git_index::File::at(crate::index_fixture_path(name), git_index::decode::Options::default()).unwrap() } #[test] From a22cb0f1ead9a2f32e43eb2fb378281e592a4ed3 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 11 Jan 2022 21:33:00 +0800 Subject: [PATCH 50/57] single and multi-threaded index tests (#293) --- Makefile | 2 ++ etc/check-package-size.sh | 2 +- git-features/src/parallel/mod.rs | 4 ++-- git-index/Cargo.toml | 13 +++++++++++++ git-index/src/decode/mod.rs | 10 ++++++---- git-index/src/lib.rs | 9 +++++++++ git-index/tests/index-multi-threaded.rs | 1 + git-index/tests/index-single-threaded.rs | 1 + git-index/tests/{ => index}/file/mod.rs | 2 +- git-index/tests/{index.rs => index/mod.rs} | 2 +- 10 files changed, 37 insertions(+), 9 deletions(-) create mode 100644 git-index/tests/index-multi-threaded.rs create mode 100644 git-index/tests/index-single-threaded.rs rename git-index/tests/{ => index}/file/mod.rs (94%) rename git-index/tests/{index.rs => index/mod.rs} (90%) diff --git a/Makefile b/Makefile index 4b158201ff9..8e8227296f7 100644 --- a/Makefile +++ b/Makefile @@ -140,6 +140,8 @@ unit-tests: ## run all unit tests cd git-object && cargo test && cargo test --features verbose-object-parsing-errors cd git-pack && cargo test --features internal-testing-to-avoid-being-run-by-cargo-test-all \ && cargo test --features "internal-testing-git-features-parallel" + cd git-index && cargo test --features internal-testing-to-avoid-being-run-by-cargo-test-all \ + && cargo test --features 
"internal-testing-git-features-parallel" cd git-packetline && cargo test \ && cargo test --features blocking-io,maybe-async/is_sync --test blocking-packetline \ && cargo test --features "async-io" --test async-packetline diff --git a/etc/check-package-size.sh b/etc/check-package-size.sh index a4029192647..43b25767e00 100755 --- a/etc/check-package-size.sh +++ b/etc/check-package-size.sh @@ -18,7 +18,7 @@ echo "in root: gitoxide CLI" #indent cargo diet -n --package-size-limit 25KB - fails right now because of dotted profile.dev.package (enter cargo-smart-release && indent cargo diet -n --package-size-limit 85KB) (enter git-actor && indent cargo diet -n --package-size-limit 5KB) -(enter git-index && indent cargo diet -n --package-size-limit 10KB) +(enter git-index && indent cargo diet -n --package-size-limit 15KB) (enter git-tempfile && indent cargo diet -n --package-size-limit 25KB) (enter git-lock && indent cargo diet -n --package-size-limit 15KB) (enter git-config && indent cargo diet -n --package-size-limit 65KB) diff --git a/git-features/src/parallel/mod.rs b/git-features/src/parallel/mod.rs index d3873b4392c..83c8c6944e2 100644 --- a/git-features/src/parallel/mod.rs +++ b/git-features/src/parallel/mod.rs @@ -111,7 +111,7 @@ pub fn optimize_chunk_size_and_thread_limit( /// Always returns 1, available when the `parallel` feature toggle is unset. #[cfg(not(feature = "parallel"))] -fn num_threads(_thread_limit: Option) -> usize { +pub fn num_threads(_thread_limit: Option) -> usize { 1 } @@ -119,7 +119,7 @@ fn num_threads(_thread_limit: Option) -> usize { /// /// Only available with the `parallel` feature toggle set. 
#[cfg(feature = "parallel")] -fn num_threads(thread_limit: Option) -> usize { +pub fn num_threads(thread_limit: Option) -> usize { let logical_cores = num_cpus::get(); thread_limit .map(|l| if l == 0 { logical_cores } else { l }) diff --git a/git-index/Cargo.toml b/git-index/Cargo.toml index 29f4ad6f4a3..d0729a13831 100644 --- a/git-index/Cargo.toml +++ b/git-index/Cargo.toml @@ -11,9 +11,22 @@ edition = "2021" doctest = false test = true +[[test]] +name = "multi-threaded" +path = "tests/index-multi-threaded.rs" +required-features = ["internal-testing-git-features-parallel"] + +[[test]] +name = "single-threaded" +path = "tests/index-single-threaded.rs" +required-features = ["internal-testing-to-avoid-being-run-by-cargo-test-all"] + [features] serde1 = ["serde"] +internal-testing-git-features-parallel = ["git-features/parallel"] +internal-testing-to-avoid-being-run-by-cargo-test-all = [] + # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] diff --git a/git-index/src/decode/mod.rs b/git-index/src/decode/mod.rs index 51a040cbe28..d5ddd92f084 100644 --- a/git-index/src/decode/mod.rs +++ b/git-index/src/decode/mod.rs @@ -37,7 +37,7 @@ pub struct Options { /// /// This applies to loading extensions in parallel to entries if the common EOIE extension is available. /// It also allows to use multiple threads for loading entries if the IEOT extension is present. - pub num_threads: Option, + pub thread_limit: Option, /// The minimum size in bytes to load extensions in their own thread, assuming there is enough `num_threads` available. 
pub min_extension_block_in_bytes_for_threading: usize, } @@ -48,13 +48,14 @@ impl State { timestamp: FileTime, Options { object_hash, - num_threads: _, + thread_limit, min_extension_block_in_bytes_for_threading: _, }: Options, ) -> Result<(Self, git_hash::ObjectId), Error> { let (version, num_entries, post_header_data) = header::decode(data, object_hash)?; let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash); + let num_threads = git_features::parallel::num_threads(thread_limit); let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes( num_entries, data.len(), @@ -62,8 +63,9 @@ impl State { object_hash, version, ); + let (entries, ext, data) = match start_of_extensions { - Some(offset) => { + Some(offset) if num_threads > 1 => { let start_of_extensions = &data[offset..]; let index_offsets_table = extension::index_entry_offset_table::find(start_of_extensions, object_hash); let (entries_res, (ext, data)) = match index_offsets_table { @@ -89,7 +91,7 @@ impl State { }; (entries_res?.0, ext, data) } - None => { + None | Some(_) => { let (entries, data) = entries::load_all( post_header_data, num_entries, diff --git a/git-index/src/lib.rs b/git-index/src/lib.rs index f8793b489ed..d7724084981 100644 --- a/git-index/src/lib.rs +++ b/git-index/src/lib.rs @@ -110,3 +110,12 @@ pub(crate) mod util { data.split_at(pos).into() } } + +#[test] +fn size_of_entry() { + assert_eq!(std::mem::size_of::(), 80); + + // the reason we have our own time is half the size. 
+ assert_eq!(std::mem::size_of::(), 8); + assert_eq!(std::mem::size_of::(), 16); +} diff --git a/git-index/tests/index-multi-threaded.rs b/git-index/tests/index-multi-threaded.rs new file mode 100644 index 00000000000..58bd56b4864 --- /dev/null +++ b/git-index/tests/index-multi-threaded.rs @@ -0,0 +1 @@ +mod index; diff --git a/git-index/tests/index-single-threaded.rs b/git-index/tests/index-single-threaded.rs new file mode 100644 index 00000000000..58bd56b4864 --- /dev/null +++ b/git-index/tests/index-single-threaded.rs @@ -0,0 +1 @@ +mod index; diff --git a/git-index/tests/file/mod.rs b/git-index/tests/index/file/mod.rs similarity index 94% rename from git-index/tests/file/mod.rs rename to git-index/tests/index/file/mod.rs index 27907c21b40..15d408b37d3 100644 --- a/git-index/tests/file/mod.rs +++ b/git-index/tests/index/file/mod.rs @@ -3,7 +3,7 @@ mod init { use git_testtools::hex_to_id; fn file(name: &str) -> git_index::File { - git_index::File::at(crate::index_fixture_path(name), git_index::decode::Options::default()).unwrap() + git_index::File::at(crate::index::fixture_path(name), git_index::decode::Options::default()).unwrap() } #[test] diff --git a/git-index/tests/index.rs b/git-index/tests/index/mod.rs similarity index 90% rename from git-index/tests/index.rs rename to git-index/tests/index/mod.rs index d32bfa60c2b..714788653f4 100644 --- a/git-index/tests/index.rs +++ b/git-index/tests/index/mod.rs @@ -2,7 +2,7 @@ use std::path::{Path, PathBuf}; mod file; -pub fn index_fixture_path(name: &str) -> PathBuf { +pub fn fixture_path(name: &str) -> PathBuf { let dir = git_testtools::scripted_fixture_repo_read_only(Path::new("make_index").join(name).with_extension("sh")) .expect("script works"); dir.join(".git").join("index") From ca095ed881db2a8f06a6b067dbaac17e923b0945 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 12 Jan 2022 09:40:43 +0800 Subject: [PATCH 51/57] feat: Make a scope-like abstraction available (#293) This allows more delicate 
threading control like is required for the index. --- git-features/src/parallel/in_parallel.rs | 11 +++++ git-features/src/parallel/mod.rs | 4 +- git-features/src/parallel/serial.rs | 59 ++++++++++++++++++++++-- 3 files changed, 69 insertions(+), 5 deletions(-) diff --git a/git-features/src/parallel/in_parallel.rs b/git-features/src/parallel/in_parallel.rs index 55f8935c40b..7eab2087662 100644 --- a/git-features/src/parallel/in_parallel.rs +++ b/git-features/src/parallel/in_parallel.rs @@ -10,6 +10,17 @@ pub fn join(left: impl FnOnce() -> O1 + Send, right: impl Fn .unwrap() } +/// Runs `f` with a scope to be used for spawning threads that will not outlive the function call. +/// That way it's possible to handle threads without needing the 'static lifetime for data they interact with. +/// +/// Note that the threads should not rely on actual parallelism as threading might be turned off entirely. +pub fn threads<'env, F, R>(f: F) -> std::thread::Result +where + F: FnOnce(&crossbeam_utils::thread::Scope<'env>) -> R, +{ + crossbeam_utils::thread::scope(f) +} + /// Read items from `input` and `consume` them in multiple threads, /// whose output output is collected by a `reducer`. Its task is to /// aggregate these outputs into the final result returned by this function with the benefit of not having to be thread-safe. 
diff --git a/git-features/src/parallel/mod.rs b/git-features/src/parallel/mod.rs index 83c8c6944e2..7a4aca40eac 100644 --- a/git-features/src/parallel/mod.rs +++ b/git-features/src/parallel/mod.rs @@ -35,11 +35,11 @@ #[cfg(feature = "parallel")] mod in_parallel; #[cfg(feature = "parallel")] -pub use in_parallel::{in_parallel, join}; +pub use in_parallel::{in_parallel, join, threads}; mod serial; #[cfg(not(feature = "parallel"))] -pub use serial::{in_parallel, join}; +pub use serial::{in_parallel, join, threads}; mod eager_iter; pub use eager_iter::{EagerIter, EagerIterIf}; diff --git a/git-features/src/parallel/serial.rs b/git-features/src/parallel/serial.rs index ddc6c3ef589..eb92aa0a724 100644 --- a/git-features/src/parallel/serial.rs +++ b/git-features/src/parallel/serial.rs @@ -1,10 +1,63 @@ use crate::parallel::Reduce; -/// Runs `left` and then `right`, one after another, returning their output when both are done. #[cfg(not(feature = "parallel"))] -pub fn join(left: impl FnOnce() -> O1, right: impl FnOnce() -> O2) -> (O1, O2) { - (left(), right()) +mod not_parallel { + /// Runs `left` and then `right`, one after another, returning their output when both are done. + pub fn join(left: impl FnOnce() -> O1, right: impl FnOnce() -> O2) -> (O1, O2) { + (left(), right()) + } + + /// A scope for spawning threads. + pub struct Scope<'env> { + _marker: std::marker::PhantomData<&'env mut &'env ()>, + } + + #[allow(unsafe_code)] + unsafe impl Sync for Scope<'_> {} + + impl<'env> Scope<'env> { + pub fn spawn<'scope, F, T>(&'scope self, f: F) -> ScopedJoinHandle<'scope, T> + where + F: FnOnce(&Scope<'env>) -> T, + F: Send + 'env, + T: Send + 'env, + { + ScopedJoinHandle { + result: f(self), + _marker: Default::default(), + } + } + } + + /// Runs `f` with a scope to be used for spawning threads that will not outlive the function call. + /// Note that this implementation will run the spawned functions immediately. 
+ pub fn threads<'env, F, R>(f: F) -> std::thread::Result + where + F: FnOnce(&Scope<'env>) -> R, + { + Ok(f(&Scope { + _marker: Default::default(), + })) + } + + /// A handle that can be used to join its scoped thread. + /// + /// This struct is created by the [`Scope::spawn`] method and the + /// [`ScopedThreadBuilder::spawn`] method. + pub struct ScopedJoinHandle<'scope, T> { + /// Holds the result of the inner closure. + result: T, + _marker: std::marker::PhantomData<&'scope mut &'scope ()>, + } + + impl ScopedJoinHandle<'_, T> { + pub fn join(self) -> std::thread::Result { + Ok(self.result) + } + } } +#[cfg(not(feature = "parallel"))] +pub use not_parallel::{join, threads, Scope, ScopedJoinHandle}; /// Read items from `input` and `consume` them in a single thread, producing an output to be collected by a `reducer`, /// whose task is to aggregate these outputs into the final result returned by this function. From 6fea17d1306679d0454d01aa59adf12cd83c7973 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 12 Jan 2022 09:41:11 +0800 Subject: [PATCH 52/57] Frame for using the new 'scoped threads' feature in git-features (#293) --- git-index/src/decode/mod.rs | 27 ++++++++++++++------------- git-index/tests/index/file/mod.rs | 1 - 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/git-index/src/decode/mod.rs b/git-index/src/decode/mod.rs index d5ddd92f084..fb32f13b50e 100644 --- a/git-index/src/decode/mod.rs +++ b/git-index/src/decode/mod.rs @@ -68,27 +68,28 @@ impl State { Some(offset) if num_threads > 1 => { let start_of_extensions = &data[offset..]; let index_offsets_table = extension::index_entry_offset_table::find(start_of_extensions, object_hash); - let (entries_res, (ext, data)) = match index_offsets_table { - Some(entry_offsets) => { - dbg!(entry_offsets); - todo!("threaded entry loading if its worth it") - } - None => { - git_features::parallel::join( + let (entries_res, (ext, data)) = git_features::parallel::threads(|_scope| { + match 
index_offsets_table { + Some(entry_offsets) => { + dbg!(entry_offsets); + todo!("threaded entry loading if its worth it") + } + None => { // TODO load all extensions in scoped, then get IEOT, then possibly multi-threaded entry parsing - || { + ( entries::load_all( post_header_data, num_entries, path_backing_buffer_size, object_hash, version, - ) - }, - || extension::decode::all(start_of_extensions, object_hash), - ) + ), + extension::decode::all(start_of_extensions, object_hash), + ) + } } - }; + }) + .unwrap(); // this unwrap is for panics - if these happened we are done anyway. (entries_res?.0, ext, data) } None | Some(_) => { diff --git a/git-index/tests/index/file/mod.rs b/git-index/tests/index/file/mod.rs index 15d408b37d3..af129dd95ad 100644 --- a/git-index/tests/index/file/mod.rs +++ b/git-index/tests/index/file/mod.rs @@ -36,7 +36,6 @@ mod init { fn read_without_any_extension() {} #[test] - #[ignore] fn read_v4_with_delta_paths_and_ieot_ext() { let file = file("v4_more_files_IEOT"); assert_eq!(file.version(), Version::V4); From de84a3a03bcc9dc3ff71810e35c869f9b73dd38f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 12 Jan 2022 12:07:41 +0800 Subject: [PATCH 53/57] parallel loading of entries right before reducing them (#293) --- git-index/src/decode/entries.rs | 28 ++----- git-index/src/decode/mod.rs | 132 ++++++++++++++++++++++++------ git-index/src/extension/mod.rs | 2 +- git-index/tests/index/file/mod.rs | 23 ++++-- 4 files changed, 133 insertions(+), 52 deletions(-) diff --git a/git-index/src/decode/entries.rs b/git-index/src/decode/entries.rs index ec8f07728fd..2b937c0cf51 100644 --- a/git-index/src/decode/entries.rs +++ b/git-index/src/decode/entries.rs @@ -1,7 +1,7 @@ use std::ops::Range; use crate::{ - decode::{header, Error}, + decode::{self, header}, entry, util::{read_u32, split_at_byte_exclusive, split_at_pos}, Entry, Version, @@ -11,8 +11,6 @@ use crate::{ pub const AVERAGE_V4_DELTA_PATH_LEN_IN_BYTES: usize = 80; pub struct Outcome { 
- pub entries: Vec, - pub path_backing: Vec, pub is_sparse: bool, } @@ -42,15 +40,14 @@ pub fn estimate_path_storage_requirements_in_bytes( } /// Note that `data` must point to the beginning of the entries, right past the header. -pub fn load_all( - mut data: &[u8], +pub fn load_chunk<'a>( + mut data: &'a [u8], + entries: &mut Vec, + path_backing: &mut Vec, num_entries: u32, - path_backing_capacity: usize, object_hash: git_hash::Kind, version: Version, -) -> Result<(Outcome, &[u8]), Error> { - let mut path_backing = Vec::::with_capacity(path_backing_capacity); - let mut entries = Vec::::with_capacity(num_entries as usize); +) -> Result<(Outcome, &'a [u8]), decode::Error> { let mut is_sparse = false; let has_delta_paths = version == Version::V4; let mut prev_path = None; @@ -59,12 +56,12 @@ pub fn load_all( for idx in 0..num_entries { let (entry, remaining) = load_one( data, - &mut path_backing, + path_backing, object_hash.len_in_bytes(), has_delta_paths, prev_path, ) - .ok_or(Error::Entry(idx))?; + .ok_or(decode::Error::Entry(idx))?; data = remaining; if entry::mode::is_sparse(entry.stat.mode) { @@ -77,14 +74,7 @@ pub fn load_all( prev_path = entries.last().map(|e| (e.path.clone(), &mut delta_buf)); } - Ok(( - Outcome { - entries, - path_backing, - is_sparse, - }, - data, - )) + Ok((Outcome { is_sparse }, data)) } /// Note that `prev_path` is only useful if the version is V4 diff --git a/git-index/src/decode/mod.rs b/git-index/src/decode/mod.rs index fb32f13b50e..9e93cfe3e2c 100644 --- a/git-index/src/decode/mod.rs +++ b/git-index/src/decode/mod.rs @@ -1,6 +1,6 @@ use filetime::FileTime; -use crate::{extension, State}; +use crate::{extension, Entry, State, Version}; mod entries; pub mod header; @@ -49,13 +49,13 @@ impl State { Options { object_hash, thread_limit, - min_extension_block_in_bytes_for_threading: _, + min_extension_block_in_bytes_for_threading, }: Options, ) -> Result<(Self, git_hash::ObjectId), Error> { let (version, num_entries, post_header_data) = 
header::decode(data, object_hash)?; let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash); - let num_threads = git_features::parallel::num_threads(thread_limit); + let mut num_threads = git_features::parallel::num_threads(thread_limit); let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes( num_entries, data.len(), @@ -66,37 +66,82 @@ impl State { let (entries, ext, data) = match start_of_extensions { Some(offset) if num_threads > 1 => { - let start_of_extensions = &data[offset..]; - let index_offsets_table = extension::index_entry_offset_table::find(start_of_extensions, object_hash); - let (entries_res, (ext, data)) = git_features::parallel::threads(|_scope| { - match index_offsets_table { + let extensions_data = &data[offset..]; + let index_offsets_table = extension::index_entry_offset_table::find(extensions_data, object_hash); + let (entries_res, (ext, data)) = git_features::parallel::threads(|scope| { + let extension_loading = + (extensions_data.len() > min_extension_block_in_bytes_for_threading).then({ + num_threads -= 1; + || scope.spawn(|_| extension::decode::all(extensions_data, object_hash)) + }); + let entries_res = match index_offsets_table { Some(entry_offsets) => { - dbg!(entry_offsets); - todo!("threaded entry loading if its worth it") + let chunk_size = (entry_offsets.len() as f32 / num_threads as f32).ceil() as usize; + let num_chunks = entry_offsets.chunks(chunk_size).count(); + let mut threads = Vec::with_capacity(num_chunks); + for (id, chunks) in entry_offsets.chunks(chunk_size).enumerate() { + let chunks = chunks.to_vec(); + threads.push(scope.spawn(move |_| { + let num_entries = chunks.iter().map(|c| c.num_entries).sum::() as usize; + let mut entries = Vec::with_capacity(num_entries); + let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes( + num_entries as u32, + data.len() / num_chunks, + start_of_extensions.map(|ofs| ofs / num_chunks), + object_hash, + 
version, + ); + let mut path_backing = Vec::with_capacity(path_backing_buffer_size); + let mut is_sparse = false; + for offset in chunks { + let ( + entries::Outcome { + is_sparse: chunk_is_sparse, + }, + _data, + ) = entries::load_chunk( + &data[offset.from_beginning_of_file as usize..], + &mut entries, + &mut path_backing, + offset.num_entries, + object_hash, + version, + )?; + is_sparse |= chunk_is_sparse; + } + Ok::<_, Error>(( + id, + EntriesOutcome { + entries, + path_backing, + is_sparse, + }, + )) + })); + } + todo!("combined thread results in order ") } - None => { - // TODO load all extensions in scoped, then get IEOT, then possibly multi-threaded entry parsing - ( - entries::load_all( - post_header_data, - num_entries, - path_backing_buffer_size, - object_hash, - version, - ), - extension::decode::all(start_of_extensions, object_hash), - ) - } - } + None => load_entries( + post_header_data, + path_backing_buffer_size, + num_entries, + object_hash, + version, + ), + }; + let ext_res = extension_loading + .map(|thread| thread.join().unwrap()) + .unwrap_or_else(|| extension::decode::all(extensions_data, object_hash)); + (entries_res, ext_res) }) .unwrap(); // this unwrap is for panics - if these happened we are done anyway. 
(entries_res?.0, ext, data) } None | Some(_) => { - let (entries, data) = entries::load_all( + let (entries, data) = load_entries( post_header_data, - num_entries, path_backing_buffer_size, + num_entries, object_hash, version, )?; @@ -113,7 +158,7 @@ impl State { } let checksum = git_hash::ObjectId::from(data); - let entries::Outcome { + let EntriesOutcome { entries, path_backing, is_sparse, @@ -133,3 +178,38 @@ impl State { )) } } + +struct EntriesOutcome { + pub entries: Vec, + pub path_backing: Vec, + pub is_sparse: bool, +} + +fn load_entries( + post_header_data: &[u8], + path_backing_buffer_size: usize, + num_entries: u32, + object_hash: git_hash::Kind, + version: Version, +) -> Result<(EntriesOutcome, &[u8]), Error> { + let mut entries = Vec::with_capacity(num_entries as usize); + let mut path_backing = Vec::with_capacity(path_backing_buffer_size); + entries::load_chunk( + post_header_data, + &mut entries, + &mut path_backing, + num_entries, + object_hash, + version, + ) + .map(|(entries::Outcome { is_sparse }, data): (entries::Outcome, &[u8])| { + ( + EntriesOutcome { + entries, + path_backing, + is_sparse, + }, + data, + ) + }) +} diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index 260758198a2..995b471d3ca 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -31,7 +31,7 @@ pub(crate) mod end_of_index_entry; pub(crate) mod index_entry_offset_table { use crate::{extension, extension::Signature, util::read_u32}; - #[derive(Debug)] + #[derive(Debug, Clone, Copy)] pub struct Offset { pub from_beginning_of_file: u32, pub num_entries: u32, diff --git a/git-index/tests/index/file/mod.rs b/git-index/tests/index/file/mod.rs index af129dd95ad..64ef01058d7 100644 --- a/git-index/tests/index/file/mod.rs +++ b/git-index/tests/index/file/mod.rs @@ -5,17 +5,28 @@ mod init { fn file(name: &str) -> git_index::File { git_index::File::at(crate::index::fixture_path(name), 
git_index::decode::Options::default()).unwrap() } + fn file_opt(name: &str, opts: git_index::decode::Options) -> git_index::File { + git_index::File::at(crate::index::fixture_path(name), opts).unwrap() + } #[test] fn read_v2_with_single_entry_tree_and_eoie_ext() { - let file = file("v2"); - assert_eq!(file.version(), Version::V2); + let file_disallow_threaded_loading = file_opt( + "v2", + git_index::decode::Options { + min_extension_block_in_bytes_for_threading: 100000, + ..Default::default() + }, + ); + for file in [file("v2"), file_disallow_threaded_loading] { + assert_eq!(file.version(), Version::V2); - assert_eq!(file.entries().len(), 1); + assert_eq!(file.entries().len(), 1); - let entry = &file.entries()[0]; - assert_eq!(entry.id, hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391")); - assert_eq!(entry.path(&file.state), "a"); + let entry = &file.entries()[0]; + assert_eq!(entry.id, hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391")); + assert_eq!(entry.path(&file.state), "a"); + } } #[test] From cb7e4e784d615f9fa3d6fb9c36240f0592403358 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 12 Jan 2022 12:24:08 +0800 Subject: [PATCH 54/57] feat: Add InOrderIter to 'parallel' module (#293) This iterator makes possible identifies results using a sequence id and returns only consecutive items. Use it to collect unordered results produced by threads. It's advantage to collecting yourself and sorting is the potential for a smaller memory footprint of in-flight results, one doesn't have to collect them all for ordering, necessarily. 
--- git-features/src/parallel/in_order.rs | 86 ++++++++++++++++++++ git-features/src/parallel/mod.rs | 3 + git-features/tests/parallel/in_order_iter.rs | 53 ++++++++++++ git-features/tests/parallel/mod.rs | 2 + 4 files changed, 144 insertions(+) create mode 100644 git-features/src/parallel/in_order.rs create mode 100644 git-features/tests/parallel/in_order_iter.rs diff --git a/git-features/src/parallel/in_order.rs b/git-features/src/parallel/in_order.rs new file mode 100644 index 00000000000..378218bafb6 --- /dev/null +++ b/git-features/src/parallel/in_order.rs @@ -0,0 +1,86 @@ +use std::{cmp::Ordering, collections::BTreeMap}; + +/// A counter for chunks to be able to put them back into original order later. +pub type ChunkId = usize; + +/// An iterator which olds iterated items with a **sequential** ID starting at 0 long enough to dispense them in order. +/// +/// Note that this iterator is made specifically to support the signature of the iterator returned +/// by [from_counts_iter(…)][super::entry::iter_from_counts()]. +pub struct InOrderIter { + /// The iterator yielding the out-of-order elements we are to yield in order. 
+ pub inner: I, + store: BTreeMap, + next_chunk: ChunkId, + is_done: bool, +} + +impl From for InOrderIter +where + I: Iterator>, +{ + fn from(iter: I) -> Self { + InOrderIter { + inner: iter, + store: Default::default(), + next_chunk: 0, + is_done: false, + } + } +} + +impl Iterator for InOrderIter +where + I: Iterator>, +{ + type Item = Result; + + fn next(&mut self) -> Option { + if self.is_done { + return None; + } + 'find_next_in_sequence: loop { + match self.inner.next() { + Some(Ok((c, v))) => match c.cmp(&self.next_chunk) { + Ordering::Equal => { + self.next_chunk += 1; + return Some(Ok(v)); + } + Ordering::Less => { + unreachable!("in a correctly ordered sequence we can never see keys again, got {}", c) + } + Ordering::Greater => { + let previous = self.store.insert(c, v); + assert!( + previous.is_none(), + "Chunks are returned only once, input is an invalid sequence" + ); + if let Some(v) = self.store.remove(&self.next_chunk) { + self.next_chunk += 1; + return Some(Ok(v)); + } + continue 'find_next_in_sequence; + } + }, + Some(Err(e)) => { + self.is_done = true; + self.store.clear(); + return Some(Err(e)); + } + None => match self.store.remove(&self.next_chunk) { + Some(v) => { + self.next_chunk += 1; + return Some(Ok(v)); + } + None => { + debug_assert!( + self.store.is_empty(), + "When iteration is done we should not have stored items left" + ); + return None; + } + }, + } + } + } +} diff --git a/git-features/src/parallel/mod.rs b/git-features/src/parallel/mod.rs index 7a4aca40eac..4be817b081e 100644 --- a/git-features/src/parallel/mod.rs +++ b/git-features/src/parallel/mod.rs @@ -41,6 +41,9 @@ mod serial; #[cfg(not(feature = "parallel"))] pub use serial::{in_parallel, join, threads}; +mod in_order; +pub use in_order::InOrderIter; + mod eager_iter; pub use eager_iter::{EagerIter, EagerIterIf}; diff --git a/git-features/tests/parallel/in_order_iter.rs b/git-features/tests/parallel/in_order_iter.rs new file mode 100644 index 00000000000..bca012ad4e2 --- 
/dev/null +++ b/git-features/tests/parallel/in_order_iter.rs @@ -0,0 +1,53 @@ +use git_features::parallel::InOrderIter; +use std::convert::Infallible; + +#[test] +fn in_order_stays_in_order() { + assert_eq!( + InOrderIter::from(vec![Ok::<_, Infallible>((0usize, 'a')), Ok((1, 'b')), Ok((2, 'c'))].into_iter()) + .collect::, _>>() + .expect("infallible"), + vec!['a', 'b', 'c'] + ) +} + +#[test] +fn out_of_order_items_are_held_until_the_sequence_is_complete() { + assert_eq!( + InOrderIter::from( + vec![ + Ok::<_, Infallible>((2usize, 'c')), + Ok((1, 'b')), + Ok((0, 'a')), + Ok((3, 'd')) + ] + .into_iter() + ) + .collect::, _>>() + .expect("infallible"), + vec!['a', 'b', 'c', 'd'] + ) +} + +#[test] +fn in_sequence_errors_immediately_trigger_a_fuse() { + let mut iter = InOrderIter::from(vec![Ok::<_, &'static str>((0usize, 'a')), Err("err"), Ok((1, 'b'))].into_iter()); + assert_eq!(iter.next(), Some(Ok('a'))); + assert_eq!(iter.next(), Some(Err("err"))); + assert_eq!( + iter.next(), + None, + "fuse should have triggered so we don't see anything else" + ); +} + +#[test] +fn out_of_sequence_errors_immediately_trigger_a_fuse() { + let mut iter = InOrderIter::from(vec![Ok::<_, &'static str>((1usize, 'b')), Err("err"), Ok((0, 'a'))].into_iter()); + assert_eq!(iter.next(), Some(Err("err"))); + assert_eq!( + iter.next(), + None, + "fuse should have triggered so we don't see anything else" + ); +} diff --git a/git-features/tests/parallel/mod.rs b/git-features/tests/parallel/mod.rs index e70bbcca092..95801c6b588 100644 --- a/git-features/tests/parallel/mod.rs +++ b/git-features/tests/parallel/mod.rs @@ -1,6 +1,8 @@ //! 
Tests that are working similarly in parallel and serial mode use git_features::parallel; +mod in_order_iter; + #[derive(Default)] struct Adder { count: usize, From 7721b5fc7cba86d785e0936fdfab2ea41163219f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 12 Jan 2022 12:30:03 +0800 Subject: [PATCH 55/57] Use InOrderIter from git-features (#293) --- git-features/src/parallel/in_order.rs | 12 +-- git-features/src/parallel/mod.rs | 2 +- .../src/data/output/entry/iter_from_counts.rs | 14 +-- git-pack/src/data/output/in_order.rs | 86 ------------------- git-pack/src/data/output/mod.rs | 3 - .../pack/data/output/count_and_entries.rs | 3 +- .../tests/pack/data/output/in_order_iter.rs | 54 ------------ git-pack/tests/pack/data/output/mod.rs | 1 - 8 files changed, 17 insertions(+), 158 deletions(-) delete mode 100644 git-pack/src/data/output/in_order.rs delete mode 100644 git-pack/tests/pack/data/output/in_order_iter.rs diff --git a/git-features/src/parallel/in_order.rs b/git-features/src/parallel/in_order.rs index 378218bafb6..2d5de0a0729 100644 --- a/git-features/src/parallel/in_order.rs +++ b/git-features/src/parallel/in_order.rs @@ -1,7 +1,7 @@ use std::{cmp::Ordering, collections::BTreeMap}; -/// A counter for chunks to be able to put them back into original order later. -pub type ChunkId = usize; +/// A counter for items that are in sequence, to be able to put them back into original order later. +pub type SequenceId = usize; /// An iterator which olds iterated items with a **sequential** ID starting at 0 long enough to dispense them in order. /// @@ -10,14 +10,14 @@ pub type ChunkId = usize; pub struct InOrderIter { /// The iterator yielding the out-of-order elements we are to yield in order. 
pub inner: I, - store: BTreeMap, - next_chunk: ChunkId, + store: BTreeMap, + next_chunk: SequenceId, is_done: bool, } impl From for InOrderIter where - I: Iterator>, + I: Iterator>, { fn from(iter: I) -> Self { InOrderIter { @@ -31,7 +31,7 @@ where impl Iterator for InOrderIter where - I: Iterator>, + I: Iterator>, { type Item = Result; diff --git a/git-features/src/parallel/mod.rs b/git-features/src/parallel/mod.rs index 4be817b081e..ebdedd3f308 100644 --- a/git-features/src/parallel/mod.rs +++ b/git-features/src/parallel/mod.rs @@ -42,7 +42,7 @@ mod serial; pub use serial::{in_parallel, join, threads}; mod in_order; -pub use in_order::InOrderIter; +pub use in_order::{InOrderIter, SequenceId}; mod eager_iter; pub use eager_iter::{EagerIter, EagerIterIf}; diff --git a/git-pack/src/data/output/entry/iter_from_counts.rs b/git-pack/src/data/output/entry/iter_from_counts.rs index 1c92123e8d2..8fa37dc4a75 100644 --- a/git-pack/src/data/output/entry/iter_from_counts.rs +++ b/git-pack/src/data/output/entry/iter_from_counts.rs @@ -1,8 +1,9 @@ use std::{cmp::Ordering, sync::Arc}; +use git_features::parallel::SequenceId; use git_features::{parallel, progress::Progress}; -use crate::data::{output, output::ChunkId}; +use crate::data::output; /// Given a known list of object `counts`, calculate entries ready to be put into a data pack. 
/// @@ -44,7 +45,7 @@ pub fn iter_from_counts( thread_limit, chunk_size, }: Options, -) -> impl Iterator), Error>> +) -> impl Iterator), Error>> + parallel::reduce::Finalize>> where Find: crate::Find + Send + Clone + 'static, @@ -152,7 +153,7 @@ where }, { let counts = Arc::clone(&counts); - move |(chunk_id, chunk_range): (ChunkId, std::ops::Range), (buf, progress)| { + move |(chunk_id, chunk_range): (SequenceId, std::ops::Range), (buf, progress)| { let mut out = Vec::new(); let chunk = &counts[chunk_range]; let mut stats = Outcome::default(); @@ -277,8 +278,9 @@ mod reduce { use std::marker::PhantomData; use git_features::parallel; + use git_features::parallel::SequenceId; - use super::{ChunkId, Outcome}; + use super::Outcome; use crate::data::output; pub struct Statistics { @@ -296,8 +298,8 @@ mod reduce { } impl parallel::Reduce for Statistics { - type Input = Result<(ChunkId, Vec, Outcome), Error>; - type FeedProduce = (ChunkId, Vec); + type Input = Result<(SequenceId, Vec, Outcome), Error>; + type FeedProduce = (SequenceId, Vec); type Output = Outcome; type Error = Error; diff --git a/git-pack/src/data/output/in_order.rs b/git-pack/src/data/output/in_order.rs deleted file mode 100644 index 378218bafb6..00000000000 --- a/git-pack/src/data/output/in_order.rs +++ /dev/null @@ -1,86 +0,0 @@ -use std::{cmp::Ordering, collections::BTreeMap}; - -/// A counter for chunks to be able to put them back into original order later. -pub type ChunkId = usize; - -/// An iterator which olds iterated items with a **sequential** ID starting at 0 long enough to dispense them in order. -/// -/// Note that this iterator is made specifically to support the signature of the iterator returned -/// by [from_counts_iter(…)][super::entry::iter_from_counts()]. -pub struct InOrderIter { - /// The iterator yielding the out-of-order elements we are to yield in order. 
- pub inner: I, - store: BTreeMap, - next_chunk: ChunkId, - is_done: bool, -} - -impl From for InOrderIter -where - I: Iterator>, -{ - fn from(iter: I) -> Self { - InOrderIter { - inner: iter, - store: Default::default(), - next_chunk: 0, - is_done: false, - } - } -} - -impl Iterator for InOrderIter -where - I: Iterator>, -{ - type Item = Result; - - fn next(&mut self) -> Option { - if self.is_done { - return None; - } - 'find_next_in_sequence: loop { - match self.inner.next() { - Some(Ok((c, v))) => match c.cmp(&self.next_chunk) { - Ordering::Equal => { - self.next_chunk += 1; - return Some(Ok(v)); - } - Ordering::Less => { - unreachable!("in a correctly ordered sequence we can never see keys again, got {}", c) - } - Ordering::Greater => { - let previous = self.store.insert(c, v); - assert!( - previous.is_none(), - "Chunks are returned only once, input is an invalid sequence" - ); - if let Some(v) = self.store.remove(&self.next_chunk) { - self.next_chunk += 1; - return Some(Ok(v)); - } - continue 'find_next_in_sequence; - } - }, - Some(Err(e)) => { - self.is_done = true; - self.store.clear(); - return Some(Err(e)); - } - None => match self.store.remove(&self.next_chunk) { - Some(v) => { - self.next_chunk += 1; - return Some(Ok(v)); - } - None => { - debug_assert!( - self.store.is_empty(), - "When iteration is done we should not have stored items left" - ); - return None; - } - }, - } - } - } -} diff --git a/git-pack/src/data/output/mod.rs b/git-pack/src/data/output/mod.rs index bae9342d1f2..0c3e6bfdcec 100644 --- a/git-pack/src/data/output/mod.rs +++ b/git-pack/src/data/output/mod.rs @@ -39,6 +39,3 @@ pub mod entry; /// pub mod bytes; - -mod in_order; -pub use in_order::{ChunkId, InOrderIter}; diff --git a/git-pack/tests/pack/data/output/count_and_entries.rs b/git-pack/tests/pack/data/output/count_and_entries.rs index 4cad9f8aea7..777c65db784 100644 --- a/git-pack/tests/pack/data/output/count_and_entries.rs +++ 
b/git-pack/tests/pack/data/output/count_and_entries.rs @@ -1,5 +1,6 @@ use std::{convert::Infallible, sync::atomic::AtomicBool}; +use git_features::parallel::InOrderIter; use git_features::{parallel::reduce::Finalize, progress}; use git_odb::{compound, pack, pack::FindExt}; use git_pack::data::{ @@ -291,7 +292,7 @@ fn traversals() -> crate::Result { ..Default::default() }, ); - let entries: Vec<_> = output::InOrderIter::from(entries_iter.by_ref()) + let entries: Vec<_> = InOrderIter::from(entries_iter.by_ref()) .collect::, _>>()? .into_iter() .flatten() diff --git a/git-pack/tests/pack/data/output/in_order_iter.rs b/git-pack/tests/pack/data/output/in_order_iter.rs deleted file mode 100644 index 5ddc0199c5d..00000000000 --- a/git-pack/tests/pack/data/output/in_order_iter.rs +++ /dev/null @@ -1,54 +0,0 @@ -use std::convert::Infallible; - -use git_odb::pack::data::output::InOrderIter; - -#[test] -fn in_order_stays_in_order() { - assert_eq!( - InOrderIter::from(vec![Ok::<_, Infallible>((0usize, 'a')), Ok((1, 'b')), Ok((2, 'c'))].into_iter()) - .collect::, _>>() - .expect("infallible"), - vec!['a', 'b', 'c'] - ) -} - -#[test] -fn out_of_order_items_are_held_until_the_sequence_is_complete() { - assert_eq!( - InOrderIter::from( - vec![ - Ok::<_, Infallible>((2usize, 'c')), - Ok((1, 'b')), - Ok((0, 'a')), - Ok((3, 'd')) - ] - .into_iter() - ) - .collect::, _>>() - .expect("infallible"), - vec!['a', 'b', 'c', 'd'] - ) -} - -#[test] -fn in_sequence_errors_immediately_trigger_a_fuse() { - let mut iter = InOrderIter::from(vec![Ok::<_, &'static str>((0usize, 'a')), Err("err"), Ok((1, 'b'))].into_iter()); - assert_eq!(iter.next(), Some(Ok('a'))); - assert_eq!(iter.next(), Some(Err("err"))); - assert_eq!( - iter.next(), - None, - "fuse should have triggered so we don't see anything else" - ); -} - -#[test] -fn out_of_sequence_errors_immediately_trigger_a_fuse() { - let mut iter = InOrderIter::from(vec![Ok::<_, &'static str>((1usize, 'b')), Err("err"), Ok((0, 'a'))].into_iter()); 
- assert_eq!(iter.next(), Some(Err("err"))); - assert_eq!( - iter.next(), - None, - "fuse should have triggered so we don't see anything else" - ); -} diff --git a/git-pack/tests/pack/data/output/mod.rs b/git-pack/tests/pack/data/output/mod.rs index 73e63a0d132..fdf93cfb5ea 100644 --- a/git-pack/tests/pack/data/output/mod.rs +++ b/git-pack/tests/pack/data/output/mod.rs @@ -44,4 +44,3 @@ fn db(kind: DbKind) -> crate::Result { } mod count_and_entries; -mod in_order_iter; From e3977fe033550bfd3297cdd674934e40476aa38b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 12 Jan 2022 14:18:05 +0800 Subject: [PATCH 56/57] fix build (#293) --- git-features/tests/parallel/in_order_iter.rs | 3 ++- git-pack/src/data/output/entry/iter_from_counts.rs | 6 ++---- git-pack/tests/pack/data/output/count_and_entries.rs | 6 ++++-- gitoxide-core/src/pack/create.rs | 3 ++- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/git-features/tests/parallel/in_order_iter.rs b/git-features/tests/parallel/in_order_iter.rs index bca012ad4e2..2daad4a071a 100644 --- a/git-features/tests/parallel/in_order_iter.rs +++ b/git-features/tests/parallel/in_order_iter.rs @@ -1,6 +1,7 @@ -use git_features::parallel::InOrderIter; use std::convert::Infallible; +use git_features::parallel::InOrderIter; + #[test] fn in_order_stays_in_order() { assert_eq!( diff --git a/git-pack/src/data/output/entry/iter_from_counts.rs b/git-pack/src/data/output/entry/iter_from_counts.rs index 8fa37dc4a75..2071212cad8 100644 --- a/git-pack/src/data/output/entry/iter_from_counts.rs +++ b/git-pack/src/data/output/entry/iter_from_counts.rs @@ -1,7 +1,6 @@ use std::{cmp::Ordering, sync::Arc}; -use git_features::parallel::SequenceId; -use git_features::{parallel, progress::Progress}; +use git_features::{parallel, parallel::SequenceId, progress::Progress}; use crate::data::output; @@ -277,8 +276,7 @@ mod util { mod reduce { use std::marker::PhantomData; - use git_features::parallel; - use 
git_features::parallel::SequenceId; + use git_features::{parallel, parallel::SequenceId}; use super::Outcome; use crate::data::output; diff --git a/git-pack/tests/pack/data/output/count_and_entries.rs b/git-pack/tests/pack/data/output/count_and_entries.rs index 777c65db784..e20616fa9f1 100644 --- a/git-pack/tests/pack/data/output/count_and_entries.rs +++ b/git-pack/tests/pack/data/output/count_and_entries.rs @@ -1,7 +1,9 @@ use std::{convert::Infallible, sync::atomic::AtomicBool}; -use git_features::parallel::InOrderIter; -use git_features::{parallel::reduce::Finalize, progress}; +use git_features::{ + parallel::{reduce::Finalize, InOrderIter}, + progress, +}; use git_odb::{compound, pack, pack::FindExt}; use git_pack::data::{ output, diff --git a/gitoxide-core/src/pack/create.rs b/gitoxide-core/src/pack/create.rs index 8a31bef326e..dcd0df23a65 100644 --- a/gitoxide-core/src/pack/create.rs +++ b/gitoxide-core/src/pack/create.rs @@ -8,6 +8,7 @@ use git_repository::{ interrupt, objs::bstr::ByteVec, odb::{pack, pack::FindExt}, + parallel::InOrderIter, prelude::Finalize, progress, traverse, Progress, }; @@ -237,7 +238,7 @@ where let num_objects = counts.len(); let mut in_order_entries = { let progress = progress.add_child("creating entries"); - pack::data::output::InOrderIter::from(pack::data::output::entry::iter_from_counts( + InOrderIter::from(pack::data::output::entry::iter_from_counts( counts, handle, progress, From 995994a895a6faa4537ae1a6564edc005be96a1a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 12 Jan 2022 14:52:38 +0800 Subject: [PATCH 57/57] Aggregation for index entries loaded in parallel (#293) --- etc/check-package-size.sh | 2 +- git-index/src/decode/mod.rs | 53 ++++++++++++++++++++++++------- git-index/src/extension/decode.rs | 8 ++--- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/etc/check-package-size.sh b/etc/check-package-size.sh index 43b25767e00..8ec4c8f25c6 100755 --- a/etc/check-package-size.sh +++ 
b/etc/check-package-size.sh @@ -24,7 +24,7 @@ echo "in root: gitoxide CLI" (enter git-config && indent cargo diet -n --package-size-limit 65KB) (enter git-hash && indent cargo diet -n --package-size-limit 10KB) (enter git-chunk && indent cargo diet -n --package-size-limit 10KB) -(enter git-features && indent cargo diet -n --package-size-limit 35KB) +(enter git-features && indent cargo diet -n --package-size-limit 40KB) (enter git-ref && indent cargo diet -n --package-size-limit 50KB) (enter git-diff && indent cargo diet -n --package-size-limit 10KB) (enter git-traverse && indent cargo diet -n --package-size-limit 10KB) diff --git a/git-index/src/decode/mod.rs b/git-index/src/decode/mod.rs index 9e93cfe3e2c..9e81877609d 100644 --- a/git-index/src/decode/mod.rs +++ b/git-index/src/decode/mod.rs @@ -28,6 +28,7 @@ mod error { } } pub use error::Error; +use git_features::parallel::InOrderIter; #[derive(Default)] pub struct Options { @@ -82,16 +83,18 @@ impl State { for (id, chunks) in entry_offsets.chunks(chunk_size).enumerate() { let chunks = chunks.to_vec(); threads.push(scope.spawn(move |_| { - let num_entries = chunks.iter().map(|c| c.num_entries).sum::() as usize; - let mut entries = Vec::with_capacity(num_entries); - let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes( - num_entries as u32, - data.len() / num_chunks, - start_of_extensions.map(|ofs| ofs / num_chunks), - object_hash, - version, - ); - let mut path_backing = Vec::with_capacity(path_backing_buffer_size); + let num_entries_for_chunks = + chunks.iter().map(|c| c.num_entries).sum::() as usize; + let mut entries = Vec::with_capacity(num_entries_for_chunks); + let path_backing_buffer_size_for_chunks = + entries::estimate_path_storage_requirements_in_bytes( + num_entries_for_chunks as u32, + data.len() / num_chunks, + start_of_extensions.map(|ofs| ofs / num_chunks), + object_hash, + version, + ); + let mut path_backing = 
Vec::with_capacity(path_backing_buffer_size_for_chunks); let mut is_sparse = false; for offset in chunks { let ( @@ -119,7 +122,35 @@ impl State { )) })); } - todo!("combined thread results in order ") + let mut results = + InOrderIter::from(threads.into_iter().map(|thread| thread.join().unwrap())); + let mut acc = results.next().expect("have at least two results, one per thread"); + // We explicitly don't adjust the reserve in acc and rather allow for more copying + // to happens as vectors grow to keep the peak memory size low. + // NOTE: one day, we might use a memory pool for paths. We could encode the block of memory + // in some bytes in the path offset. That way there is more indirection/slower access + // to the path, but it would save time here. + // As it stands, `git` is definitely more efficient at this and probably uses less memory too. + // Maybe benchmarks can tell if that is noticeable later at 200/400GB/s memory bandwidth, or maybe just + // 100GB/s on a single core. + while let (Ok(lhs), Some(res)) = (acc.as_mut(), results.next()) { + match res { + Ok(rhs) => { + lhs.is_sparse |= rhs.is_sparse; + let ofs = lhs.path_backing.len(); + lhs.path_backing.extend(rhs.path_backing); + lhs.entries.extend(rhs.entries.into_iter().map(|mut e| { + e.path.start += ofs; + e.path.end += ofs; + e + })); + } + Err(err) => { + acc = Err(err); + } + } + } + acc.map(|acc| (acc, &data[data.len() - object_hash.len_in_bytes()..])) } None => load_entries( post_header_data, diff --git a/git-index/src/extension/decode.rs b/git-index/src/extension/decode.rs index a9ea372b506..af28cc537ea 100644 --- a/git-index/src/extension/decode.rs +++ b/git-index/src/extension/decode.rs @@ -6,8 +6,8 @@ pub fn header(data: &[u8]) -> (Signature, u32, &[u8]) { (signature.try_into().unwrap(), from_be_u32(size), data) } -pub fn all(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> (Outcome, &[u8]) { - extension::Iter::new_without_checksum(beginning_of_extensions, object_hash) +pub 
fn all(maybe_beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> (Outcome, &[u8]) { + extension::Iter::new_without_checksum(maybe_beginning_of_extensions, object_hash) .map(|mut ext_iter| { let mut ext = Outcome::default(); for (signature, ext_data) in ext_iter.by_ref() { @@ -20,9 +20,9 @@ pub fn all(beginning_of_extensions: &[u8], object_hash: git_hash::Kind) -> (Outc _unknown => {} // skip unknown extensions, too } } - (ext, &beginning_of_extensions[ext_iter.consumed..]) + (ext, &maybe_beginning_of_extensions[ext_iter.consumed..]) }) - .unwrap_or_else(|| (Outcome::default(), beginning_of_extensions)) + .unwrap_or_else(|| (Outcome::default(), maybe_beginning_of_extensions)) } #[derive(Default)]