Skip to content

Commit 056683f

Browse files
committed
perf: Use stack-allocated OidHash in FileHashes and skip expanded hashes on normal runs
Two memory optimizations for large monorepos: 1. FileHashes now stores OidHash (40 bytes, stack-allocated, Copy) instead of a heap-allocated String (~80-96 bytes each). OidHash moves from turborepo-scm to turborepo-hash, where it belongs architecturally. The OidHash -> String conversion now happens only at the JSON serialization boundary for --summarize/--dry output. 2. calculate_file_hashes takes a needs_expanded_hashes flag. When it is false (a normal turbo run), the per-file hash maps are still computed to derive the collapsed task hash, but they are not retained in the TaskHashTracker. This avoids holding O(tasks * files) of memory that previously lived for the entire duration of the run but was never read.
1 parent 2a5522a commit 056683f

8 files changed

Lines changed: 201 additions & 155 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/turborepo-hash/src/lib.rs

Lines changed: 56 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44
//! deterministic serialization across languages and platforms, then applies
55
//! xxHash64 for fast hashing.
66
7+
mod oid_hash;
78
mod traits;
89

910
use std::{collections::HashMap, sync::Arc};
1011

1112
use capnp::message::{Builder, HeapAllocator};
13+
pub use oid_hash::OidHash;
1214
pub use traits::TurboHash;
1315
// Re-export for backward compatibility. New code should import from `turborepo_types`.
1416
#[deprecated(
@@ -108,7 +110,7 @@ pub struct LockFilePackages(pub Vec<turborepo_lockfiles::Package>);
108110
pub struct LockFilePackagesRef<'a>(pub Vec<&'a turborepo_lockfiles::Package>);
109111

110112
#[derive(Debug, Clone)]
111-
pub struct FileHashes(pub Vec<(turbopath::RelativeUnixPathBuf, String)>);
113+
pub struct FileHashes(pub Vec<(turbopath::RelativeUnixPathBuf, OidHash)>);
112114

113115
/// Wrapper type for TaskOutputs to enable capnp serialization.
114116
/// This is needed due to Rust's orphan rule - we can't implement From
@@ -244,7 +246,7 @@ impl From<FileHashes> for Builder<HeapAllocator> {
244246
for (i, (key, value)) in file_hashes.iter().enumerate() {
245247
let mut entry = entries.reborrow().get(i as u32);
246248
entry.set_key(key.as_str());
247-
entry.set_value(value);
249+
entry.set_value(&**value);
248250
}
249251
}
250252

@@ -286,7 +288,7 @@ impl From<&FileHashes> for Builder<HeapAllocator> {
286288
for (i, (key, value)) in file_hashes.iter().enumerate() {
287289
let mut entry = entries.reborrow().get(i as u32);
288290
entry.set_key(key.as_str());
289-
entry.set_value(value);
291+
entry.set_value(&**value);
290292
}
291293
}
292294

@@ -504,7 +506,8 @@ mod test {
504506
use turborepo_types::{EnvMode, TaskOutputs};
505507

506508
use super::{
507-
FileHashes, GlobalHashable, LockFilePackages, LockFilePackagesRef, TaskHashable, TurboHash,
509+
FileHashes, GlobalHashable, LockFilePackages, LockFilePackagesRef, OidHash, TaskHashable,
510+
TurboHash,
508511
};
509512

510513
#[test]
@@ -623,39 +626,46 @@ mod test {
623626
lock_file_packages(packages.collect(), "4fd770c37194168e");
624627
}
625628

626-
fn sorted_file_hashes(pairs: Vec<(String, String)>) -> FileHashes {
627-
let mut v: Vec<_> = pairs
629+
fn sorted_file_hashes(pairs: Vec<(&str, &str)>) -> FileHashes {
630+
let mut v: Vec<(turbopath::RelativeUnixPathBuf, OidHash)> = pairs
628631
.into_iter()
629-
.map(|(a, b)| (turbopath::RelativeUnixPathBuf::new(a).unwrap(), b))
632+
.map(|(a, b)| {
633+
(
634+
turbopath::RelativeUnixPathBuf::new(a).unwrap(),
635+
OidHash::from_hex_str(b),
636+
)
637+
})
630638
.collect();
631639
v.sort_by(|(a, _), (b, _)| a.cmp(b));
632640
FileHashes(v)
633641
}
634642

643+
// OID-sized test hashes (40 hex chars each)
644+
const OID_A: &str = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
645+
const OID_B: &str = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
646+
const OID_C: &str = "cccccccccccccccccccccccccccccccccccccccc";
647+
const OID_D: &str = "dddddddddddddddddddddddddddddddddddddddd";
648+
635649
#[test_case(vec![], "459c029558afe716" ; "empty")]
636650
#[test_case(vec![
637-
("a".to_string(), "b".to_string()),
638-
("c".to_string(), "d".to_string()),
639-
], "c9301c0bf1899c07" ; "non-empty")]
651+
("a", OID_A),
652+
("c", OID_B),
653+
], "03e24e42bd35dcaf" ; "non-empty")]
640654
#[test_case(vec![
641-
("c".to_string(), "d".to_string()),
642-
("a".to_string(), "b".to_string()),
643-
], "c9301c0bf1899c07" ; "order resistant")]
644-
fn file_hashes(pairs: Vec<(String, String)>, expected: &str) {
655+
("c", OID_B),
656+
("a", OID_A),
657+
], "03e24e42bd35dcaf" ; "order resistant")]
658+
fn file_hashes(pairs: Vec<(&str, &str)>, expected: &str) {
645659
assert_eq!(sorted_file_hashes(pairs).hash(), expected);
646660
}
647661

648662
#[test]
649663
fn file_hashes_ref_matches_owned() {
650-
let file_hashes = sorted_file_hashes(vec![
651-
("c".to_string(), "d".to_string()),
652-
("a".to_string(), "b".to_string()),
653-
]);
664+
let file_hashes = sorted_file_hashes(vec![("c", OID_B), ("a", OID_A)]);
654665

655666
let ref_hash = (&file_hashes).hash();
656667
let owned_hash = file_hashes.hash();
657668
assert_eq!(ref_hash, owned_hash);
658-
assert_eq!(ref_hash, "c9301c0bf1899c07");
659669
}
660670

661671
#[test]
@@ -668,7 +678,7 @@ mod test {
668678
(
669679
turbopath::RelativeUnixPathBuf::new(format!("path/to/file_{i:03}"))
670680
.unwrap(),
671-
format!("hash_{i}"),
681+
OidHash::from_hex_str(&format!("{i:040x}")),
672682
)
673683
})
674684
.collect(),
@@ -717,58 +727,49 @@ mod test {
717727
}
718728

719729
// Regression: FileHashes constructed from a pre-sorted Vec must produce
720-
// identical hashes to FileHashes constructed from a HashMap. This captures
721-
// the invariant that must hold when switching FileHashes from HashMap to
722-
// sorted Vec.
723-
// Regression: sorted Vec construction must produce identical hashes to what
724-
// the old HashMap-based construction produced. Pinned hash values.
730+
// identical hashes regardless of input order.
725731
#[test]
726732
fn file_hashes_sorted_vec_pinned_values() {
727-
let pairs = [
728-
("c/z.ts", "hash_cz"),
729-
("a/b.ts", "hash_ab"),
730-
("a/a.ts", "hash_aa"),
731-
("b.ts", "hash_b"),
733+
let pairs = vec![
734+
("c/z.ts", OID_C),
735+
("a/b.ts", OID_A),
736+
("a/a.ts", OID_B),
737+
("b.ts", OID_D),
732738
];
733739

734-
let fh = sorted_file_hashes(
735-
pairs
736-
.iter()
737-
.map(|(p, h)| (p.to_string(), h.to_string()))
738-
.collect(),
739-
);
740+
let fh = sorted_file_hashes(pairs.clone());
740741
let hash = fh.hash();
741742

742743
// Verify ref and owned produce same hash
743-
let fh2 = sorted_file_hashes(
744-
pairs
745-
.iter()
746-
.map(|(p, h)| (p.to_string(), h.to_string()))
747-
.collect(),
748-
);
744+
let fh2 = sorted_file_hashes(pairs);
749745
assert_eq!((&fh2).hash(), hash);
750746
}
751747

752748
// Regression: large FileHashes must produce deterministic hashes regardless
753749
// of original insertion order.
754750
#[test]
755751
fn file_hashes_large_deterministic() {
756-
let fh_forward = sorted_file_hashes(
757-
(0..1000)
758-
.map(|i| (format!("pkg/file_{:04}", i), format!("{:040x}", i)))
759-
.collect(),
760-
);
752+
let pairs_forward: Vec<_> = (0..1000)
753+
.map(|i| {
754+
// Leak to get &'static str for the test helper
755+
let path: &str = Box::leak(format!("pkg/file_{i:04}").into_boxed_str());
756+
let hash: &str = Box::leak(format!("{i:040x}").into_boxed_str());
757+
(path, hash)
758+
})
759+
.collect();
761760

762-
let fh_reverse = sorted_file_hashes(
763-
(0..1000)
764-
.rev()
765-
.map(|i| (format!("pkg/file_{:04}", i), format!("{:040x}", i)))
766-
.collect(),
767-
);
761+
let pairs_reverse: Vec<_> = (0..1000)
762+
.rev()
763+
.map(|i| {
764+
let path: &str = Box::leak(format!("pkg/file_{i:04}").into_boxed_str());
765+
let hash: &str = Box::leak(format!("{i:040x}").into_boxed_str());
766+
(path, hash)
767+
})
768+
.collect();
768769

769770
assert_eq!(
770-
fh_forward.hash(),
771-
fh_reverse.hash(),
771+
sorted_file_hashes(pairs_forward).hash(),
772+
sorted_file_hashes(pairs_reverse).hash(),
772773
"insertion order must not affect hash output"
773774
);
774775
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/// A fixed-size, stack-allocated git OID hex string (40 bytes, SHA-1).
///
/// Avoids heap allocation for the ~10K+ file hashes created during index
/// building and per-package hash computation. Implements `Deref<Target=str>`
/// so all existing `&str` consumers work unchanged.
///
/// # Invariant
///
/// The wrapped buffer always holds valid UTF-8. Both constructors enforce
/// this, which is what makes the unchecked conversion in `Deref` sound.
///
/// `Hash` is implemented by hand (not derived) so that an `OidHash` hashes
/// exactly like the `str` it dereferences to, as required by the
/// `Borrow<str>` implementation below.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct OidHash([u8; 40]);

impl OidHash {
    /// Create from a pre-filled 40-byte hex buffer.
    ///
    /// Caller must ensure `buf` contains valid lowercase ASCII hex.
    ///
    /// # Panics
    ///
    /// Panics if `buf` contains a non-ASCII byte. This is a safe function,
    /// so it must not be possible to smuggle bytes that are not valid UTF-8
    /// into the buffer that `Deref` later reinterprets as `str`.
    pub fn from_hex_buf(buf: [u8; 40]) -> Self {
        assert!(
            buf.is_ascii(),
            "OID hex buffer must contain only ASCII bytes"
        );
        Self(buf)
    }

    /// Create from a hex-encoded string slice.
    ///
    /// # Panics
    ///
    /// Panics if `s` is not exactly 40 bytes long.
    pub fn from_hex_str(s: &str) -> Self {
        // Checked in every build profile so that release builds fail with the
        // same descriptive message instead of a generic slice-length panic.
        assert_eq!(s.len(), 40, "OID hex must be exactly 40 chars");
        let mut buf = [0u8; 40];
        // All 40 bytes come from one complete `&str`, so `buf` is valid UTF-8.
        buf.copy_from_slice(s.as_bytes());
        Self(buf)
    }
}

impl std::ops::Deref for OidHash {
    type Target = str;

    fn deref(&self) -> &str {
        // SAFETY: the buffer is only ever populated by `from_hex_buf`, which
        // rejects non-ASCII bytes, or by `from_hex_str`, which copies the
        // entire contents of a `&str`. Either way it is valid UTF-8.
        unsafe { std::str::from_utf8_unchecked(&self.0) }
    }
}

impl AsRef<str> for OidHash {
    fn as_ref(&self) -> &str {
        self
    }
}

impl std::borrow::Borrow<str> for OidHash {
    fn borrow(&self) -> &str {
        self
    }
}

impl std::hash::Hash for OidHash {
    /// Hashes exactly like the underlying `str`.
    ///
    /// `Borrow<str>` requires `Hash` and `Eq` to agree between `OidHash` and
    /// `str`; hashing the raw `[u8; 40]` does not, which makes `&str` lookups
    /// in hash-based collections keyed by `OidHash` miss.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        std::hash::Hash::hash(&**self, state)
    }
}

impl std::fmt::Debug for OidHash {
    /// Writes the raw hex text, without surrounding quotes.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self)
    }
}

impl std::fmt::Display for OidHash {
    /// Writes the hex text, honoring width/precision flags (for example
    /// `{:.7}` yields an abbreviated hash).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.pad(self)
    }
}

impl PartialEq<str> for OidHash {
    fn eq(&self, other: &str) -> bool {
        &**self == other
    }
}

impl PartialEq<&str> for OidHash {
    fn eq(&self, other: &&str) -> bool {
        &**self == *other
    }
}

impl From<OidHash> for String {
    fn from(oid: OidHash) -> Self {
        // Goes through `Deref`, so no additional `unsafe` is needed here.
        String::from(&*oid)
    }
}

crates/turborepo-lib/src/run/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,9 @@ impl Run {
617617
rayon::scope(|s| {
618618
s.spawn(|_| {
619619
let _span = tracing::info_span!("calculate_file_hashes_task").entered();
620+
let needs_expanded = self.opts.run_opts.dry_run.is_some()
621+
|| self.opts.run_opts.summarize
622+
|| self.observability_handle.is_some();
620623
file_hash_result = Some(PackageInputsHashes::calculate_file_hashes(
621624
&self.scm,
622625
self.engine.tasks(),
@@ -625,6 +628,7 @@ impl Run {
625628
&self.repo_root,
626629
&self.run_telemetry,
627630
repo_index,
631+
needs_expanded,
628632
));
629633
});
630634
s.spawn(|_| {

crates/turborepo-run-cache/src/lib.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -579,10 +579,7 @@ impl ConfigCache {
579579
Err(_) => return Err(CacheError::ConfigCacheError),
580580
};
581581

582-
let mut file_hashes: Vec<_> = hash_object
583-
.into_iter()
584-
.map(|(k, v)| (k, String::from(v)))
585-
.collect();
582+
let mut file_hashes: Vec<_> = hash_object.into_iter().collect();
586583
file_hashes.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));
587584
Ok(FileHashes(file_hashes).hash())
588585
}

crates/turborepo-scm/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ thiserror = { workspace = true }
2626
tracing = { workspace = true }
2727
turbopath = { workspace = true }
2828
turborepo-ci = { workspace = true }
29+
turborepo-hash = { path = "../turborepo-hash" }
2930
turborepo-telemetry = { path = "../turborepo-telemetry" }
3031
wax = { workspace = true }
3132
which = { workspace = true }

0 commit comments

Comments
 (0)