Skip to content

Commit ab4ffaf

Browse files
Add document-container crate (#4191)
* Add document-container crate: container backends and archive codecs * Address PR review: path/prefix split, safe size casts, OPFS stream aborts * Address PR review round 2: mmap read check, UTF8 entry names, prefix normalization * Make MmappedBytes::new fallible so mmap reads can't silently degrade * Address PR review round 3: backend contract uniformity (symlinks, remove, list) * Apply symlink-component check to FolderBackend listing paths * Address PR review: idempotent OPFS delete logging, tar default-features, shared entry-size cap * Fix validate_path doc: dotfiles pass, CurDir/ParentDir rejected * Omit symlink entries from FolderBackend listings for consistency with resolve * Preserve zip I/O errors and reject non-canonical paths in validate_path * Extend archive apis to return the archive writer * Rename document/document-container directory to document/container * Add archive format sniffing and deserialize_auto * Drop temporal hedge from checked_entry_size comment * Tighten verbose doc comments in document-container * Coalesce consecutive same-path OPFS appends to avoid O(n^2) file copies * Review * Update document-container for the deserialize/store_non_blocking renames --------- Co-authored-by: Timon <me@timon.zip>
1 parent ee7daa5 commit ab4ffaf

13 files changed

Lines changed: 1897 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 48 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ members = [
77
"desktop/platform/linux",
88
"desktop/platform/mac",
99
"desktop/platform/win",
10+
"document/container",
1011
"editor",
1112
"frontend/wrapper",
1213
"libraries/dyn-any",

document/container/Cargo.toml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
[package]
2+
name = "document-container"
3+
description = "Container abstraction for the on-disk side of the .gdd document format"
4+
edition.workspace = true
5+
version.workspace = true
6+
license.workspace = true
7+
authors.workspace = true
8+
9+
[features]
10+
default = []
11+
zip = ["dep:zip"]
12+
xz = ["dep:lzma-rust2", "dep:tar"]
13+
14+
[dependencies]
15+
thiserror = "2.0"
16+
log = { workspace = true }
17+
zip = { workspace = true, optional = true, features = ["deflate-flate2-zlib-rs"], default-features = false}
18+
lzma-rust2 = { workspace = true, optional = true }
19+
tar = { version = "0.4", optional = true, default-features = false }
20+
21+
[target.'cfg(not(target_family = "wasm"))'.dependencies]
22+
mmap-io = { workspace = true }
23+
24+
[target.'cfg(target_family = "wasm")'.dependencies]
25+
web-sys = { workspace = true, features = [
26+
"Navigator",
27+
"DomException",
28+
"Window",
29+
"StorageManager",
30+
"FileSystemCreateWritableOptions",
31+
"FileSystemDirectoryHandle",
32+
"FileSystemFileHandle",
33+
"FileSystemGetFileOptions",
34+
"FileSystemGetDirectoryOptions",
35+
"FileSystemHandle",
36+
"FileSystemHandleKind",
37+
"FileSystemWritableFileStream",
38+
"WritableStream",
39+
"Blob",
40+
] }
41+
js-sys = { workspace = true }
42+
wasm-bindgen = { workspace = true }
43+
wasm-bindgen-futures = { workspace = true }
44+
futures = { workspace = true }
45+
46+
[dev-dependencies]
47+
tempfile = "3"
48+
futures = { workspace = true }

document/container/src/archive.rs

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
//! Archive codecs (zip, xz).
2+
//!
3+
//! Each codec streams entries in both directions: writers wrap an `io::Write` sink, and
4+
//! [`Archive::open`] reads from any `io::Read + Seek` source and streams entries into any [`Container`].
5+
6+
#[cfg(any(feature = "zip", feature = "xz"))]
7+
use crate::ContainerError;
8+
use crate::{Container, Result};
9+
use std::io::{Read, Seek, Write};
10+
11+
/// Hard cap on the total decompressed size a codec will materialize from one archive.
12+
/// Defends against decompression bombs at the cost of refusing legitimately huge archives.
13+
#[cfg(any(feature = "zip", feature = "xz"))]
14+
pub(crate) const MAX_DECOMPRESSED_SIZE: u64 = 4 * 1024 * 1024 * 1024; // 4GB
15+
16+
/// Charge one entry's declared `size` against the archive-wide budget and return it as a `usize`.
///
/// `total` is the running sum of declared sizes seen so far. It is updated (saturating) before the
/// limit check, so it keeps the new value even when this call fails. Both codecs funnel entries
/// through this helper so the decompression-bomb cap and the 32-bit-safe conversion live in one place.
///
/// # Errors
/// Returns [`ContainerError::SizeLimitExceeded`] when the cumulative declared size exceeds
/// `MAX_DECOMPRESSED_SIZE`, or when `size` itself does not fit in a `usize` on the current target.
#[cfg(any(feature = "zip", feature = "xz"))]
pub(crate) fn checked_entry_size(total: &mut u64, size: u64) -> Result<usize> {
    let running = total.saturating_add(size);
    *total = running;

    if running > MAX_DECOMPRESSED_SIZE {
        return Err(ContainerError::SizeLimitExceeded {
            declared: running,
            limit: MAX_DECOMPRESSED_SIZE,
        });
    }

    // `usize` is narrower than `u64` on 32-bit targets such as wasm; refuse instead of truncating.
    match usize::try_from(size) {
        Ok(length) => Ok(length),
        Err(_) => Err(ContainerError::SizeLimitExceeded {
            declared: size,
            limit: usize::MAX as u64,
        }),
    }
}
35+
36+
#[cfg(feature = "zip")]
37+
mod zip;
38+
#[cfg(feature = "zip")]
39+
pub use zip::{Zip, ZipWriter};
40+
41+
#[cfg(feature = "xz")]
42+
mod xz;
43+
#[cfg(feature = "xz")]
44+
pub use xz::{Xz, XzWriter};
45+
46+
/// Streaming archive codec. The associated `Writer` type wraps a `Write + Seek` sink (zip needs
47+
/// `Seek` for the central directory; xz doesn't but `Seek` is free on file-like sinks) and
48+
/// accepts entries one at a time. `finish` flushes the codec's trailer and consumes the wrapper.
49+
pub trait Archive {
50+
type Writer<W: Write + Seek>: ArchiveWriter
51+
where
52+
W: Write + Seek;
53+
54+
fn writer<W: Write + Seek>(output: W) -> Result<Self::Writer<W>>;
55+
56+
/// Read entries from `source` and write each into `dest`, streaming so neither the full
57+
/// archive nor the full container ever sits in memory at once.
58+
fn open<R: Read + Seek, C: Container>(source: R, dest: &mut C) -> Result<()>;
59+
}
60+
61+
pub trait ArchiveWriter {
62+
fn write_entry(&mut self, path: &str, bytes: &[u8]) -> Result<()>;
63+
fn finish(self) -> Result<()>;
64+
}
65+
66+
/// Archive container formats distinguishable by their leading magic bytes.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ArchiveFormat {
    Xz,
    Zip,
}

impl ArchiveFormat {
    /// Magic bytes at the start of every xz stream.
    const XZ_MAGIC: [u8; 6] = [0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00];
    /// Local file header signature (`PK\x03\x04`): the start of a zip archive with at least one entry.
    const ZIP_LOCAL_HEADER: [u8; 4] = [0x50, 0x4B, 0x03, 0x04];
    /// End-of-central-directory signature (`PK\x05\x06`): the start of a zip archive with no entries.
    const ZIP_EMPTY_ARCHIVE: [u8; 4] = [0x50, 0x4B, 0x05, 0x06];

    /// Sniff the format from the leading magic bytes.
    ///
    /// xz streams start with `FD 37 7A 58 5A 00`. Zip archives start with `50 4B 03 04` when they
    /// contain entries, or with `50 4B 05 06` when they are empty (the file is then just the
    /// end-of-central-directory record). Returns `None` for anything else, including input that
    /// is too short to hold a full signature.
    pub fn detect(bytes: &[u8]) -> Option<Self> {
        if bytes.starts_with(&Self::XZ_MAGIC) {
            Some(Self::Xz)
        } else if bytes.starts_with(&Self::ZIP_LOCAL_HEADER) || bytes.starts_with(&Self::ZIP_EMPTY_ARCHIVE) {
            Some(Self::Zip)
        } else {
            None
        }
    }
}
86+
87+
/// Unpack an in-memory archive into `dest`, choosing the codec from the magic header of `bytes`.
///
/// # Errors
/// Returns [`ContainerError::Codec`] when `bytes` is not recognized as xz or zip; otherwise
/// forwards whatever the selected codec's `open` returns.
#[cfg(all(feature = "xz", feature = "zip"))]
pub fn open_auto<C: Container>(bytes: &[u8], dest: &mut C) -> Result<()> {
    let Some(format) = ArchiveFormat::detect(bytes) else {
        return Err(ContainerError::Codec("unrecognized archive format (not xz or zip)".into()));
    };

    let cursor = std::io::Cursor::new(bytes);
    match format {
        ArchiveFormat::Xz => Xz::open(cursor, dest),
        ArchiveFormat::Zip => Zip::open(cursor, dest),
    }
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
//! Xz-compressed tarball archive codec.
2+
3+
use crate::archive::{Archive, ArchiveWriter, MAX_DECOMPRESSED_SIZE, checked_entry_size};
4+
use crate::{Container, ContainerError, Result, validate_path};
5+
use lzma_rust2::{XzOptions, XzReader, XzWriter as InnerXzWriter};
6+
use std::io::{Read, Seek, Write};
7+
8+
pub struct Xz;
9+
10+
/// xz-tar writer. Held as an `Option` so `finish` can take ownership and unwind the layered
11+
/// writers in the right order: drop the tar builder first to flush its trailer, then finish xz.
12+
pub struct XzWriter<W: Write + Seek> {
13+
tar: Option<tar::Builder<InnerXzWriter<W>>>,
14+
}
15+
16+
impl Archive for Xz {
17+
type Writer<W: Write + Seek> = XzWriter<W>;
18+
19+
fn writer<W: Write + Seek>(output: W) -> Result<Self::Writer<W>> {
20+
let xz_writer = InnerXzWriter::new(output, XzOptions::default()).map_err(lzma_err)?;
21+
Ok(XzWriter {
22+
tar: Some(tar::Builder::new(xz_writer)),
23+
})
24+
}
25+
26+
fn open<R: Read + Seek, C: Container>(source: R, dest: &mut C) -> Result<()> {
27+
// `take` bounds how many bytes we decompress from the xz stream, but each tar entry's declared
28+
// size is fed to `write_sized`, which pre-allocates from it before reading. Cap the cumulative
29+
// declared size too so a header claiming a huge size can't trigger a giant allocation up front.
30+
let xz_reader = XzReader::new(source, false);
31+
let bounded = xz_reader.take(MAX_DECOMPRESSED_SIZE);
32+
33+
let mut tar_reader = tar::Archive::new(bounded);
34+
let mut total_size = 0u64;
35+
36+
for entry in tar_reader.entries()? {
37+
let mut entry = entry?;
38+
if entry.header().entry_type() != tar::EntryType::Regular {
39+
continue;
40+
}
41+
// Reject non-UTF8 entry names rather than lossily rewriting them, so the path we store matches
42+
// the archive exactly.
43+
let path = entry.path()?;
44+
let path = path.to_str().ok_or_else(|| ContainerError::Codec(format!("tar: non-UTF8 entry name {path:?}")))?.to_owned();
45+
validate_path(&path)?;
46+
47+
let size = checked_entry_size(&mut total_size, entry.size())?;
48+
49+
dest.write_sized(&path, size, &mut |buffer| {
50+
entry.read_exact(buffer).map_err(ContainerError::Io)?;
51+
Ok(())
52+
})?;
53+
}
54+
55+
Ok(())
56+
}
57+
}
58+
59+
impl<W: Write + Seek> ArchiveWriter for XzWriter<W> {
60+
fn write_entry(&mut self, path: &str, bytes: &[u8]) -> Result<()> {
61+
validate_path(path)?;
62+
let tar = self.tar.as_mut().ok_or_else(|| ContainerError::Codec("XzWriter already finished".into()))?;
63+
let mut header = tar::Header::new_gnu();
64+
header.set_path(path).map_err(|error| ContainerError::Codec(format!("tar: invalid path {path}: {error}")))?;
65+
header.set_size(bytes.len() as u64);
66+
header.set_mode(0o644);
67+
header.set_cksum();
68+
tar.append(&header, bytes)?;
69+
Ok(())
70+
}
71+
72+
fn finish(mut self) -> Result<()> {
73+
self.finish_inner()?;
74+
Ok(())
75+
}
76+
}
77+
78+
impl<W: Write + Seek> XzWriter<W> {
79+
/// Finish the archive and return the underlying sink, for in-memory archives where the caller
80+
/// wants the written bytes (e.g. `Cursor<Vec<u8>>`) back.
81+
pub fn finish_into(mut self) -> Result<W> {
82+
self.finish_inner()
83+
}
84+
85+
/// Unwind the layered writers in order (flush the tar trailer, then finish xz) and hand back the
86+
/// innermost sink. Shared by `finish` and `finish_into`.
87+
fn finish_inner(&mut self) -> Result<W> {
88+
let mut tar = self.tar.take().ok_or_else(|| ContainerError::Codec("XzWriter already finished".into()))?;
89+
tar.finish()?;
90+
let xz_writer = tar.into_inner()?;
91+
xz_writer.finish().map_err(lzma_err)
92+
}
93+
}
94+
95+
fn lzma_err(error: std::io::Error) -> ContainerError {
96+
ContainerError::Codec(format!("lzma: {error}"))
97+
}

0 commit comments

Comments
 (0)