From 8258ca4a0d574b4ab69b0fbd93f5166e78be153f Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 25 Apr 2019 14:20:17 -0700 Subject: [PATCH 1/7] Only execute `exclude_from_backups` once This was currently getting executed on all builds, even if the directory already exists. There shouldn't be any reason though to exclude the directory from backups on all builds, and after seeing this get a stack sample in a profile I figured it's best to ensure it only executes once in case the backing system implementation isn't the speediest. --- src/cargo/core/compiler/layout.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cargo/core/compiler/layout.rs b/src/cargo/core/compiler/layout.rs index 9ea14800a4d..cc48a41ad69 100644 --- a/src/cargo/core/compiler/layout.rs +++ b/src/cargo/core/compiler/layout.rs @@ -160,10 +160,9 @@ impl Layout { pub fn prepare(&mut self) -> io::Result<()> { if fs::metadata(&self.root).is_err() { fs::create_dir_all(&self.root)?; + self.exclude_from_backups(&self.root); } - self.exclude_from_backups(&self.root); - mkdir(&self.deps)?; mkdir(&self.native)?; mkdir(&self.incremental)?; From c7e1b68918eccc8e289f378c5178ea5852a7feda Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 25 Apr 2019 17:46:16 -0700 Subject: [PATCH 2/7] Don't allocate in `SourceId::is_default_registry` This gets called quite a lot and doesn't need to allocate in the first place! --- src/cargo/core/source/source_id.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cargo/core/source/source_id.rs b/src/cargo/core/source/source_id.rs index 16b329336c1..f0ac5f52087 100644 --- a/src/cargo/core/source/source_id.rs +++ b/src/cargo/core/source/source_id.rs @@ -337,7 +337,7 @@ impl SourceId { Kind::Registry => {} _ => return false, } - self.inner.url.to_string() == CRATES_IO_INDEX + self.inner.url.as_str() == CRATES_IO_INDEX } /// Hashes `self`. From 6babe72e7c755989f19f0abb953aae3fb4ad6262 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 25 Apr 2019 14:21:27 -0700 Subject: [PATCH 3/7] Parse less JSON on null builds This commit fixes a performance pathology in Cargo today. Whenever Cargo generates a lock file (which happens on all invocations of `cargo build` for example) Cargo will parse the crates.io index to learn about dependencies. Currently, however, when it parses a crate it parses the JSON blob for every single version of the crate. With a lock file, however, or with incremental builds only one of these lines of JSON is relevant. Measured today Cargo building Cargo parses 3700 JSON dependencies in the registry. This commit implements an optimization that brings down the number of parsed JSON lines in the registry to precisely the right number necessary to build a project. For example Cargo has 150 crates in its lock file, so now it only parses 150 JSON lines (a 20x reduction from 3700). This in turn can greatly improve Cargo's null build time. Cargo building Cargo dropped from 120ms to 60ms on a Linux machine and 400ms to 200ms on a Mac. The commit internally has a lot more details about how this is done but the general idea is to have a cache which is optimized for Cargo to read which is maintained automatically by Cargo. 
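Roughly, the cache's shape can be sketched like this (an illustrative sketch only, not the patch's actual types; those are `Summaries` and `SummariesCache` in the diff below):

```rust
use semver::Version;

// Per-crate cache: every known version is listed up front, and each version
// points at the byte range of its still-unparsed JSON line in `raw_data`.
struct CachedCrate {
    raw_data: Vec<u8>,                        // all JSON lines, concatenated
    versions: Vec<(Version, (usize, usize))>, // version -> (start, end)
}
```

With a lock file present the resolver asks for exact versions, so at most one JSON line per crate ever needs to be deserialized.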
Closes #6866 --- Cargo.toml | 1 + src/cargo/core/mod.rs | 1 + src/cargo/sources/registry/index.rs | 635 +++++++++++++++++++++------ src/cargo/sources/registry/mod.rs | 15 +- src/cargo/sources/registry/remote.rs | 7 +- 5 files changed, 523 insertions(+), 136 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a598dc37747..1fc3ae2ed99 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ lazycell = "1.2.0" libc = "0.2" log = "0.4.6" libgit2-sys = "0.7.9" +memchr = "2.1.3" num_cpus = "1.0" opener = "0.3.0" rustfix = "0.4.4" diff --git a/src/cargo/core/mod.rs b/src/cargo/core/mod.rs index 27265541130..761eadf4525 100644 --- a/src/cargo/core/mod.rs +++ b/src/cargo/core/mod.rs @@ -14,6 +14,7 @@ pub use self::shell::{Shell, Verbosity}; pub use self::source::{GitReference, Source, SourceId, SourceMap}; pub use self::summary::{FeatureMap, FeatureValue, Summary}; pub use self::workspace::{Members, Workspace, WorkspaceConfig, WorkspaceRootConfig}; +pub use self::interning::InternedString; pub mod compiler; pub mod dependency; diff --git a/src/cargo/sources/registry/index.rs b/src/cargo/sources/registry/index.rs index 9d47091b29a..3c439588c1d 100644 --- a/src/cargo/sources/registry/index.rs +++ b/src/cargo/sources/registry/index.rs @@ -1,12 +1,83 @@ +//! Management of the index of a registry source +//! +//! This module contains management of the index and various operations, such as +//! actually parsing the index, looking for crates, etc. This is intended to be +//! abstract over remote indices (downloaded via git) and local registry indices +//! (which are all just present on the filesystem). +//! +//! ## Index Performance +//! +//! One important aspect of the index is that we want to optimize the "happy +//! path" as much as possible. Whenever you type `cargo build` Cargo will +//! *always* reparse the registry and learn about dependency information. This +//! is done because Cargo needs to learn about the upstream crates.io crates +//! that you're using and ensure that the preexisting `Cargo.lock` still matches +//! the current state of the world. +//! +//! Consequently, Cargo "null builds" (the index that Cargo adds to each build +//! itself) need to be fast when accessing the index. The primary performance +//! optimization here is to avoid parsing JSON blobs from the registry if we +//! don't need them. Most secondary optimizations are centered around removing +//! allocations and such, but avoiding parsing JSON is the #1 optimization. +//! +//! When we get queries from the resolver we're given a `Dependency`. This +//! dependency in turn has a version requirement, and with lock files that +//! already exist these version requirements are exact version requirements +//! `=a.b.c`. This means that we in theory only need to parse one line of JSON +//! per query in the registry, the one that matches version `a.b.c`. +//! +//! The crates.io index, however, is not amenable to this form of query. Instead +//! the crates.io index simply is a file where each line is a JSON blob. To +//! learn about the versions in each JSON blob we would need to parse the JSON, +//! defeating the purpose of trying to parse as little as possible. +//! +//! > Note that as a small aside even *loading* the JSON from the registry is +//! > actually pretty slow. For crates.io and remote registries we don't +//! > actually check out the git index on disk because that takes quite some +//! > time and is quite large. Instead we use `libgit2` to read the JSON from +//! > the raw git objects. 
This in turn can be slow (aka show up high in
+//! > profiles) because libgit2 has to do deflate decompression and such.
+//!
+//! To solve all these issues a strategy is employed here where Cargo basically
+//! creates an index into the index. The first time a package is queried about
+//! (first time being for an entire computer) Cargo will load the contents
+//! (slowly via libgit2) from the registry. It will then (slowly) parse every
+//! single line to learn about its versions. Afterwards, however, Cargo will
+//! emit a new file (a cache) which is amenable for speedily parsing in future
+//! invocations.
+//!
+//! This cache file is currently organized by basically having the semver
+//! version extracted from each JSON blob. That way Cargo can quickly and easily
+//! parse all versions contained and which JSON blob they're associated with.
+//! The JSON blob then doesn't actually need to get parsed unless that version
+//! is requested.
+//!
+//! Altogether the initial measurements of this show a massive improvement for
+//! Cargo null build performance. It's expected that the improvements earned
+//! here will continue to grow over time in the sense that the previous
+//! implementation (parse all lines each time) actually continues to slow down
+//! over time as new versions of a crate are published. In any case when first
+//! implemented a null build of Cargo itself would parse 3700 JSON blobs from
+//! the registry and load 150 blobs from git. Afterwards it parses 150 JSON
+//! blobs and loads 0 files from git. Removing 200ms or more from Cargo's
+//! startup time is certainly nothing to sneeze at!
+//!
+//! Note that this is just a high-level overview, there's of course lots of
+//! details like invalidating caches and whatnot which are handled below, but
+//! hopefully those are more obvious inline in the code itself.

 use std::collections::{HashMap, HashSet};
+use std::fs::{self, File};
+use std::io::Read;
 use std::path::Path;
 use std::str;

-use log::{info, trace};
-use semver::Version;
+use filetime::FileTime;
+use log::info;
+use semver::{Version, VersionReq};

 use crate::core::dependency::Dependency;
-use crate::core::{PackageId, SourceId, Summary};
+use crate::core::{InternedString, PackageId, SourceId, Summary};
 use crate::sources::registry::RegistryData;
 use crate::sources::registry::{RegistryPackage, INDEX_LOCK};
 use crate::util::{internal, CargoResult, Config, Filesystem, ToSemver};
@@ -99,13 +170,68 @@ fn overflow_hyphen() {
 pub struct RegistryIndex<'cfg> {
     source_id: SourceId,
     path: Filesystem,
-    cache: HashMap<&'static str, Vec<(Summary, bool)>>,
-    // `(name, vers)` -> `checksum`
-    hashes: HashMap<&'static str, HashMap<Version, String>>,
+    summaries_cache: HashMap<InternedString, Summaries>,
     config: &'cfg Config,
     locked: bool,
 }

+/// An internal cache of summaries for a particular package.
+///
+/// A list of summaries is loaded from disk via one of two methods:
+///
+/// 1. Primarily Cargo will parse the corresponding file for a crate in the
+///    upstream crates.io registry. That's just a JSON blob per line which we
+///    can parse, extract the version, and then store here.
+///
+/// 2. Alternatively, if Cargo has previously run, we'll have a cached index of
+///    dependencies for the upstream index. This is a file that Cargo maintains
+///    lazily on the local filesystem and is much faster to parse since it
+///    doesn't involve parsing all of the JSON.
+///
+/// To the outward-facing interface it doesn't matter too much where this is
+/// loaded from, but it's important when reading the implementation to note that
+/// we try to parse as little as possible!
+#[derive(Default)]
+struct Summaries {
+    /// A raw vector of uninterpreted bytes. This is what `Unparsed` start/end
+    /// fields are indexes into. If a `Summaries` is loaded from the crates.io
+    /// index then this field will be empty since nothing is `Unparsed`.
+    raw_data: Vec<u8>,
+
+    /// All known versions of a crate, keyed from their `Version` to the
+    /// possibly parsed or unparsed version of the full summary.
+    versions: HashMap<Version, MaybeIndexSummary>,
+}
+
+/// A lazily parsed `IndexSummary`.
+enum MaybeIndexSummary {
+    /// A summary which has not been parsed. The `start` and `end` are pointers
+    /// into `Summaries::raw_data` which this is an entry of.
+    Unparsed { start: usize, end: usize },
+
+    /// An actually parsed summary.
+    Parsed(IndexSummary),
+}
+
+/// A parsed representation of a summary from the index.
+///
+/// In addition to a full `Summary` we have a few auxiliary pieces of
+/// information like `yanked` and what the checksum hash is.
+pub struct IndexSummary {
+    pub summary: Summary,
+    pub yanked: bool,
+    pub hash: String,
+}
+
+/// A representation of the cache on disk that Cargo maintains of summaries.
+/// Cargo will initially parse all summaries in the registry and will then
+/// serialize that into this form and place it in a new location on disk,
+/// ensuring that access in the future is much speedier.
+#[derive(Default)]
+struct SummariesCache<'a> {
+    versions: Vec<(Version, &'a [u8])>,
+}
+
 impl<'cfg> RegistryIndex<'cfg> {
     pub fn new(
         source_id: SourceId,
@@ -116,8 +242,7 @@ impl<'cfg> RegistryIndex<'cfg> {
         RegistryIndex {
             source_id,
             path: path.clone(),
-            cache: HashMap::new(),
-            hashes: HashMap::new(),
+            summaries_cache: HashMap::new(),
             config,
             locked,
         }
@@ -125,141 +250,138 @@ impl<'cfg> RegistryIndex<'cfg> {

     /// Returns the hash listed for a specified `PackageId`.
     pub fn hash(&mut self, pkg: PackageId, load: &mut dyn RegistryData) -> CargoResult<String> {
-        let name = pkg.name().as_str();
-        let version = pkg.version();
-        if let Some(s) = self.hashes.get(name).and_then(|v| v.get(version)) {
-            return Ok(s.clone());
-        }
-        // Ok, we're missing the key, so parse the index file to load it.
-        self.summaries(name, load)?;
-        self.hashes
-            .get(name)
-            .and_then(|v| v.get(version))
-            .ok_or_else(|| internal(format!("no hash listed for {}", pkg)))
-            .map(|s| s.clone())
+        let req = VersionReq::exact(pkg.version());
+        let summary = self
+            .summaries(pkg.name(), &req, load)?
+            .next()
+            .ok_or_else(|| internal(format!("no hash listed for {}", pkg)))?;
+        Ok(summary.hash.clone())
     }
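As an aside, the exact-requirement trick that `hash` relies on can be sketched in isolation (illustrative code only, not part of the patch; it assumes nothing beyond the `semver` crate):

```rust
use std::collections::HashMap;
use semver::{Version, VersionReq};

// `lines` maps each published version to its raw, still-unparsed JSON line.
fn lines_to_parse<'a>(
    lines: &'a HashMap<Version, Vec<u8>>,
    req: &'a VersionReq,
) -> impl Iterator<Item = &'a [u8]> + 'a {
    lines
        .iter()
        // A lock file yields an exact `=a.b.c` requirement, so this keeps
        // (and therefore later deserializes) at most one line per query.
        .filter(move |(v, _)| req.matches(v))
        .map(|(_, line)| line.as_slice())
}
```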
-    /// Parses the on-disk metadata for the package provided.
+    /// Load a list of summaries for `name` package in this registry which
+    /// match `req`.
     ///
-    /// Returns a list of pairs of `(summary, yanked)` for the package name specified.
-    pub fn summaries(
-        &mut self,
-        name: &'static str,
+    /// This function will semantically parse the on-disk index, match all
+    /// versions, and then return an iterator over all summaries which matched.
+    /// Internally there are quite a few layers of caching to amortize this cost
+    /// though since this method is called quite a lot on null builds in Cargo.
+    pub fn summaries<'a, 'b>(
+        &'a mut self,
+        name: InternedString,
+        req: &'b VersionReq,
         load: &mut dyn RegistryData,
-    ) -> CargoResult<&Vec<(Summary, bool)>> {
-        if self.cache.contains_key(name) {
-            return Ok(&self.cache[name]);
-        }
+    ) -> CargoResult<impl Iterator<Item = &'a IndexSummary> + 'b>
+    where
+        'a: 'b,
+    {
+        let source_id = self.source_id.clone();
+
+        // First up actually parse what summaries we have available. If Cargo
+        // has run previously this will parse a Cargo-specific cache file rather
+        // than the registry itself. In effect this is intended to be a quite
+        // cheap operation.
         let summaries = self.load_summaries(name, load)?;
-        self.cache.insert(name, summaries);
-        Ok(&self.cache[name])
+
+        // Iterate over our summaries, extract all relevant ones which match our
+        // version requirement, and then parse all corresponding rows in the
+        // registry. As a reminder this `summaries` method is called for each
+        // entry in a lock file on every build, so we want to absolutely
+        // minimize the amount of work being done here and parse as little as
+        // necessary.
+        let raw_data = &summaries.raw_data;
+        Ok(summaries
+            .versions
+            .iter_mut()
+            .filter_map(move |(k, v)| if req.matches(k) { Some(v) } else { None })
+            .filter_map(move |maybe| match maybe.parse(raw_data, source_id) {
+                Ok(summary) => Some(summary),
+                Err(e) => {
+                    info!("failed to parse `{}` registry package: {}", name, e);
+                    None
+                }
+            }))
     }

     fn load_summaries(
         &mut self,
-        name: &str,
+        name: InternedString,
         load: &mut dyn RegistryData,
-    ) -> CargoResult<Vec<(Summary, bool)>> {
+    ) -> CargoResult<&mut Summaries> {
+        // If we've previously loaded what versions are present for `name`, just
+        // return that since our cache should still be valid.
+        if self.summaries_cache.contains_key(&name) {
+            return Ok(self.summaries_cache.get_mut(&name).unwrap());
+        }
+
         // Prepare the `RegistryData` which will lazily initialize internal data
         // structures. Note that this is also importantly needed to initialize
         // to avoid deadlocks where we acquire a lock below but the `load`
         // function inside *also* wants to acquire a lock. See an instance of
         // this on #5551.
         load.prepare()?;
-        let (root, _lock) = if self.locked {
+
+        // Synchronize access to the index. For remote indices we want to make
+        // sure that while we're reading the index no one is trying to update
+        // it.
+        let (root, lock) = if self.locked {
             let lock = self
                 .path
                 .open_ro(Path::new(INDEX_LOCK), self.config, "the registry index");
             match lock {
                 Ok(lock) => (lock.path().parent().unwrap().to_path_buf(), Some(lock)),
-                Err(_) => return Ok(Vec::new()),
+                Err(_) => {
+                    self.summaries_cache.insert(name, Summaries::default());
+                    return Ok(self.summaries_cache.get_mut(&name).unwrap());
+                }
             }
         } else {
             (self.path.clone().into_path_unlocked(), None)
         };
+        let cache_root = root.join(".cache");

+        // TODO: comment
+        let lock_mtime = lock
+            .as_ref()
+            .and_then(|l| l.file().metadata().ok())
+            .map(|t| FileTime::from_last_modification_time(&t));
+
+        // See module comment in `registry/mod.rs` for why this is structured
+        // the way it is.
         let fs_name = name
             .chars()
             .flat_map(|c| c.to_lowercase())
             .collect::<String>();
-
-        // See module comment for why this is structured the way it is.
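        // For example (crates.io index layout): a crate `a` lives at `1/a`,
        // `ab` at `2/ab`, `abc` at `3/a/abc`, and `serde` at `se/rd/serde`.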
        let raw_path = match fs_name.len() {
            1 => format!("1/{}", fs_name),
            2 => format!("2/{}", fs_name),
            3 => format!("3/{}/{}", &fs_name[..1], fs_name),
            _ => format!("{}/{}/{}", &fs_name[0..2], &fs_name[2..4], fs_name),
        };

-        let mut ret = Vec::new();
-        for path in UncanonicalizedIter::new(&raw_path).take(1024) {
-            let mut hit_closure = false;
-            let err = load.load(&root, Path::new(&path), &mut |contents| {
-                hit_closure = true;
-                let contents = str::from_utf8(contents)
-                    .map_err(|_| failure::format_err!("registry index file was not valid utf-8"))?;
-                ret.reserve(contents.lines().count());
-                let lines = contents.lines().map(|s| s.trim()).filter(|l| !l.is_empty());
-                // Attempt forwards-compatibility on the index by ignoring
-                // everything that we ourselves don't understand, that should
-                // allow future cargo implementations to break the
-                // interpretation of each line here and older cargo will simply
-                // ignore the new lines.
-                ret.extend(lines.filter_map(|line| {
-                    let (summary, locked) = match self.parse_registry_package(line) {
-                        Ok(p) => p,
-                        Err(e) => {
-                            info!("failed to parse `{}` registry package: {}", name, e);
-                            trace!("line: {}", line);
-                            return None;
-                        }
-                    };
-                    Some((summary, locked))
-                }));
-
-                Ok(())
-            });
-
-            // We ignore lookup failures as those are just crates which don't exist
-            // or we haven't updated the registry yet. If we actually ran the
-            // closure though then we care about those errors.
-            if hit_closure {
-                err?;
-                // Crates.io ensures that there is only one hyphen and underscore equivalent
-                // result in the index so return when we find it.
-                return Ok(ret);
+        // Attempt to handle misspellings by searching for a chain of related
+        // names to the original `raw_path` name. Only return summaries
+        // associated with the first hit, however. The resolver will later
+        // reject any candidates that have the wrong name, and with this it'll
+        // along the way produce helpful "did you mean?" suggestions.
+        for path in UncanonicalizedIter::new(&raw_path).take(1024) {
+            let summaries = Summaries::parse(
+                lock_mtime,
+                &root,
+                &cache_root,
+                path.as_ref(),
+                self.source_id,
+                load,
+            )?;
+            if let Some(summaries) = summaries {
+                self.summaries_cache.insert(name, summaries);
+                return Ok(self.summaries_cache.get_mut(&name).unwrap());
            }
        }
-        Ok(ret)
-    }
-
-    /// Parses a line from the registry's index file into a `Summary` for a package.
-    ///
-    /// The returned boolean is whether or not the summary has been yanked.
-    fn parse_registry_package(&mut self, line: &str) -> CargoResult<(Summary, bool)> {
-        let RegistryPackage {
-            name,
-            vers,
-            cksum,
-            deps,
-            features,
-            yanked,
-            links,
-        } = serde_json::from_str(line)?;
-        let pkgid = PackageId::new(&name, &vers, self.source_id)?;
-        let name = pkgid.name();
-        let deps = deps
-            .into_iter()
-            .map(|dep| dep.into_dep(self.source_id))
-            .collect::<CargoResult<Vec<_>>>()?;
-        let mut summary = Summary::new(pkgid, deps, &features, links, false)?;
-        summary.set_checksum(cksum.clone());
-        self.hashes
-            .entry(name.as_str())
-            .or_insert_with(HashMap::new)
-            .insert(vers, cksum);
-        Ok((summary, yanked.unwrap_or(false)))
+        // If nothing was found then this crate doesn't exist, so just use an
+        // empty `Summaries` list.
+        self.summaries_cache.insert(name, Summaries::default());
+        Ok(self.summaries_cache.get_mut(&name).unwrap())
     }

     pub fn query_inner(
@@ -295,31 +417,31 @@ impl<'cfg> RegistryIndex<'cfg> {
         online: bool,
     ) -> CargoResult<()> {
         let source_id = self.source_id;
-        let name = dep.package_name().as_str();
-        let summaries = self.summaries(name, load)?;
-        let summaries = summaries
-            .iter()
-            .filter(|&(summary, yanked)| {
-                // Note: This particular logic can cause problems with
-                // optional dependencies when offline. If at least 1 version
-                // of an optional dependency is downloaded, but that version
-                // does not satisfy the requirements, then resolution will
-                // fail. Unfortunately, whether or not something is optional
-                // is not known here.
-                (online || load.is_crate_downloaded(summary.package_id()))
-                    && (!yanked || {
-                        log::debug!("{:?}", yanked_whitelist);
-                        log::debug!("{:?}", summary.package_id());
-                        yanked_whitelist.contains(&summary.package_id())
-                    })
-            })
-            .map(|s| s.0.clone());
+        let summaries = self
+            .summaries(dep.package_name(), dep.version_req(), load)?
+            // First filter summaries for `--offline`. If we're online then
+            // everything is a candidate, otherwise if we're offline we're only
+            // going to consider candidates which are actually present on disk.
+            //
+            // Note: This particular logic can cause problems with
+            // optional dependencies when offline. If at least 1 version
+            // of an optional dependency is downloaded, but that version
+            // does not satisfy the requirements, then resolution will
+            // fail. Unfortunately, whether or not something is optional
+            // is not known here.
+            .filter(|s| (online || load.is_crate_downloaded(s.summary.package_id())))
+            // Next filter out all yanked packages. Some yanked packages may
+            // leak through if they're in a whitelist (aka if they were
+            // previously in `Cargo.lock`).
+            .filter(|s| !s.yanked || yanked_whitelist.contains(&s.summary.package_id()))
+            .map(|s| s.summary.clone());

         // Handle `cargo update --precise` here. If specified, our own source
         // will have a precise version listed of the form
        // `<pkg>=<p_req>-><f_req>` where `<pkg>` is the name of a crate on
        // this source, `<p_req>` is the version installed and `<f_req>` is the
        // version requested (argument to `--precise`).
+        let name = dep.package_name().as_str();
         let summaries = summaries.filter(|s| match source_id.precise() {
             Some(p) if p.starts_with(name) && p[name.len()..].starts_with('=') => {
                 let mut vers = p[name.len() + 1..].splitn(2, "->");
@@ -344,11 +466,270 @@
     }

     pub fn is_yanked(&mut self, pkg: PackageId, load: &mut dyn RegistryData) -> CargoResult<bool> {
-        let summaries = self.summaries(pkg.name().as_str(), load)?;
-        let found = summaries
-            .iter()
-            .filter(|&(summary, _yanked)| summary.version() == pkg.version())
-            .any(|(_summary, yanked)| *yanked);
+        let req = VersionReq::exact(pkg.version());
+        let found = self
+            .summaries(pkg.name(), &req, load)?
+            .any(|summary| summary.yanked);
         Ok(found)
     }
 }
+
+impl Summaries {
+    /// Parse out a `Summaries` instance from on-disk state.
+    ///
+    /// This will attempt to prefer parsing a previous cache file that already
+    /// exists from a previous invocation of Cargo (aka you're typing `cargo
+    /// build` again after typing it previously). If parsing fails or the cache
+    /// isn't found, then we take a slower path which loads the full descriptor
+    /// for `relative` from the underlying index (aka typically libgit2 with
+    /// crates.io) and then parse everything in there.
+    ///
+    /// * `lock_mtime` - this is a file modification time where if any cache
+    ///   file is older than this the cache should be considered out of date and
+    ///   needs to be rebuilt.
+    /// * `root` - this is the root argument passed to `load`
+    /// * `cache_root` - this is the root on the filesystem itself of where to
+    ///   store cache files.
+    /// * `relative` - this is the file we're loading from cache or the index
+    ///   data
+    /// * `source_id` - the registry's SourceId used when parsing JSON blobs to
+    ///   create summaries.
+    /// * `load` - the actual index implementation which may be very slow to
+    ///   call. We avoid this if we can.
+    pub fn parse(
+        lock_mtime: Option<FileTime>,
+        root: &Path,
+        cache_root: &Path,
+        relative: &Path,
+        source_id: SourceId,
+        load: &mut dyn RegistryData,
+    ) -> CargoResult<Option<Summaries>> {
+        // First up, attempt to load the cache. This could fail for all manner
+        // of reasons, but consider all of them non-fatal and just log their
+        // occurrence in case anyone is debugging anything.
+        let cache_path = cache_root.join(relative);
+        if let Some(lock_mtime) = lock_mtime {
+            match File::open(&cache_path) {
+                Ok(file) => {
+                    let metadata = file.metadata()?;
+                    let cache_mtime = FileTime::from_last_modification_time(&metadata);
+                    if cache_mtime > lock_mtime {
+                        log::debug!("cache for {:?} is fresh", relative);
+                        match Summaries::parse_cache(&file, &metadata) {
+                            Ok(s) => return Ok(Some(s)),
+                            Err(e) => {
+                                log::debug!("failed to parse {:?} cache: {}", relative, e);
+                            }
+                        }
+                    } else {
+                        log::debug!("cache for {:?} is out of date", relative);
+                    }
+                }
+                Err(e) => log::debug!("cache for {:?} error: {}", relative, e),
+            }
+        }
+
+        // This is the fallback path where we actually talk to libgit2 to load
+        // information. Here we parse every single line in the index (as we need
+        // to find the versions).
+        log::debug!("slow path for {:?}", relative);
+        let mut ret = Summaries::default();
+        let mut hit_closure = false;
+        let mut cache_bytes = Vec::new();
+        let err = load.load(root, relative, &mut |contents| {
+            ret.raw_data = contents.to_vec();
+            let mut cache = SummariesCache::default();
+            hit_closure = true;
+            let mut start = 0;
+            for end in memchr::Memchr::new(b'\n', contents) {
+                // Attempt forwards-compatibility on the index by ignoring
+                // everything that we ourselves don't understand, that should
+                // allow future cargo implementations to break the
+                // interpretation of each line here and older cargo will simply
+                // ignore the new lines.
+                let line = &contents[start..end];
+                start = end + 1;
+                let summary = match IndexSummary::parse(line, source_id) {
+                    Ok(summary) => summary,
+                    Err(e) => {
+                        log::info!("failed to parse {:?} registry package: {}", relative, e);
+                        continue;
+                    }
+                };
+                let version = summary.summary.package_id().version().clone();
+                cache.versions.push((version.clone(), line));
+                ret.versions.insert(version, summary.into());
+            }
+            cache_bytes = cache.serialize();
+            Ok(())
+        });

+        // We ignore lookup failures as those are just crates which don't exist
+        // or we haven't updated the registry yet. If we actually ran the
+        // closure though then we care about those errors.
+        if !hit_closure {
+            return Ok(None);
+        }
+        err?;
+
+        // Once we have our `cache_bytes` which represents the `Summaries` we're
+        // about to return, write that back out to disk so future Cargo
+        // invocations can use it.
+        //
+        // This is opportunistic so we ignore failure here but are sure to log
+        // something in case of error.
+        if fs::create_dir_all(cache_path.parent().unwrap()).is_ok() {
+            // TODO: somehow need to globally synchronize this
+            if let Err(e) = fs::write(cache_path, cache_bytes) {
+                log::info!("failed to write cache: {}", e);
+            }
+        }
+        Ok(Some(ret))
+    }
+
+    /// Parses an open `File` which represents information previously cached by
+    /// Cargo.
+    pub fn parse_cache(mut file: &File, meta: &fs::Metadata) -> CargoResult<Summaries> {
+        let mut contents = Vec::new();
+        contents.reserve(meta.len() as usize + 1);
+        file.read_to_end(&mut contents)?;
+        let cache = SummariesCache::parse(&contents)?;
+        let mut ret = Summaries::default();
+        for (version, summary) in cache.versions {
+            let (start, end) = subslice_bounds(&contents, summary);
+            ret.versions
+                .insert(version, MaybeIndexSummary::Unparsed { start, end });
+        }
+        ret.raw_data = contents;
+        return Ok(ret);
+
+        // Returns the start/end offsets of `inner` within `outer`. Asserts that
+        // `inner` is a subslice of `outer`.
+        fn subslice_bounds(outer: &[u8], inner: &[u8]) -> (usize, usize) {
+            let outer_start = outer.as_ptr() as usize;
+            let outer_end = outer_start + outer.len();
+            let inner_start = inner.as_ptr() as usize;
+            let inner_end = inner_start + inner.len();
+            assert!(inner_start >= outer_start);
+            assert!(inner_end <= outer_end);
+            (inner_start - outer_start, inner_end - outer_start)
+        }
+    }
+}
+
+// Implementation of serializing/deserializing the cache of summaries on disk.
+// Currently the format looks like:
+//
+// +--------------+----------------+---+----------------+---+
+// | version byte | version string | 0 | JSON blob ...  | 0 | ...
+// +--------------+----------------+---+----------------+---+
+//
+// The idea is that this is a very easy file for Cargo to parse in future
+// invocations. The read from disk should be quite fast and then afterwards all
+// we need to know is what versions correspond to which JSON blob.
+//
+// The leading version byte is intended to ensure that there's some level of
+// future compatibility against changes to this cache format so if different
+// versions of Cargo share the same cache they don't get too confused.
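+//
+// As a concrete sketch (illustrative bytes, not a real crate), a package with
+// versions 0.1.0 and 0.1.1 would serialize roughly as:
+//
+//   0x01 b"0.1.0" 0x00 b"{...json for 0.1.0...}" 0x00 b"0.1.1" 0x00 b"{...json for 0.1.1...}" 0x00
+//
+// i.e. a single leading version byte, then alternating NUL-terminated version
+// strings and JSON blobs.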
+
+const CURRENT_CACHE_VERSION: u8 = 1;
+
+impl<'a> SummariesCache<'a> {
+    fn parse(data: &'a [u8]) -> CargoResult<SummariesCache<'a>> {
+        // NB: keep this method in sync with `serialize` below
+        let (first_byte, rest) = data
+            .split_first()
+            .ok_or_else(|| failure::format_err!("malformed cache"))?;
+        if *first_byte != CURRENT_CACHE_VERSION {
+            failure::bail!("looks like a different Cargo's cache, bailing out");
+        }
+        let mut iter = memchr::Memchr::new(0, rest);
+        let mut start = 0;
+        let mut ret = SummariesCache::default();
+        while let Some(version_end) = iter.next() {
+            let version = &rest[start..version_end];
+            let version = str::from_utf8(version)?;
+            let version = Version::parse(version)?;
+            let summary_end = iter.next().unwrap();
+            let summary = &rest[version_end + 1..summary_end];
+            ret.versions.push((version, summary));
+            start = summary_end + 1;
+        }
+        Ok(ret)
+    }
+
+    fn serialize(&self) -> Vec<u8> {
+        // NB: keep this method in sync with `parse` above
+        let size = self
+            .versions
+            .iter()
+            .map(|(_version, data)| (10 + data.len()))
+            .sum();
+        let mut contents = Vec::with_capacity(size);
+        contents.push(CURRENT_CACHE_VERSION);
+        for (version, data) in self.versions.iter() {
+            contents.extend_from_slice(version.to_string().as_bytes());
+            contents.push(0);
+            contents.extend_from_slice(data);
+            contents.push(0);
+        }
+        return contents;
+    }
+}
+
+impl MaybeIndexSummary {
+    /// Parses this "maybe a summary" into a `Parsed` for sure variant.
+    ///
+    /// Does nothing if this is already `Parsed`, and otherwise the `raw_data`
+    /// passed in is sliced with the bounds in `Unparsed` and then actually
+    /// parsed.
+    fn parse(&mut self, raw_data: &[u8], source_id: SourceId) -> CargoResult<&IndexSummary> {
+        let (start, end) = match self {
+            MaybeIndexSummary::Unparsed { start, end } => (*start, *end),
+            MaybeIndexSummary::Parsed(summary) => return Ok(summary),
+        };
+        let summary = IndexSummary::parse(&raw_data[start..end], source_id)?;
+        *self = MaybeIndexSummary::Parsed(summary);
+        match self {
+            MaybeIndexSummary::Unparsed { .. } => unreachable!(),
+            MaybeIndexSummary::Parsed(summary) => Ok(summary),
+        }
+    }
+}
+
+impl From<IndexSummary> for MaybeIndexSummary {
+    fn from(summary: IndexSummary) -> MaybeIndexSummary {
+        MaybeIndexSummary::Parsed(summary)
+    }
+}
+
+impl IndexSummary {
+    /// Parses a line from the registry's index file into an `IndexSummary` for
+    /// a package.
+    ///
+    /// The `line` provided is expected to be valid JSON.
+    fn parse(line: &[u8], source_id: SourceId) -> CargoResult<IndexSummary> {
+        let RegistryPackage {
+            name,
+            vers,
+            cksum,
+            deps,
+            features,
+            yanked,
+            links,
+        } = serde_json::from_slice(line)?;
+        log::trace!("json parsed registry {}/{}", name, vers);
+        let pkgid = PackageId::new(&name, &vers, source_id)?;
+        let deps = deps
+            .into_iter()
+            .map(|dep| dep.into_dep(source_id))
+            .collect::<CargoResult<Vec<_>>>()?;
+        let mut summary = Summary::new(pkgid, deps, &features, links, false)?;
+        summary.set_checksum(cksum.clone());
+        Ok(IndexSummary {
+            summary,
+            yanked: yanked.unwrap_or(false),
+            hash: cksum,
+        })
+    }
+}
diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs
index fb4bf5fcf49..9576a77ae6a 100644
--- a/src/cargo/sources/registry/mod.rs
+++ b/src/cargo/sources/registry/mod.rs
@@ -166,7 +166,7 @@ use std::path::{Path, PathBuf};
 use flate2::read::GzDecoder;
 use log::debug;
-use semver::Version;
+use semver::{Version, VersionReq};
 use serde::Deserialize;
 use tar::Archive;
@@ -358,7 +358,7 @@ pub trait RegistryData {
     fn index_path(&self) -> &Filesystem;
     fn load(
         &self,
-        _root: &Path,
+        root: &Path,
         path: &Path,
         data: &mut dyn FnMut(&[u8]) -> CargoResult<()>,
     ) -> CargoResult<()>;
@@ -548,13 +548,12 @@ impl<'cfg> RegistrySource<'cfg> {
         // After we've loaded the package configure its summary's `checksum`
         // field with the checksum we know for this `PackageId`.
-        let summaries = self
+        let req = VersionReq::exact(package.version());
+        let summary_with_cksum = self
             .index
-            .summaries(package.name().as_str(), &mut *self.ops)?;
-        let summary_with_cksum = summaries
-            .iter()
-            .map(|s| &s.0)
-            .find(|s| s.package_id() == package)
+            .summaries(package.name(), &req, &mut *self.ops)?
+            .map(|s| s.summary.clone())
+            .next()
             .expect("summary not found");
         if let Some(cksum) = summary_with_cksum.checksum() {
             pkg.manifest_mut()
diff --git a/src/cargo/sources/registry/remote.rs b/src/cargo/sources/registry/remote.rs
index 7a0dcbf9b59..650770b2e43 100644
--- a/src/cargo/sources/registry/remote.rs
+++ b/src/cargo/sources/registry/remote.rs
@@ -213,7 +213,7 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         self.prepare()?;
         self.head.set(None);
         *self.tree.borrow_mut() = None;
-        let _lock =
+        let lock =
             self.index_path
                 .open_rw(Path::new(INDEX_LOCK), self.config, "the registry index")?;
         self.config
@@ -227,6 +227,11 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
             git::fetch(repo, url, refspec, self.config)
                 .chain_err(|| format!("failed to fetch `{}`", url))?;
             self.config.updated_sources().insert(self.source_id);
+
+        // Make a write to the lock file to record the mtime on the filesystem
+        // of when the last update happened.
+        lock.file().set_len(0)?;
+        lock.file().write(&[0])?;
         Ok(())
     }

From 5217280ee33ba4e4a090a05b53c903ca99f4851e Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Fri, 26 Apr 2019 11:09:25 -0700
Subject: [PATCH 4/7] Make registry locking more coarse

This commit updates the locking strategy in Cargo to handle the recent
addition of an on-disk cache of the on-disk index. The goal here is to reduce
the overhead of locking, both cognitively when reading and performance-wise
by requiring fewer locks. Previously Cargo had a bunch of fine-grained locks
throughout the index and git repositories, but after this commit there's just
one global "package cache" lock.

This global lock now serves to basically synchronize the entire crate graph
resolution step. This shouldn't really take that long unless it's
downloading, in which case there's not a ton of benefit to running in
parallel anyway.
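The counting scheme behind that single lock can be sketched standalone (an illustrative stand-in with simplified names and the OS file lock elided; the real implementation is in `src/cargo/util/config.rs` below):

```rust
use std::cell::RefCell;

struct PackageCache {
    // Some(count) while the lock is held; the real code stores the
    // `FileLock` alongside the count.
    lock: RefCell<Option<usize>>,
}

struct Guard<'a>(&'a PackageCache);

impl PackageCache {
    fn acquire(&self) -> Guard<'_> {
        let mut slot = self.lock.borrow_mut();
        match *slot {
            // Nested acquisition: just bump the count.
            Some(ref mut cnt) => *cnt += 1,
            // First acquisition: the real code opens `.package-cache` here.
            None => *slot = Some(1),
        }
        Guard(self)
    }
}

impl Drop for Guard<'_> {
    fn drop(&mut self) {
        let mut slot = self.0.lock.borrow_mut();
        let cnt = slot.as_mut().unwrap();
        *cnt -= 1;
        if *cnt == 0 {
            *slot = None; // last guard gone: release the file lock
        }
    }
}
```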
The other intention of this single global lock is to make it much easier on
the sources to not worry so much about lock ordering or when to acquire
locks, but rather they just assert in their various operations that they're
locked.

Cargo now has a few coarse-grained locations where locks are held (for
example during resolution and during package downloading). These locks are a
bit sprinkled about, but they have in-code asserts which assert that they're
held, so we'll find bugs quickly if any lock isn't held (before a race
condition is hit, that is).
---
 src/cargo/core/package.rs                   |  5 ++
 src/cargo/ops/cargo_generate_lockfile.rs    |  5 ++
 src/cargo/ops/cargo_install.rs              |  5 ++
 src/cargo/ops/cargo_package.rs              |  4 +
 .../ops/common_for_install_and_uninstall.rs |  5 ++
 src/cargo/ops/registry.rs                   |  1 +
 src/cargo/ops/resolve.rs                    |  4 +
 src/cargo/sources/git/source.rs             | 23 ++---
 src/cargo/sources/registry/index.rs         | 45 +++------
 src/cargo/sources/registry/local.rs         | 22 +++--
 src/cargo/sources/registry/mod.rs           | 87 ++++++-----------
 src/cargo/sources/registry/remote.rs        | 59 ++++++-----
 src/cargo/util/config.rs                    | 52 +++++++++++
 src/cargo/util/flock.rs                     |  9 ++
 tests/testsuite/search.rs                   |  2 +
 15 files changed, 189 insertions(+), 139 deletions(-)

diff --git a/src/cargo/core/package.rs b/src/cargo/core/package.rs
index d5f7d89fafc..405c71e0661 100644
--- a/src/cargo/core/package.rs
+++ b/src/cargo/core/package.rs
@@ -25,6 +25,7 @@ use crate::ops;
 use crate::util::errors::{CargoResult, CargoResultExt, HttpNot200};
 use crate::util::network::Retry;
 use crate::util::{self, internal, lev_distance, Config, Progress, ProgressStyle};
+use crate::util::config::PackageCacheLock;

 /// Information about a package that is available somewhere in the file system.
 ///
@@ -339,6 +340,9 @@ pub struct Downloads<'a, 'cfg: 'a> {
     /// trigger a timeout; reset `next_speed_check` and set this back to the
     /// configured threshold.
     next_speed_check_bytes_threshold: Cell<u64>,
+    /// Global filesystem lock to ensure only one Cargo is downloading at a
+    /// time.
+    _lock: PackageCacheLock<'cfg>,
 }

 struct Download<'cfg> {
@@ -437,6 +441,7 @@ impl<'cfg> PackageSet<'cfg> {
             timeout,
             next_speed_check: Cell::new(Instant::now()),
             next_speed_check_bytes_threshold: Cell::new(0),
+            _lock: self.config.acquire_package_cache_lock()?,
         })
     }

diff --git a/src/cargo/ops/cargo_generate_lockfile.rs b/src/cargo/ops/cargo_generate_lockfile.rs
index d99c29e122b..88600ec11e8 100644
--- a/src/cargo/ops/cargo_generate_lockfile.rs
+++ b/src/cargo/ops/cargo_generate_lockfile.rs
@@ -40,6 +40,10 @@ pub fn update_lockfile(ws: &Workspace<'_>, opts: &UpdateOptions<'_>) -> CargoRes
         failure::bail!("you can't update in the offline mode");
     }

+    // Updates often require a lot of modifications to the registry, so ensure
+    // that we're synchronized against other Cargos.
+    let _lock = ws.config().acquire_package_cache_lock()?;
+
     let previous_resolve = match ops::load_pkg_lockfile(ws)?
 {
         Some(resolve) => resolve,
         None => return generate_lockfile(ws),
@@ -73,6 +77,7 @@ pub fn update_lockfile(ws: &Workspace<'_>, opts: &UpdateOptions<'_>) -> CargoRes
             });
         }
     }
+
     registry.add_sources(sources)?;
 }

diff --git a/src/cargo/ops/cargo_install.rs b/src/cargo/ops/cargo_install.rs
index 352f3712ff8..973df18b330 100644
--- a/src/cargo/ops/cargo_install.rs
+++ b/src/cargo/ops/cargo_install.rs
@@ -488,6 +488,11 @@ fn check_yanked_install(ws: &Workspace<'_>) -> CargoResult<()> {
     // wouldn't be available for `compile_ws`.
     let (pkg_set, resolve) = ops::resolve_ws_with_method(ws, Method::Everything, &specs)?;
     let mut sources = pkg_set.sources_mut();
+
+    // Checking the yanked status involves taking a look at the registry and
+    // maybe updating files, so be sure to lock it here.
+    let _lock = ws.config().acquire_package_cache_lock()?;
+
     for pkg_id in resolve.iter() {
         if let Some(source) = sources.get_mut(pkg_id.source_id()) {
             if source.is_yanked(pkg_id)? {
diff --git a/src/cargo/ops/cargo_package.rs b/src/cargo/ops/cargo_package.rs
index 87ce0816a0c..8dc39a0c7e6 100644
--- a/src/cargo/ops/cargo_package.rs
+++ b/src/cargo/ops/cargo_package.rs
@@ -548,6 +548,10 @@ fn compare_resolve(
 }

 fn check_yanked(config: &Config, pkg_set: &PackageSet<'_>, resolve: &Resolve) -> CargoResult<()> {
+    // Checking the yanked status involves taking a look at the registry and
+    // maybe updating files, so be sure to lock it here.
+    let _lock = config.acquire_package_cache_lock()?;
+
     let mut sources = pkg_set.sources_mut();
     for pkg_id in resolve.iter() {
         if let Some(source) = sources.get_mut(pkg_id.source_id()) {
diff --git a/src/cargo/ops/common_for_install_and_uninstall.rs b/src/cargo/ops/common_for_install_and_uninstall.rs
index 9eba393ec37..8502d190649 100644
--- a/src/cargo/ops/common_for_install_and_uninstall.rs
+++ b/src/cargo/ops/common_for_install_and_uninstall.rs
@@ -564,6 +564,11 @@ pub fn select_pkg<'a, T>(
 where
     T: Source + 'a,
 {
+    // This operation may involve updating some sources or making a few queries
+    // which may involve frobbing caches; as a result, make sure we synchronize
+    // with other global Cargos.
+    let _lock = config.acquire_package_cache_lock()?;
+
     if needs_update {
         source.update()?;
     }
diff --git a/src/cargo/ops/registry.rs b/src/cargo/ops/registry.rs
index 83d97121b8e..4cab5bb644f 100644
--- a/src/cargo/ops/registry.rs
+++ b/src/cargo/ops/registry.rs
@@ -353,6 +353,7 @@ fn registry(
     let token = token.or(token_config);
     let sid = get_source_id(config, index_config.or(index), registry)?;
     let api_host = {
+        let _lock = config.acquire_package_cache_lock()?;
         let mut src = RegistrySource::remote(sid, &HashSet::new(), config);
         // Only update the index if the config is not available or `force` is set.
         let cfg = src.config();
diff --git a/src/cargo/ops/resolve.rs b/src/cargo/ops/resolve.rs
index be273026caa..14da2061179 100644
--- a/src/cargo/ops/resolve.rs
+++ b/src/cargo/ops/resolve.rs
@@ -146,6 +146,10 @@ pub fn resolve_with_previous<'cfg>(
     specs: &[PackageIdSpec],
     register_patches: bool,
 ) -> CargoResult<Resolve> {
+    // We only want one Cargo at a time resolving a crate graph since this can
+    // involve a lot of frobbing of the global caches.
+    let _lock = ws.config().acquire_package_cache_lock()?;
+
     // Here we place an artificial limitation that all non-registry sources
     // cannot be locked at more than one revision.
This means that if a Git // repository provides more than one package, they must all be updated in diff --git a/src/cargo/sources/git/source.rs b/src/cargo/sources/git/source.rs index da96f6d0ce7..553f0581cf7 100644 --- a/src/cargo/sources/git/source.rs +++ b/src/cargo/sources/git/source.rs @@ -154,12 +154,9 @@ impl<'cfg> Source for GitSource<'cfg> { } fn update(&mut self) -> CargoResult<()> { - let lock = - self.config - .git_path() - .open_rw(".cargo-lock-git", self.config, "the git checkouts")?; - - let db_path = lock.parent().join("db").join(&self.ident); + let git_path = self.config.git_path(); + let git_path = self.config.assert_package_cache_locked(&git_path); + let db_path = git_path.join("db").join(&self.ident); if self.config.cli_unstable().offline && !db_path.exists() { failure::bail!( @@ -189,21 +186,17 @@ impl<'cfg> Source for GitSource<'cfg> { (self.remote.db_at(&db_path)?, actual_rev.unwrap()) }; - // Don’t use the full hash, in order to contribute less to reaching the path length limit - // on Windows. See . + // Don’t use the full hash, in order to contribute less to reaching the + // path length limit on Windows. See + // . let short_id = db.to_short_id(&actual_rev).unwrap(); - let checkout_path = lock - .parent() + let checkout_path = git_path .join("checkouts") .join(&self.ident) .join(short_id.as_str()); - // Copy the database to the checkout location. After this we could drop - // the lock on the database as we no longer needed it, but we leave it - // in scope so the destructors here won't tamper with too much. - // Checkout is immutable, so we don't need to protect it with a lock once - // it is created. + // Copy the database to the checkout location. db.copy_to(actual_rev.clone(), &checkout_path, self.config)?; let source_id = self.source_id.with_precise(Some(actual_rev.to_string())); diff --git a/src/cargo/sources/registry/index.rs b/src/cargo/sources/registry/index.rs index 3c439588c1d..57bc720aaa5 100644 --- a/src/cargo/sources/registry/index.rs +++ b/src/cargo/sources/registry/index.rs @@ -78,8 +78,7 @@ use semver::{Version, VersionReq}; use crate::core::dependency::Dependency; use crate::core::{InternedString, PackageId, SourceId, Summary}; -use crate::sources::registry::RegistryData; -use crate::sources::registry::{RegistryPackage, INDEX_LOCK}; +use crate::sources::registry::{RegistryData, RegistryPackage}; use crate::util::{internal, CargoResult, Config, Filesystem, ToSemver}; /// Crates.io treats hyphen and underscores as interchangeable, but the index and old Cargo do not. @@ -172,7 +171,6 @@ pub struct RegistryIndex<'cfg> { path: Filesystem, summaries_cache: HashMap, config: &'cfg Config, - locked: bool, } /// An internal cache of summaries for a particular package. @@ -237,14 +235,12 @@ impl<'cfg> RegistryIndex<'cfg> { source_id: SourceId, path: &Filesystem, config: &'cfg Config, - locked: bool, ) -> RegistryIndex<'cfg> { RegistryIndex { source_id, path: path.clone(), summaries_cache: HashMap::new(), config, - locked, } } @@ -314,35 +310,19 @@ impl<'cfg> RegistryIndex<'cfg> { } // Prepare the `RegistryData` which will lazily initialize internal data - // structures. Note that this is also importantly needed to initialize - // to avoid deadlocks where we acquire a lock below but the `load` - // function inside *also* wants to acquire a lock. See an instance of - // this on #5551. + // structures. load.prepare()?; - // Synchronize access to the index. 
For remote indices we want to make
-        // sure that while we're reading the index no one is trying to update
-        // it.
-        let (root, lock) = if self.locked {
-            let lock = self
-                .path
-                .open_ro(Path::new(INDEX_LOCK), self.config, "the registry index");
-            match lock {
-                Ok(lock) => (lock.path().parent().unwrap().to_path_buf(), Some(lock)),
-                Err(_) => {
-                    self.summaries_cache.insert(name, Summaries::default());
-                    return Ok(self.summaries_cache.get_mut(&name).unwrap());
-                }
-            }
-        } else {
-            (self.path.clone().into_path_unlocked(), None)
-        };
+        // let root = self.config.assert_package_cache_locked(&self.path);
+        let root = load.assert_index_locked(&self.path);
         let cache_root = root.join(".cache");

-        // TODO: comment
-        let lock_mtime = lock
-            .as_ref()
-            .and_then(|l| l.file().metadata().ok())
-            .map(|t| FileTime::from_last_modification_time(&t));
+        let lock_mtime = None;
+        // let lock_mtime = lock
+        //     .as_ref()
+        //     .and_then(|l| l.file().metadata().ok())
+        //     .map(|t| FileTime::from_last_modification_time(&t));

         // See module comment in `registry/mod.rs` for why this is structured
@@ -371,6 +351,7 @@ impl<'cfg> RegistryIndex<'cfg> {
                 path.as_ref(),
                 self.source_id,
                 load,
+                self.config,
             )?;
             if let Some(summaries) = summaries {
                 self.summaries_cache.insert(name, summaries);
@@ -503,6 +484,7 @@ impl Summaries {
         relative: &Path,
         source_id: SourceId,
         load: &mut dyn RegistryData,
+        config: &Config,
     ) -> CargoResult<Option<Summaries>> {
         // First up, attempt to load the cache. This could fail for all manner
         // of reasons, but consider all of them non-fatal and just log their
         // occurrence in case anyone is debugging anything.
@@ -579,7 +561,8 @@ impl Summaries {
         // This is opportunistic so we ignore failure here but are sure to log
         // something in case of error.
         if fs::create_dir_all(cache_path.parent().unwrap()).is_ok() {
-            // TODO: somehow need to globally synchronize this
+            let path = Filesystem::new(cache_path.clone());
+            config.assert_package_cache_locked(&path);
             if let Err(e) = fs::write(cache_path, cache_bytes) {
                 log::info!("failed to write cache: {}", e);
             }
diff --git a/src/cargo/sources/registry/local.rs b/src/cargo/sources/registry/local.rs
index ed649c9e559..8069361b987 100644
--- a/src/cargo/sources/registry/local.rs
+++ b/src/cargo/sources/registry/local.rs
@@ -1,12 +1,13 @@
-use std::io::prelude::*;
+use std::fs::File;
 use std::io::SeekFrom;
+use std::io::prelude::*;
 use std::path::Path;

 use crate::core::PackageId;
 use crate::sources::registry::{MaybeLock, RegistryConfig, RegistryData};
 use crate::util::errors::{CargoResult, CargoResultExt};
 use crate::util::paths;
-use crate::util::{Config, FileLock, Filesystem, Sha256};
+use crate::util::{Config, Filesystem, Sha256};
 use hex;

 pub struct LocalRegistry<'cfg> {
@@ -36,6 +37,12 @@ impl<'cfg> RegistryData for LocalRegistry<'cfg> {
         &self.index_path
     }

+    fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path {
+        // Note that the `*_unlocked` variant is used here since we're not
+        // modifying the index and it's required to be externally synchronized.
+        path.as_path_unlocked()
+    }
+
     fn load(
         &self,
         root: &Path,
@@ -71,7 +78,12 @@ impl<'cfg> RegistryData for LocalRegistry<'cfg> {
     fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> {
         let crate_file = format!("{}-{}.crate", pkg.name(), pkg.version());
-        let mut crate_file = self.root.open_ro(&crate_file, self.config, "crate file")?;
+
+        // Note that the usage of `into_path_unlocked` here is because the local
+        // crate files here never change in that we're not the one writing them,
+        // so it's not our responsibility to synchronize access to them.
+        let path = self.root.join(&crate_file).into_path_unlocked();
+        let mut crate_file = File::open(&path)?;

         // If we've already got an unpacked version of this crate, then skip the
         // checksum below as it is in theory already verified.
@@ -89,7 +101,7 @@ impl<'cfg> RegistryData for LocalRegistry<'cfg> {
         loop {
             let n = crate_file
                 .read(&mut buf)
-                .chain_err(|| format!("failed to read `{}`", crate_file.path().display()))?;
+                .chain_err(|| format!("failed to read `{}`", path.display()))?;
             if n == 0 {
                 break;
             }
@@ -109,7 +121,7 @@ impl<'cfg> RegistryData for LocalRegistry<'cfg> {
         _pkg: PackageId,
         _checksum: &str,
         _data: &[u8],
-    ) -> CargoResult<FileLock> {
+    ) -> CargoResult<File> {
         panic!("this source doesn't download")
     }
 }
diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs
index 9576a77ae6a..4090d59d854 100644
--- a/src/cargo/sources/registry/mod.rs
+++ b/src/cargo/sources/registry/mod.rs
@@ -161,6 +161,7 @@
 use std::borrow::Cow;
 use std::collections::BTreeMap;
 use std::collections::HashSet;
+use std::fs::{File, OpenOptions};
 use std::io::Write;
 use std::path::{Path, PathBuf};

@@ -177,9 +178,8 @@
 use crate::sources::PathSource;
 use crate::util::errors::CargoResultExt;
 use crate::util::hex;
 use crate::util::to_url::ToUrl;
-use crate::util::{internal, CargoResult, Config, FileLock, Filesystem};
+use crate::util::{internal, CargoResult, Config, Filesystem};

-const INDEX_LOCK: &str = ".cargo-index-lock";
 const PACKAGE_SOURCE_LOCK: &str = ".cargo-ok";
 pub const CRATES_IO_INDEX: &str = "https://github.com/rust-lang/crates.io-index";
 pub const CRATES_IO_REGISTRY: &str = "crates-io";
@@ -194,7 +194,6 @@ pub struct RegistrySource<'cfg> {
     ops: Box<dyn RegistryData + 'cfg>,
     index: index::RegistryIndex<'cfg>,
     yanked_whitelist: HashSet<PackageId>,
-    index_locked: bool,
 }

 #[derive(Deserialize)]
@@ -365,20 +364,17 @@ pub trait RegistryData {
     fn config(&mut self) -> CargoResult<Option<RegistryConfig>>;
     fn update_index(&mut self) -> CargoResult<()>;
     fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock>;
-    fn finish_download(
-        &mut self,
-        pkg: PackageId,
-        checksum: &str,
-        data: &[u8],
-    ) -> CargoResult<FileLock>;
+    fn finish_download(&mut self, pkg: PackageId, checksum: &str, data: &[u8])
+        -> CargoResult<File>;

     fn is_crate_downloaded(&self, _pkg: PackageId) -> bool {
         true
     }
+    fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path;
 }

 pub enum MaybeLock {
-    Ready(FileLock),
+    Ready(File),
     Download { url: String, descriptor: String },
 }

@@ -400,14 +396,7 @@ impl<'cfg> RegistrySource<'cfg> {
     ) -> RegistrySource<'cfg> {
         let name = short_name(source_id);
         let ops = remote::RemoteRegistry::new(source_id, config, &name);
-        RegistrySource::new(
-            source_id,
-            config,
-            &name,
-            Box::new(ops),
-            yanked_whitelist,
-            true,
-        )
+        RegistrySource::new(source_id, config, &name, Box::new(ops), yanked_whitelist)
     }

     pub fn local(
@@ -418,14 +407,7 @@ impl<'cfg> RegistrySource<'cfg> {
     ) -> RegistrySource<'cfg> {
         let name = short_name(source_id);
         let ops =
 local::LocalRegistry::new(path, config, &name);
-        RegistrySource::new(
-            source_id,
-            config,
-            &name,
-            Box::new(ops),
-            yanked_whitelist,
-            false,
-        )
+        RegistrySource::new(source_id, config, &name, Box::new(ops), yanked_whitelist)
     }

     fn new(
@@ -434,16 +416,14 @@ impl<'cfg> RegistrySource<'cfg> {
         name: &str,
         ops: Box<dyn RegistryData + 'cfg>,
         yanked_whitelist: &HashSet<PackageId>,
-        index_locked: bool,
     ) -> RegistrySource<'cfg> {
         RegistrySource {
             src_path: config.registry_source_path().join(name),
             config,
             source_id,
             updated: false,
-            index: index::RegistryIndex::new(source_id, ops.index_path(), config, index_locked),
+            index: index::RegistryIndex::new(source_id, ops.index_path(), config),
             yanked_whitelist: yanked_whitelist.clone(),
-            index_locked,
             ops,
         }
     }
@@ -459,36 +439,26 @@ impl<'cfg> RegistrySource<'cfg> {
     /// compiled.
     ///
     /// No action is taken if the source looks like it's already unpacked.
-    fn unpack_package(&self, pkg: PackageId, tarball: &FileLock) -> CargoResult<PathBuf> {
+    fn unpack_package(&self, pkg: PackageId, tarball: &File) -> CargoResult<PathBuf> {
         // The `.cargo-ok` file is used to track if the source is already
-        // unpacked and to lock the directory for unpacking.
-        let mut ok = {
-            let package_dir = format!("{}-{}", pkg.name(), pkg.version());
-            let dst = self.src_path.join(&package_dir);
-            dst.create_dir()?;
-
-            // Attempt to open a read-only copy first to avoid an exclusive write
-            // lock and also work with read-only filesystems. If the file has
-            // any data, assume the source is already unpacked.
-            if let Ok(ok) = dst.open_ro(PACKAGE_SOURCE_LOCK, self.config, &package_dir) {
-                let meta = ok.file().metadata()?;
-                if meta.len() > 0 {
-                    let unpack_dir = ok.parent().to_path_buf();
-                    return Ok(unpack_dir);
-                }
-            }
-
-            dst.open_rw(PACKAGE_SOURCE_LOCK, self.config, &package_dir)?
-        };
-        let unpack_dir = ok.parent().to_path_buf();
-
-        // If the file has any data, assume the source is already unpacked.
-        let meta = ok.file().metadata()?;
+        // unpacked.
+        let package_dir = format!("{}-{}", pkg.name(), pkg.version());
+        let dst = self.src_path.join(&package_dir);
+        dst.create_dir()?;
+        let path = dst.join(PACKAGE_SOURCE_LOCK);
+        let path = self.config.assert_package_cache_locked(&path);
+        let unpack_dir = path.parent().unwrap();
+        let mut ok = OpenOptions::new()
+            .create(true)
+            .read(true)
+            .write(true)
+            .open(&path)?;
+        let meta = ok.metadata()?;
         if meta.len() > 0 {
-            return Ok(unpack_dir);
+            return Ok(unpack_dir.to_path_buf());
         }

-        let gz = GzDecoder::new(tarball.file());
+        let gz = GzDecoder::new(tarball);
         let mut tar = Archive::new(gz);
         let prefix = unpack_dir.file_name().unwrap();
         let parent = unpack_dir.parent().unwrap();
@@ -523,19 +493,18 @@ impl<'cfg> RegistrySource<'cfg> {
         // Write to the lock file to indicate that unpacking was successful.
         write!(ok, "ok")?;

-        Ok(unpack_dir)
+        Ok(unpack_dir.to_path_buf())
     }

     fn do_update(&mut self) -> CargoResult<()> {
         self.ops.update_index()?;
         let path = self.ops.index_path();
-        self.index =
-            index::RegistryIndex::new(self.source_id, path, self.config, self.index_locked);
+        self.index = index::RegistryIndex::new(self.source_id, path, self.config);
         self.updated = true;
         Ok(())
     }

-    fn get_pkg(&mut self, package: PackageId, path: &FileLock) -> CargoResult<Package> {
+    fn get_pkg(&mut self, package: PackageId, path: &File) -> CargoResult<Package> {
         let path = self
             .unpack_package(package, path)
             .chain_err(|| internal(format!("failed to unpack package `{}`", package)))?;
diff --git a/src/cargo/sources/registry/remote.rs b/src/cargo/sources/registry/remote.rs
index 650770b2e43..f2bb4374b60 100644
--- a/src/cargo/sources/registry/remote.rs
+++ b/src/cargo/sources/registry/remote.rs
@@ -1,5 +1,6 @@
 use std::cell::{Cell, Ref, RefCell};
 use std::fmt::Write as FmtWrite;
+use std::fs::{self, File, OpenOptions};
 use std::io::prelude::*;
 use std::io::SeekFrom;
 use std::mem;
@@ -12,12 +13,9 @@ use log::{debug, trace};
 use crate::core::{PackageId, SourceId};
 use crate::sources::git;
 use crate::sources::registry::MaybeLock;
-use crate::sources::registry::{
-    RegistryConfig, RegistryData, CRATE_TEMPLATE, INDEX_LOCK, VERSION_TEMPLATE,
-};
+use crate::sources::registry::{RegistryConfig, RegistryData, CRATE_TEMPLATE, VERSION_TEMPLATE};
 use crate::util::errors::{CargoResult, CargoResultExt};
-use crate::util::{Config, Sha256};
-use crate::util::{FileLock, Filesystem};
+use crate::util::{Config, Filesystem, Sha256};

 pub struct RemoteRegistry<'cfg> {
     index_path: Filesystem,
@@ -44,7 +42,7 @@ impl<'cfg> RemoteRegistry<'cfg> {
     fn repo(&self) -> CargoResult<&git2::Repository> {
         self.repo.try_borrow_with(|| {
-            let path = self.index_path.clone().into_path_unlocked();
+            let path = self.config.assert_package_cache_locked(&self.index_path);

             // Fast path without a lock
             if let Ok(repo) = git2::Repository::open(&path) {
@@ -54,15 +52,11 @@
             // Ok, now we need to lock and try the whole thing over again.
             trace!("acquiring registry index lock");
-            let lock = self.index_path.open_rw(
-                Path::new(INDEX_LOCK),
-                self.config,
-                "the registry index",
-            )?;
             match git2::Repository::open(&path) {
                 Ok(repo) => Ok(repo),
                 Err(_) => {
-                    let _ = lock.remove_siblings();
+                    drop(fs::remove_dir_all(&path));
+                    fs::create_dir_all(&path)?;

                     // Note that we'd actually prefer to use a bare repository
                     // here as we're not actually going to check anything out.
@@ -139,6 +133,10 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         &self.index_path
     }

+    fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path {
+        self.config.assert_package_cache_locked(path)
+    }
+
     fn load(
         &self,
         _root: &Path,
@@ -162,9 +160,7 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
     fn config(&mut self) -> CargoResult<Option<RegistryConfig>> {
         debug!("loading config");
         self.prepare()?;
-        let _lock =
-            self.index_path
-                .open_ro(Path::new(INDEX_LOCK), self.config, "the registry index")?;
+        self.config.assert_package_cache_locked(&self.index_path);
         let mut config = None;
         self.load(Path::new(""), Path::new("config.json"), &mut |json| {
             config = Some(serde_json::from_slice(json)?);
@@ -213,9 +209,7 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         self.prepare()?;
         self.head.set(None);
         *self.tree.borrow_mut() = None;
-        let lock =
-            self.index_path
-                .open_rw(Path::new(INDEX_LOCK), self.config, "the registry index")?;
+        self.config.assert_package_cache_locked(&self.index_path);
         self.config
             .shell()
             .status("Updating", self.source_id.display_index())?;
@@ -228,10 +222,6 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
             git::fetch(repo, url, refspec, self.config)
                 .chain_err(|| format!("failed to fetch `{}`", url))?;
             self.config.updated_sources().insert(self.source_id);
-
-        // Make a write to the lock file to record the mtime on the filesystem
-        // of when the last update happened.
-        lock.file().set_len(0)?;
-        lock.file().write(&[0])?;
         Ok(())
     }

@@ -244,8 +234,10 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         //
         // If this fails then we fall through to the exclusive path where we may
         // have to redownload the file.
-        if let Ok(dst) = self.cache_path.open_ro(&filename, self.config, &filename) {
-            let meta = dst.file().metadata()?;
+        let path = self.cache_path.join(&filename);
+        let path = self.config.assert_package_cache_locked(&path);
+        if let Ok(dst) = File::open(&path) {
+            let meta = dst.metadata()?;
             if meta.len() > 0 {
                 return Ok(MaybeLock::Ready(dst));
             }
@@ -271,7 +263,7 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         pkg: PackageId,
         checksum: &str,
         data: &[u8],
-    ) -> CargoResult<FileLock> {
+    ) -> CargoResult<File> {
         // Verify what we just downloaded
         let mut state = Sha256::new();
         state.update(data);
@@ -280,8 +272,15 @@
         }

         let filename = self.filename(pkg);
-        let mut dst = self.cache_path.open_rw(&filename, self.config, &filename)?;
-        let meta = dst.file().metadata()?;
+        self.cache_path.create_dir()?;
+        let path = self.cache_path.join(&filename);
+        let path = self.config.assert_package_cache_locked(&path);
+        let mut dst = OpenOptions::new()
+            .create(true)
+            .read(true)
+            .write(true)
+            .open(&path)?;
+        let meta = dst.metadata()?;
         if meta.len() > 0 {
             return Ok(dst);
         }
@@ -295,8 +294,10 @@
         let filename = format!("{}-{}.crate", pkg.name(), pkg.version());
         let path = Path::new(&filename);

-        if let Ok(dst) = self.cache_path.open_ro(path, self.config, &filename) {
-            if let Ok(meta) = dst.file().metadata() {
+        let path = self.cache_path.join(path);
+        let path = self.config.assert_package_cache_locked(&path);
+        if let Ok(dst) = File::open(path) {
+            if let Ok(meta) = dst.metadata() {
                 return meta.len() > 0;
             }
         }
diff --git a/src/cargo/util/config.rs b/src/cargo/util/config.rs
index b2014dda8d3..5ed8f5140c1 100644
--- a/src/cargo/util/config.rs
+++ b/src/cargo/util/config.rs
@@ -29,8 +29,13 @@
 use crate::util::errors::{self, internal, CargoResult, CargoResultExt};
 use crate::util::toml as cargo_toml;
 use crate::util::Filesystem;
 use crate::util::Rustc;
+<<<<<<< HEAD
 use crate::util::{paths, validate_package_name};
 use crate::util::{ToUrl, ToUrlWithBase};
+=======
+use crate::util::{ToUrl, ToUrlWithBase};
+use crate::util::{paths, validate_package_name, FileLock};
+>>>>>>> Make registry locking more coarse
 
 /// Configuration information for cargo. This is not specific to a build, it is information
 /// relating to cargo itself.
@@ -76,6 +81,9 @@ pub struct Config {
     profiles: LazyCell<ConfigProfiles>,
     /// Tracks which sources have been updated to avoid multiple updates.
     updated_sources: LazyCell<RefCell<HashSet<SourceId>>>,
+    /// Lock, if held, of the global package cache along with the number of
+    /// acquisitions so far.
+    package_cache_lock: RefCell<Option<(FileLock, usize)>>,
 }
 
 impl Config {
@@ -132,6 +140,7 @@ impl Config {
             env,
             profiles: LazyCell::new(),
             updated_sources: LazyCell::new(),
+            package_cache_lock: RefCell::new(None),
         }
     }
 
@@ -828,6 +837,36 @@ impl Config {
         };
         T::deserialize(d).map_err(|e| e.into())
     }
+
+    pub fn assert_package_cache_locked<'a>(&self, f: &'a Filesystem) -> &'a Path {
+        let ret = f.as_path_unlocked();
+        assert!(
+            self.package_cache_lock.borrow().is_some(),
+            "package cache lock is not currently held, Cargo forgot to call \
+             `acquire_package_cache_lock` before we got to this stack frame",
+        );
+        assert!(ret.starts_with(self.home_path.as_path_unlocked()));
+        return ret;
+    }
+
+    pub fn acquire_package_cache_lock<'a>(&'a self) -> CargoResult<PackageCacheLock<'a>> {
+        let mut slot = self.package_cache_lock.borrow_mut();
+        match *slot {
+            Some((_, ref mut cnt)) => {
+                *cnt += 1;
+            }
+            None => {
+                let lock = self
+                    .home_path
+                    .open_rw(".package-cache", self, "package cache lock")
+                    .chain_err(|| "failed to acquire package cache lock")?;
+                *slot = Some((lock, 1));
+            }
+        }
+        Ok(PackageCacheLock(self))
+    }
+
+    pub fn release_package_cache_lock(&self) {}
 }
 
 /// A segment of a config key.
@@ -1664,3 +1703,16 @@ pub fn save_credentials(cfg: &Config, token: String, registry: Option<String>)
         Ok(())
     }
 }
+
+pub struct PackageCacheLock<'a>(&'a Config);
+
+impl Drop for PackageCacheLock<'_> {
+    fn drop(&mut self) {
+        let mut slot = self.0.package_cache_lock.borrow_mut();
+        let (_, cnt) = slot.as_mut().unwrap();
+        *cnt -= 1;
+        if *cnt == 0 {
+            *slot = None;
+        }
+    }
+}
diff --git a/src/cargo/util/flock.rs b/src/cargo/util/flock.rs
index e49a4daaa38..96458bdf356 100644
--- a/src/cargo/util/flock.rs
+++ b/src/cargo/util/flock.rs
@@ -12,6 +12,7 @@ use crate::util::errors::{CargoResult, CargoResultExt};
 use crate::util::paths;
 use crate::util::Config;
 
+#[derive(Debug)]
 pub struct FileLock {
     f: Option<File>,
     path: PathBuf,
@@ -136,6 +137,14 @@ impl Filesystem {
         self.root
     }
 
+    /// Returns the underlying `Path`.
+    ///
+    /// Note that this is a relatively dangerous operation and should be used
+    /// with great caution!
+    pub fn as_path_unlocked(&self) -> &Path {
+        &self.root
+    }
+
     /// Creates the directory pointed to by this filesystem.
     ///
     /// Handles errors where other Cargo processes are also attempting to
diff --git a/tests/testsuite/search.rs b/tests/testsuite/search.rs
index cb6f2174898..466e182e5cb 100644
--- a/tests/testsuite/search.rs
+++ b/tests/testsuite/search.rs
@@ -108,8 +108,10 @@ fn not_update() {
     let sid = SourceId::for_registry(&registry_url()).unwrap();
     let cfg = Config::new(Shell::new(), paths::root(), paths::home().join(".cargo"));
+    let lock = cfg.acquire_package_cache_lock().unwrap();
     let mut regsrc = RegistrySource::remote(sid, &HashSet::new(), &cfg);
     regsrc.update().unwrap();
+    drop(lock);
 
     cargo_process("search postgres")
         .with_stdout_contains("hoare = \"0.1.1\" # Design by contract style assertions for Rust")

From 783f22bf8c6ddd62d3c5429b44a382b5a7aaa77f Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Fri, 26 Apr 2019 11:49:12 -0700
Subject: [PATCH 5/7] Thread through last update time to index cache

The last-update tracking removed in the previous commit is reinstated
here as a dedicated mechanism: a `.last-updated` file in the index
directory whose mtime records when the index was last fetched.
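As a rough illustration of the mechanism (a standalone sketch, not the
code in this patch; `record_update` and `last_update` are hypothetical
names):

    use filetime::FileTime;
    use std::fs::{self, File};
    use std::io;
    use std::path::Path;

    const LAST_UPDATED_FILE: &str = ".last-updated";

    // Touch a marker file once the index has been fetched successfully;
    // its mtime then records when the last update happened.
    fn record_update(index_path: &Path) -> io::Result<()> {
        File::create(index_path.join(LAST_UPDATED_FILE)).map(|_| ())
    }

    // Read the marker's mtime back later; any cache file older than this
    // is considered out of date.
    fn last_update(index_path: &Path) -> Option<FileTime> {
        let meta = fs::metadata(index_path.join(LAST_UPDATED_FILE)).ok()?;
        Some(FileTime::from_last_modification_time(&meta))
    }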
---
 src/cargo/sources/registry/index.rs  | 24 +++++++---------
 src/cargo/sources/registry/local.rs  | 14 ++++++----
 src/cargo/sources/registry/mod.rs    |  2 ++
 src/cargo/sources/registry/remote.rs | 41 ++++++++++++++++++++--------
 tests/testsuite/concurrent.rs        |  4 +--
 5 files changed, 53 insertions(+), 32 deletions(-)

diff --git a/src/cargo/sources/registry/index.rs b/src/cargo/sources/registry/index.rs
index 57bc720aaa5..6a02a582146 100644
--- a/src/cargo/sources/registry/index.rs
+++ b/src/cargo/sources/registry/index.rs
@@ -316,14 +316,7 @@ impl<'cfg> RegistryIndex<'cfg> {
         // let root = self.config.assert_package_cache_locked(&self.path);
         let root = load.assert_index_locked(&self.path);
         let cache_root = root.join(".cache");
-
-        // TODO: comment
-        let lock_mtime = None;
-        // let lock_mtime = lock
-        //     .as_ref()
-        //     .and_then(|l| l.file().metadata().ok())
-        //     .map(|t| FileTime::from_last_modification_time(&t));
-
+        let last_index_update = load.last_modified();
 
         // See module comment in `registry/mod.rs` for why this is structured
         // the way it is.
@@ -345,7 +338,7 @@ impl<'cfg> RegistryIndex<'cfg> {
         // along the way produce helpful "did you mean?" suggestions.
         for path in UncanonicalizedIter::new(&raw_path).take(1024) {
             let summaries = Summaries::parse(
-                lock_mtime,
+                last_index_update,
                 &root,
                 &cache_root,
                 path.as_ref(),
@@ -465,7 +458,7 @@ impl Summaries {
     /// for `relative` from the underlying index (aka typically libgit2 with
     /// crates.io) and then parse everything in there.
     ///
-    /// * `lock_mtime` - this is a file modification time where if any cache
+    /// * `last_index_update` - this is a file modification time where if any cache
    ///   file is older than this the cache should be considered out of date and
    ///   needs to be rebuilt.
    /// * `root` - this is the root argument passed to `load`
@@ -478,7 +471,7 @@
     /// * `load` - the actual index implementation which may be very slow to
     ///   call. We avoid this if we can.
     pub fn parse(
-        lock_mtime: Option<FileTime>,
+        last_index_update: Option<FileTime>,
         root: &Path,
         cache_root: &Path,
         relative: &Path,
@@ -490,12 +483,12 @@
         // of reasons, but consider all of them non-fatal and just log their
         // occurrence in case anyone is debugging anything.
         let cache_path = cache_root.join(relative);
-        if let Some(lock_mtime) = lock_mtime {
+        if let Some(last_index_update) = last_index_update {
             match File::open(&cache_path) {
                 Ok(file) => {
                     let metadata = file.metadata()?;
                     let cache_mtime = FileTime::from_last_modification_time(&metadata);
-                    if cache_mtime > lock_mtime {
+                    if cache_mtime > last_index_update {
                         log::debug!("cache for {:?} is fresh", relative);
                         match Summaries::parse_cache(&file, &metadata) {
                             Ok(s) => return Ok(Some(s)),
                             Err(e) => {
                                 log::debug!("failed to parse {:?} cache: {}", relative, e);
@@ -560,7 +553,10 @@ impl Summaries {
         //
         // This is opportunistic so we ignore failure here but are sure to log
         // something in case of error.
-        if fs::create_dir_all(cache_path.parent().unwrap()).is_ok() {
+        //
+        // Note that we also skip this when `last_index_update` is `None` because it
+        // means we can't handle the cache anyway.
+        if last_index_update.is_some() && fs::create_dir_all(cache_path.parent().unwrap()).is_ok() {
             let path = Filesystem::new(cache_path.clone());
             config.assert_package_cache_locked(&path);
             if let Err(e) = fs::write(cache_path, cache_bytes) {
diff --git a/src/cargo/sources/registry/local.rs b/src/cargo/sources/registry/local.rs
index 8069361b987..cfa3e0686fc 100644
--- a/src/cargo/sources/registry/local.rs
+++ b/src/cargo/sources/registry/local.rs
@@ -1,14 +1,14 @@
-use std::fs::File;
-use std::io::SeekFrom;
-use std::io::prelude::*;
-use std::path::Path;
-
 use crate::core::PackageId;
 use crate::sources::registry::{MaybeLock, RegistryConfig, RegistryData};
 use crate::util::errors::{CargoResult, CargoResultExt};
 use crate::util::paths;
 use crate::util::{Config, Filesystem, Sha256};
+use filetime::FileTime;
 use hex;
+use std::fs::File;
+use std::io::prelude::*;
+use std::io::SeekFrom;
+use std::path::Path;
 
 pub struct LocalRegistry<'cfg> {
     index_path: Filesystem,
@@ -43,6 +43,10 @@ impl<'cfg> RegistryData for LocalRegistry<'cfg> {
         path.as_path_unlocked()
     }
 
+    fn last_modified(&self) -> Option<FileTime> {
+        None
+    }
+
     fn load(
         &self,
         root: &Path,
diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs
index 4090d59d854..3f4b2fe4d52 100644
--- a/src/cargo/sources/registry/mod.rs
+++ b/src/cargo/sources/registry/mod.rs
@@ -165,6 +165,7 @@ use std::fs::{File, OpenOptions};
 use std::io::Write;
 use std::path::{Path, PathBuf};
 
+use filetime::FileTime;
 use flate2::read::GzDecoder;
 use log::debug;
 use semver::{Version, VersionReq};
@@ -371,6 +372,7 @@ pub trait RegistryData {
         true
     }
     fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path;
+    fn last_modified(&self) -> Option<FileTime>;
 }
 
 pub enum MaybeLock {
diff --git a/src/cargo/sources/registry/remote.rs b/src/cargo/sources/registry/remote.rs
index f2bb4374b60..5b6b7bdeda8 100644
--- a/src/cargo/sources/registry/remote.rs
+++ b/src/cargo/sources/registry/remote.rs
@@ -1,3 +1,13 @@
+use crate::core::{PackageId, SourceId};
+use crate::sources::git;
+use crate::sources::registry::MaybeLock;
+use crate::sources::registry::{RegistryConfig, RegistryData, CRATE_TEMPLATE, VERSION_TEMPLATE};
+use crate::util::errors::{CargoResult, CargoResultExt};
+use crate::util::{Config, Filesystem, Sha256};
+use crate::util::paths;
+use filetime::FileTime;
+use lazycell::LazyCell;
+use log::{debug, trace};
 use std::cell::{Cell, Ref, RefCell};
 use std::fmt::Write as FmtWrite;
 use std::fs::{self, File, OpenOptions};
@@ -7,16 +17,6 @@ use std::mem;
 use std::path::Path;
 use std::str;
 
-use lazycell::LazyCell;
-use log::{debug, trace};
-
-use crate::core::{PackageId, SourceId};
-use crate::sources::git;
-use crate::sources::registry::MaybeLock;
-use crate::sources::registry::{RegistryConfig, RegistryData, CRATE_TEMPLATE, VERSION_TEMPLATE};
-use crate::util::errors::{CargoResult, CargoResultExt};
-use crate::util::{Config, Filesystem, Sha256};
-
 pub struct RemoteRegistry<'cfg> {
     index_path: Filesystem,
     cache_path: Filesystem,
@@ -25,6 +25,7 @@ pub struct RemoteRegistry<'cfg> {
     tree: RefCell<Option<git2::Tree<'static>>>,
     repo: LazyCell<git2::Repository>,
     head: Cell<Option<git2::Oid>>,
+    last_updated: Cell<Option<FileTime>>,
 }
 
 impl<'cfg> RemoteRegistry<'cfg> {
@@ -37,6 +38,7 @@ impl<'cfg> RemoteRegistry<'cfg> {
             tree: RefCell::new(None),
             repo: LazyCell::new(),
             head: Cell::new(None),
+            last_updated: Cell::new(None),
         }
     }
 
@@ -123,6 +125,8 @@ impl<'cfg> RemoteRegistry<'cfg> {
     }
 }
 
+const LAST_UPDATED_FILE: &str = ".last-updated";
+
 impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
     fn prepare(&self) -> CargoResult<()> {
         self.repo()?; // create intermediate dirs and initialize the repo
@@ -137,6 +141,16 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         self.config.assert_package_cache_locked(path)
     }
 
+    fn last_modified(&self) -> Option<FileTime> {
+        if let Some(time) = self.last_updated.get() {
+            return Some(time);
+        }
+        let path = self.config.assert_package_cache_locked(&self.index_path);
+        let mtime = paths::mtime(&path.join(LAST_UPDATED_FILE)).ok();
+        self.last_updated.set(mtime);
+        self.last_updated.get()
+    }
+
     fn load(
         &self,
         _root: &Path,
@@ -209,7 +223,8 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         self.prepare()?;
         self.head.set(None);
         *self.tree.borrow_mut() = None;
-        self.config.assert_package_cache_locked(&self.index_path);
+        self.last_updated.set(None);
+        let path = self.config.assert_package_cache_locked(&self.index_path);
         self.config
             .shell()
             .status("Updating", self.source_id.display_index())?;
@@ -228,6 +237,10 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
             .chain_err(|| format!("failed to fetch `{}`", url))?;
         self.config.updated_sources().insert(self.source_id);
 
+        // Create a dummy file to record the mtime for when we updated the
+        // index.
+        File::create(&path.join(LAST_UPDATED_FILE))?;
+
         Ok(())
     }
 
diff --git a/tests/testsuite/concurrent.rs b/tests/testsuite/concurrent.rs
index 0368f48f739..64404038157 100644
--- a/tests/testsuite/concurrent.rs
+++ b/tests/testsuite/concurrent.rs
@@ -453,7 +453,7 @@ fn debug_release_ok() {
     let a = a.join().unwrap();
 
     execs()
-        .with_stderr(
+        .with_stderr_contains(
             "\
 [COMPILING] foo v0.0.1 [..]
 [FINISHED] dev [unoptimized + debuginfo] target(s) in [..]
         )
         .run_output(&a);
     execs()
-        .with_stderr(
+        .with_stderr_contains(
             "\
 [COMPILING] foo v0.0.1 [..]
 [FINISHED] release [optimized] target(s) in [..]

From 1daff03f1fdf997e017793e9b477d0f4f7ccc167 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Tue, 30 Apr 2019 14:23:04 -0700
Subject: [PATCH 6/7] Avoid using mtime information for reusing cache files

Using mtime information is pretty finicky across platforms, so instead
take a different strategy where we embed the sha that a cache file was
generated from into the cache file itself. If the registry's sha has
changed then we regenerate the cache file, otherwise we can reuse the
cache file. This should make cache file generation more robust (any
command can generate a cache file to be used at any time) as well as
work better across platforms (no issues with coarse mtime systems and
the like).
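Assuming the cache layout described below (a leading format-version byte
followed by a NUL-terminated version string), the freshness check then
reduces to a byte comparison; `cache_is_fresh` here is a hypothetical
helper sketching the idea, not code from this patch:

    // Returns whether a cache file written by a previous invocation can
    // be reused, given the current sha of the registry index.
    fn cache_is_fresh(cache: &[u8], index_version: &str) -> bool {
        // Skip the cache-format version byte, then compare the embedded
        // NUL-terminated version string against the current index version.
        match cache.split_first() {
            Some((_format_version, rest)) => rest
                .split(|byte| *byte == 0)
                .next()
                .map_or(false, |embedded| embedded == index_version.as_bytes()),
            None => false,
        }
    }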
---
 src/cargo/sources/registry/index.rs  | 85 +++++++++++++++------------
 src/cargo/sources/registry/local.rs  |  5 +-
 src/cargo/sources/registry/mod.rs    |  5 +-
 src/cargo/sources/registry/remote.rs | 23 ++++----
 4 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/src/cargo/sources/registry/index.rs b/src/cargo/sources/registry/index.rs
index 6a02a582146..679f03916a8 100644
--- a/src/cargo/sources/registry/index.rs
+++ b/src/cargo/sources/registry/index.rs
@@ -67,12 +67,10 @@
 //! hopefully those are more obvious inline in the code itself.
 
 use std::collections::{HashMap, HashSet};
-use std::fs::{self, File};
-use std::io::Read;
+use std::fs;
 use std::path::Path;
 use std::str;
 
-use filetime::FileTime;
 use log::info;
 use semver::{Version, VersionReq};
 
@@ -316,7 +314,7 @@ impl<'cfg> RegistryIndex<'cfg> {
         // let root = self.config.assert_package_cache_locked(&self.path);
         let root = load.assert_index_locked(&self.path);
         let cache_root = root.join(".cache");
-        let last_index_update = load.last_modified();
+        let index_version = load.current_version();
 
         // See module comment in `registry/mod.rs` for why this is structured
         // the way it is.
@@ -338,7 +336,7 @@ impl<'cfg> RegistryIndex<'cfg> {
         // along the way produce helpful "did you mean?" suggestions.
         for path in UncanonicalizedIter::new(&raw_path).take(1024) {
             let summaries = Summaries::parse(
-                last_index_update,
+                index_version.as_ref().map(|s| &**s),
                 &root,
                 &cache_root,
                 path.as_ref(),
@@ -471,7 +469,7 @@ impl Summaries {
     /// * `load` - the actual index implementation which may be very slow to
     ///   call. We avoid this if we can.
     pub fn parse(
-        last_index_update: Option<FileTime>,
+        index_version: Option<&str>,
         root: &Path,
         cache_root: &Path,
         relative: &Path,
@@ -483,24 +481,18 @@ impl Summaries {
         // of reasons, but consider all of them non-fatal and just log their
         // occurrence in case anyone is debugging anything.
         let cache_path = cache_root.join(relative);
-        if let Some(last_index_update) = last_index_update {
-            match File::open(&cache_path) {
-                Ok(file) => {
-                    let metadata = file.metadata()?;
-                    let cache_mtime = FileTime::from_last_modification_time(&metadata);
-                    if cache_mtime > last_index_update {
-                        log::debug!("cache for {:?} is fresh", relative);
-                        match Summaries::parse_cache(&file, &metadata) {
-                            Ok(s) => return Ok(Some(s)),
-                            Err(e) => {
-                                log::debug!("failed to parse {:?} cache: {}", relative, e);
-                            }
-                        }
-                    } else {
-                        log::debug!("cache for {:?} is out of date", relative);
+        if let Some(index_version) = index_version {
+            match fs::read(&cache_path) {
+                Ok(contents) => match Summaries::parse_cache(contents, index_version) {
+                    Ok(s) => {
+                        log::debug!("fast path for registry cache of {:?}", relative);
+                        return Ok(Some(s))
                     }
-                }
-                Err(e) => log::debug!("cache for {:?} error: {}", relative, e),
+                    Err(e) => {
+                        log::debug!("failed to parse {:?} cache: {}", relative, e);
+                    }
+                },
+                Err(e) => log::debug!("cache missing for {:?} error: {}", relative, e),
             }
         }
@@ -510,7 +502,7 @@ impl Summaries {
         log::debug!("slow path for {:?}", relative);
         let mut ret = Summaries::default();
         let mut hit_closure = false;
-        let mut cache_bytes = Vec::new();
+        let mut cache_bytes = None;
         let err = load.load(root, relative, &mut |contents| {
             ret.raw_data = contents.to_vec();
             let mut cache = SummariesCache::default();
@@ -535,7 +527,9 @@ impl Summaries {
             ret.versions.insert(version, summary.into());
             start = end + 1;
         }
-        cache_bytes = cache.serialize();
+        if let Some(index_version) = index_version {
+            cache_bytes = Some(cache.serialize(index_version));
+        }
         Ok(())
     });
 
@@ -553,14 +547,13 @@ impl Summaries {
         //
         // This is opportunistic so we ignore failure here but are sure to log
         // something in case of error.
-        //
-        // Note that we also skip this when `last_index_update` is `None` because it
-        // means we can't handle the cache anyway.
-        if last_index_update.is_some() && fs::create_dir_all(cache_path.parent().unwrap()).is_ok() {
-            let path = Filesystem::new(cache_path.clone());
-            config.assert_package_cache_locked(&path);
-            if let Err(e) = fs::write(cache_path, cache_bytes) {
-                log::info!("failed to write cache: {}", e);
+        if let Some(cache_bytes) = cache_bytes {
+            if fs::create_dir_all(cache_path.parent().unwrap()).is_ok() {
+                let path = Filesystem::new(cache_path.clone());
+                config.assert_package_cache_locked(&path);
+                if let Err(e) = fs::write(cache_path, cache_bytes) {
+                    log::info!("failed to write cache: {}", e);
+                }
             }
         }
         Ok(Some(ret))
 
     /// Parses an open `File` which represents information previously cached by
     /// Cargo.
-    pub fn parse_cache(mut file: &File, meta: &fs::Metadata) -> CargoResult<Summaries> {
-        let mut contents = Vec::new();
-        contents.reserve(meta.len() as usize + 1);
-        file.read_to_end(&mut contents)?;
-        let cache = SummariesCache::parse(&contents)?;
+    pub fn parse_cache(contents: Vec<u8>, last_index_update: &str) -> CargoResult<Summaries> {
+        let cache = SummariesCache::parse(&contents, last_index_update)?;
         let mut ret = Summaries::default();
         for (version, summary) in cache.versions {
             let (start, end) = subslice_bounds(&contents, summary);
@@ -614,7 +604,7 @@ impl Summaries {
 const CURRENT_CACHE_VERSION: u8 = 1;
 
 impl<'a> SummariesCache<'a> {
-    fn parse(data: &'a [u8]) -> CargoResult<SummariesCache<'a>> {
+    fn parse(data: &'a [u8], last_index_update: &str) -> CargoResult<SummariesCache<'a>> {
         // NB: keep this method in sync with `serialize` below
         let (first_byte, rest) = data
             .split_first()
@@ -624,6 +614,19 @@ impl<'a> SummariesCache<'a> {
         }
         let mut iter = memchr::Memchr::new(0, rest);
         let mut start = 0;
+        if let Some(end) = iter.next() {
+            let update = &rest[start..end];
+            if update != last_index_update.as_bytes() {
+                failure::bail!(
+                    "cache out of date: current index ({}) != cache ({})",
+                    last_index_update,
+                    str::from_utf8(update)?,
+                )
+            }
+            start = end + 1;
+        } else {
+            failure::bail!("malformed file");
+        }
         let mut ret = SummariesCache::default();
         while let Some(version_end) = iter.next() {
             let version = &rest[start..version_end];
@@ -637,7 +640,7 @@ impl<'a> SummariesCache<'a> {
         Ok(ret)
     }
 
-    fn serialize(&self) -> Vec<u8> {
+    fn serialize(&self, index_version: &str) -> Vec<u8> {
         // NB: keep this method in sync with `parse` above
         let size = self
             .versions
@@ -646,6 +649,8 @@ impl<'a> SummariesCache<'a> {
             .sum();
         let mut contents = Vec::with_capacity(size);
         contents.push(CURRENT_CACHE_VERSION);
+        contents.extend_from_slice(index_version.as_bytes());
+        contents.push(0);
         for (version, data) in self.versions.iter() {
             contents.extend_from_slice(version.to_string().as_bytes());
             contents.push(0);
diff --git a/src/cargo/sources/registry/local.rs b/src/cargo/sources/registry/local.rs
index cfa3e0686fc..a9c16a0ea3b 100644
--- a/src/cargo/sources/registry/local.rs
+++ b/src/cargo/sources/registry/local.rs
@@ -1,9 +1,8 @@
-use crate::core::PackageId;
+use crate::core::{PackageId, InternedString};
 use crate::sources::registry::{MaybeLock, RegistryConfig, RegistryData};
 use crate::util::errors::{CargoResult, CargoResultExt};
 use crate::util::paths;
 use crate::util::{Config, Filesystem, Sha256};
-use filetime::FileTime;
 use hex;
 use std::fs::File;
 use std::io::prelude::*;
@@ -43,7 +42,7 @@ impl<'cfg> RegistryData for LocalRegistry<'cfg> {
         path.as_path_unlocked()
     }
 
-    fn last_modified(&self) -> Option<FileTime> {
+    fn current_version(&self) -> Option<InternedString> {
         None
     }
 
diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs
index 3f4b2fe4d52..cffdc943cbf 100644
--- a/src/cargo/sources/registry/mod.rs
+++ b/src/cargo/sources/registry/mod.rs
@@ -165,7 +165,6 @@ use std::fs::{File, OpenOptions};
 use std::io::Write;
 use std::path::{Path, PathBuf};
 
-use filetime::FileTime;
 use flate2::read::GzDecoder;
 use log::debug;
 use semver::{Version, VersionReq};
@@ -174,7 +173,7 @@ use tar::Archive;
 
 use crate::core::dependency::{Dependency, Kind};
 use crate::core::source::MaybePackage;
-use crate::core::{Package, PackageId, Source, SourceId, Summary};
+use crate::core::{Package, PackageId, Source, SourceId, Summary, InternedString};
 use crate::sources::PathSource;
 use crate::util::errors::CargoResultExt;
 use crate::util::hex;
@@ -372,7 +371,7 @@ pub trait RegistryData {
         true
     }
     fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path;
-    fn last_modified(&self) -> Option<FileTime>;
+    fn current_version(&self) -> Option<InternedString>;
 }
 
 pub enum MaybeLock {
diff --git a/src/cargo/sources/registry/remote.rs b/src/cargo/sources/registry/remote.rs
index 5b6b7bdeda8..a5c580c336f 100644
--- a/src/cargo/sources/registry/remote.rs
+++ b/src/cargo/sources/registry/remote.rs
@@ -1,11 +1,9 @@
-use crate::core::{PackageId, SourceId};
+use crate::core::{PackageId, SourceId, InternedString};
 use crate::sources::git;
 use crate::sources::registry::MaybeLock;
 use crate::sources::registry::{RegistryConfig, RegistryData, CRATE_TEMPLATE, VERSION_TEMPLATE};
 use crate::util::errors::{CargoResult, CargoResultExt};
 use crate::util::{Config, Filesystem, Sha256};
-use crate::util::paths;
-use filetime::FileTime;
 use lazycell::LazyCell;
 use log::{debug, trace};
 use std::cell::{Cell, Ref, RefCell};
@@ -25,7 +23,7 @@ pub struct RemoteRegistry<'cfg> {
     tree: RefCell<Option<git2::Tree<'static>>>,
     repo: LazyCell<git2::Repository>,
     head: Cell<Option<git2::Oid>>,
-    last_updated: Cell<Option<FileTime>>,
+    current_sha: Cell<Option<InternedString>>,
 }
 
 impl<'cfg> RemoteRegistry<'cfg> {
@@ -38,7 +36,7 @@ impl<'cfg> RemoteRegistry<'cfg> {
             tree: RefCell::new(None),
             repo: LazyCell::new(),
             head: Cell::new(None),
-            last_updated: Cell::new(None),
+            current_sha: Cell::new(None),
         }
     }
 
@@ -141,14 +139,13 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         self.config.assert_package_cache_locked(path)
     }
 
-    fn last_modified(&self) -> Option<FileTime> {
-        if let Some(time) = self.last_updated.get() {
-            return Some(time);
+    fn current_version(&self) -> Option<InternedString> {
+        if let Some(sha) = self.current_sha.get() {
+            return Some(sha);
         }
-        let path = self.config.assert_package_cache_locked(&self.index_path);
-        let mtime = paths::mtime(&path.join(LAST_UPDATED_FILE)).ok();
-        self.last_updated.set(mtime);
-        self.last_updated.get()
+        let sha = InternedString::new(&self.head().ok()?.to_string());
+        self.current_sha.set(Some(sha));
+        Some(sha)
     }
 
     fn load(
@@ -223,7 +220,7 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         self.prepare()?;
         self.head.set(None);
         *self.tree.borrow_mut() = None;
-        self.last_updated.set(None);
+        self.current_sha.set(None);
         let path = self.config.assert_package_cache_locked(&self.index_path);
         self.config
             .shell()

From e33881bccba18eb0f3516062b65e9dde55afe3e4 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Tue, 30 Apr 2019 14:38:55 -0700
Subject: [PATCH 7/7] Add debug assertions for cache contents

This will largely only get exercised during Cargo's own tests, but this
commit adds debug assertions verifying that the cache of registry JSON
files is valid and up to date whenever we consider it fresh.
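The pattern, sketched with hypothetical names rather than Cargo's actual
types: debug builds skip the early return for a fresh cache, recompute
the summaries anyway, and assert that both paths agree.

    fn load_summaries(cached: Option<Vec<u8>>, recompute: impl FnOnce() -> Vec<u8>) -> Vec<u8> {
        if let Some(cached) = cached {
            if !cfg!(debug_assertions) {
                // Release builds trust a fresh cache and take the fast path.
                return cached;
            }
            // Debug builds run the slow path regardless and verify that the
            // cache would have produced an identical result.
            let fresh = recompute();
            assert_eq!(fresh, cached, "registry cache desynchronized from index");
            return fresh;
        }
        recompute()
    }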
---
 src/cargo/sources/registry/index.rs | 39 +++++++++++++++++++++++------
 src/cargo/util/config.rs            |  7 +-----
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/src/cargo/sources/registry/index.rs b/src/cargo/sources/registry/index.rs
index 679f03916a8..f30ac76ea2a 100644
--- a/src/cargo/sources/registry/index.rs
+++ b/src/cargo/sources/registry/index.rs
@@ -456,9 +456,9 @@ impl Summaries {
     /// for `relative` from the underlying index (aka typically libgit2 with
     /// crates.io) and then parse everything in there.
     ///
-    /// * `last_index_update` - this is a file modification time where if any cache
-    ///   file is older than this the cache should be considered out of date and
-    ///   needs to be rebuilt.
+    /// * `index_version` - a version string to describe the current state of
+    ///   the index which for remote registries is the current git sha and
+    ///   for local registries is not available.
     /// * `root` - this is the root argument passed to `load`
     /// * `cache_root` - this is the root on the filesystem itself of where to
     ///   store cache files.
@@ -481,12 +481,17 @@ impl Summaries {
         // of reasons, but consider all of them non-fatal and just log their
         // occurrence in case anyone is debugging anything.
         let cache_path = cache_root.join(relative);
+        let mut cache_contents = None;
         if let Some(index_version) = index_version {
             match fs::read(&cache_path) {
                 Ok(contents) => match Summaries::parse_cache(contents, index_version) {
                     Ok(s) => {
                         log::debug!("fast path for registry cache of {:?}", relative);
-                        return Ok(Some(s))
+                        if cfg!(debug_assertions) {
+                            cache_contents = Some(s.raw_data);
+                        } else {
+                            return Ok(Some(s))
+                        }
                     }
                     Err(e) => {
                         log::debug!("failed to parse {:?} cache: {}", relative, e);
@@ -537,10 +542,19 @@ impl Summaries {
         // or we haven't updated the registry yet. If we actually ran the
         // closure though then we care about those errors.
         if !hit_closure {
+            debug_assert!(cache_contents.is_none());
             return Ok(None);
         }
         err?;
 
+        // If we've got debug assertions enabled and the cache was previously
+        // present and considered fresh, this is where the debug assertion
+        // actually happens: verify that our cache is indeed fresh and
+        // computes exactly the same value as before.
+        if cfg!(debug_assertions) && cache_contents.is_some() {
+            assert_eq!(cache_bytes, cache_contents);
+        }
+
         // Once we have our `cache_bytes` which represents the `Summaries` we're
         // about to return, write that back out to disk so future Cargo
         // invocations can use it.
@@ -556,6 +570,7 @@
             }
         }
 
+
         Ok(Some(ret))
     }
@@ -589,9 +604,15 @@
 // Implementation of serializing/deserializing the cache of summaries on disk.
 // Currently the format looks like:
 //
-// +--------------+----------------+---+----------------+---+
-// | version byte | version string | 0 | JSON blob ...  | 0 | ...
-// +--------------+----------------+---+----------------+---+
+// +--------------+-------------+---+
+// | version byte | git sha rev | 0 |
+// +--------------+-------------+---+
+//
+// followed by...
+//
+// +----------------+---+------------+---+
+// | semver version | 0 |  JSON blob | 0 | ...
+// +----------------+---+------------+---+
 //
 // The idea is that this is a very easy file for Cargo to parse in future
 // invocations. The read from disk should be quite fast and then afterwards all
@@ -599,7 +620,9 @@
 //
 // The leading version byte is intended to ensure that there's some level of
 // future compatibility against changes to this cache format so if different
-// versions of Cargo share the same cache they don't get too confused.
+// versions of Cargo share the same cache they don't get too confused. The git
+// sha lets us know when the file needs to be regenerated (it needs regeneration
+// whenever the index itself updates).
 
 const CURRENT_CACHE_VERSION: u8 = 1;
diff --git a/src/cargo/util/config.rs b/src/cargo/util/config.rs
index 5ed8f5140c1..c3935d7281e 100644
--- a/src/cargo/util/config.rs
+++ b/src/cargo/util/config.rs
@@ -29,13 +29,8 @@ use crate::util::errors::{self, internal, CargoResult, CargoResultExt};
 use crate::util::toml as cargo_toml;
 use crate::util::Filesystem;
 use crate::util::Rustc;
-<<<<<<< HEAD
-use crate::util::{paths, validate_package_name};
-use crate::util::{ToUrl, ToUrlWithBase};
-=======
-use crate::util::{ToUrl, ToUrlWithBase};
 use crate::util::{paths, validate_package_name, FileLock};
->>>>>>> Make registry locking more coarse
+use crate::util::{ToUrl, ToUrlWithBase};
 
 /// Configuration information for cargo. This is not specific to a build, it is information
 /// relating to cargo itself.