Auto merge of #119367 - Mark-Simulacrum:relative-spans, r=wesleywiser

bors · bors · commit fe2cfd45055a · 2023-12-30T05:43:39.000Z
Shrink span encoding further Spans are now stored in a more compact form which cuts down on at least 1 byte per span (indirect/direct encoding) and at most 3 bytes per span (indirect/direct encoding, context byte, length byte). As a result, libcore metadata shrinks by 1.5MB. I'm not a huge fan of the fairly manual encoding/decoding from bits implemented here. Something like Tokio's pack abstraction (https://github.com/tokio-rs/tokio/blob/master/tokio/src/util/bit.rs) might be desirable to cut down on some of the shifting etc. We might also say that this isn't worth doing :) I took a look at copying the span encoding we use in memory (described [here](https://github.com/rust-lang/rust/blob/master/compiler/rustc_span/src/span_encoding.rs)). I think the format there makes a lot more sense for in-memory storage where prioritizing a fixed length (i.e., 4 or 8 bytes) is much more important. In metadata, it's much easier for us to have variable-length values, so there's less of a cliff if we don't quite fit. The bit packing scheme there would need changes to fit the varint scheme since it has a lot of all-1s patterns as the "relative offset" form.
diff --git a/compiler/rustc_metadata/src/rmeta/decoder.rs b/compiler/rustc_metadata/src/rmeta/decoder.rs
@@ -508,39 +508,37 @@ impl<'a, 'tcx> Decodable<DecodeContext<'a, 'tcx>> for ExpnId {
 impl<'a, 'tcx> Decodable<DecodeContext<'a, 'tcx>> for Span {
     fn decode(decoder: &mut DecodeContext<'a, 'tcx>) -> Span {
         let start = decoder.position();
-        let mode = SpanEncodingMode::decode(decoder);
-        let data = match mode {
-            SpanEncodingMode::Direct => SpanData::decode(decoder),
-            SpanEncodingMode::RelativeOffset(offset) => {
-                decoder.with_position(start - offset, |decoder| {
-                    let mode = SpanEncodingMode::decode(decoder);
-                    debug_assert!(matches!(mode, SpanEncodingMode::Direct));
-                    SpanData::decode(decoder)
-                })
-            }
-            SpanEncodingMode::AbsoluteOffset(addr) => decoder.with_position(addr, |decoder| {
-                let mode = SpanEncodingMode::decode(decoder);
-                debug_assert!(matches!(mode, SpanEncodingMode::Direct));
-                SpanData::decode(decoder)
-            }),
+        let tag = SpanTag(decoder.peek_byte());
+        let data = if tag.kind() == SpanKind::Indirect {
+            // Skip past the tag we just peek'd.
+            decoder.read_u8();
+            let offset_or_position = decoder.read_usize();
+            let position = if tag.is_relative_offset() {
+                start - offset_or_position
+            } else {
+                offset_or_position
+            };
+            decoder.with_position(position, SpanData::decode)
+        } else {
+            SpanData::decode(decoder)
         };
         Span::new(data.lo, data.hi, data.ctxt, data.parent)
     }
 }
 
 impl<'a, 'tcx> Decodable<DecodeContext<'a, 'tcx>> for SpanData {
     fn decode(decoder: &mut DecodeContext<'a, 'tcx>) -> SpanData {
-        let ctxt = SyntaxContext::decode(decoder);
-        let tag = u8::decode(decoder);
+        let tag = SpanTag::decode(decoder);
+        let ctxt = tag.context().unwrap_or_else(|| SyntaxContext::decode(decoder));
 
-        if tag == TAG_PARTIAL_SPAN {
+        if tag.kind() == SpanKind::Partial {
             return DUMMY_SP.with_ctxt(ctxt).data();
         }
 
-        debug_assert!(tag == TAG_VALID_SPAN_LOCAL || tag == TAG_VALID_SPAN_FOREIGN);
+        debug_assert!(tag.kind() == SpanKind::Local || tag.kind() == SpanKind::Foreign);
 
         let lo = BytePos::decode(decoder);
-        let len = BytePos::decode(decoder);
+        let len = tag.length().unwrap_or_else(|| BytePos::decode(decoder));
         let hi = lo + len;
 
         let Some(sess) = decoder.sess else {
@@ -581,7 +579,7 @@ impl<'a, 'tcx> Decodable<DecodeContext<'a, 'tcx>> for SpanData {
         // treat the 'local' and 'foreign' cases almost identically during deserialization:
         // we can call `imported_source_file` for the proper crate, and binary search
         // through the returned slice using our span.
-        let source_file = if tag == TAG_VALID_SPAN_LOCAL {
+        let source_file = if tag.kind() == SpanKind::Local {
             decoder.cdata().imported_source_file(metadata_index, sess)
         } else {
             // When we encode a proc-macro crate, all `Span`s should be encoded
diff --git a/compiler/rustc_metadata/src/rmeta/encoder.rs b/compiler/rustc_metadata/src/rmeta/encoder.rs
@@ -177,15 +177,17 @@ impl<'a, 'tcx> Encodable<EncodeContext<'a, 'tcx>> for Span {
                 // previously saved offset must be smaller than the current position.
                 let offset = s.opaque.position() - last_location;
                 if offset < last_location {
-                    SpanEncodingMode::RelativeOffset(offset).encode(s)
+                    SpanTag::indirect(true).encode(s);
+                    offset.encode(s);
                 } else {
-                    SpanEncodingMode::AbsoluteOffset(last_location).encode(s)
+                    SpanTag::indirect(false).encode(s);
+                    last_location.encode(s);
                 }
             }
             Entry::Vacant(v) => {
                 let position = s.opaque.position();
                 v.insert(position);
-                SpanEncodingMode::Direct.encode(s);
+                // Data is encoded with a SpanTag prefix (see below).
                 self.data().encode(s);
             }
         }
@@ -225,14 +227,15 @@ impl<'a, 'tcx> Encodable<EncodeContext<'a, 'tcx>> for SpanData {
         // IMPORTANT: If this is ever changed, be sure to update
         // `rustc_span::hygiene::raw_encode_expn_id` to handle
         // encoding `ExpnData` for proc-macro crates.
-        if s.is_proc_macro {
-            SyntaxContext::root().encode(s);
-        } else {
-            self.ctxt.encode(s);
-        }
+        let ctxt = if s.is_proc_macro { SyntaxContext::root() } else { self.ctxt };
 
         if self.is_dummy() {
-            return TAG_PARTIAL_SPAN.encode(s);
+            let tag = SpanTag::new(SpanKind::Partial, ctxt, 0);
+            tag.encode(s);
+            if tag.context().is_none() {
+                ctxt.encode(s);
+            }
+            return;
         }
 
         // The Span infrastructure should make sure that this invariant holds:
@@ -250,7 +253,12 @@ impl<'a, 'tcx> Encodable<EncodeContext<'a, 'tcx>> for SpanData {
         if !source_file.contains(self.hi) {
             // Unfortunately, macro expansion still sometimes generates Spans
             // that malformed in this way.
-            return TAG_PARTIAL_SPAN.encode(s);
+            let tag = SpanTag::new(SpanKind::Partial, ctxt, 0);
+            tag.encode(s);
+            if tag.context().is_none() {
+                ctxt.encode(s);
+            }
+            return;
         }
 
         // There are two possible cases here:
@@ -269,7 +277,7 @@ impl<'a, 'tcx> Encodable<EncodeContext<'a, 'tcx>> for SpanData {
         // if we're a proc-macro crate.
         // This allows us to avoid loading the dependencies of proc-macro crates: all of
         // the information we need to decode `Span`s is stored in the proc-macro crate.
-        let (tag, metadata_index) = if source_file.is_imported() && !s.is_proc_macro {
+        let (kind, metadata_index) = if source_file.is_imported() && !s.is_proc_macro {
             // To simplify deserialization, we 'rebase' this span onto the crate it originally came
             // from (the crate that 'owns' the file it references. These rebased 'lo' and 'hi'
             // values are relative to the source map information for the 'foreign' crate whose
@@ -287,7 +295,7 @@ impl<'a, 'tcx> Encodable<EncodeContext<'a, 'tcx>> for SpanData {
                 }
             };
 
-            (TAG_VALID_SPAN_FOREIGN, metadata_index)
+            (SpanKind::Foreign, metadata_index)
         } else {
             // Record the fact that we need to encode the data for this `SourceFile`
             let source_files =
@@ -296,7 +304,7 @@ impl<'a, 'tcx> Encodable<EncodeContext<'a, 'tcx>> for SpanData {
             let metadata_index: u32 =
                 metadata_index.try_into().expect("cannot export more than U32_MAX files");
 
-            (TAG_VALID_SPAN_LOCAL, metadata_index)
+            (SpanKind::Local, metadata_index)
         };
 
         // Encode the start position relative to the file start, so we profit more from the
@@ -307,14 +315,20 @@ impl<'a, 'tcx> Encodable<EncodeContext<'a, 'tcx>> for SpanData {
         // from the variable-length integer encoding that we use.
         let len = self.hi - self.lo;
 
+        let tag = SpanTag::new(kind, ctxt, len.0 as usize);
         tag.encode(s);
+        if tag.context().is_none() {
+            ctxt.encode(s);
+        }
         lo.encode(s);
-        len.encode(s);
+        if tag.length().is_none() {
+            len.encode(s);
+        }
 
         // Encode the index of the `SourceFile` for the span, in order to make decoding faster.
         metadata_index.encode(s);
 
-        if tag == TAG_VALID_SPAN_FOREIGN {
+        if kind == SpanKind::Foreign {
             // This needs to be two lines to avoid holding the `s.source_file_cache`
             // while calling `cnum.encode(s)`
             let cnum = s.source_file_cache.0.cnum;
diff --git a/compiler/rustc_metadata/src/rmeta/mod.rs b/compiler/rustc_metadata/src/rmeta/mod.rs
@@ -66,13 +66,6 @@ const METADATA_VERSION: u8 = 9;
 /// unsigned integer, and further followed by the rustc version string.
 pub const METADATA_HEADER: &[u8] = &[b'r', b'u', b's', b't', 0, 0, 0, METADATA_VERSION];
 
-#[derive(Encodable, Decodable)]
-enum SpanEncodingMode {
-    RelativeOffset(usize),
-    AbsoluteOffset(usize),
-    Direct,
-}
-
 /// A value of type T referred to by its absolute position
 /// in the metadata, and which can be decoded lazily.
 ///
@@ -488,10 +481,88 @@ bitflags::bitflags! {
     }
 }
 
-// Tags used for encoding Spans:
-const TAG_VALID_SPAN_LOCAL: u8 = 0;
-const TAG_VALID_SPAN_FOREIGN: u8 = 1;
-const TAG_PARTIAL_SPAN: u8 = 2;
+/// A span tag byte encodes a bunch of data, so that we can cut out a few extra bytes from span
+/// encodings (which are very common, for example, libcore has ~650,000 unique spans and over 1.1
+/// million references to prior-written spans).
+///
+/// The byte format is split into several parts:
+///
+/// [ a a a a a c d d ]
+///
+/// `a` bits represent the span length. We have 5 bits, so we can store lengths up to 30 inline, with
+/// an all-1s pattern representing that the length is stored separately.
+///
+/// `c` represents whether the span context is zero (and then it is not stored as a separate varint)
+/// for direct span encodings, and whether the offset is absolute or relative otherwise (zero for
+/// absolute).
+///
+/// d bits represent the kind of span we are storing (local, foreign, partial, indirect).
+#[derive(Encodable, Decodable, Copy, Clone)]
+struct SpanTag(u8);
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum SpanKind {
+    Local = 0b00,
+    Foreign = 0b01,
+    Partial = 0b10,
+    // Indicates the actual span contents are elsewhere.
+    // If this is the kind, then the span context bit represents whether it is a relative or
+    // absolute offset.
+    Indirect = 0b11,
+}
+
+impl SpanTag {
+    fn new(kind: SpanKind, context: rustc_span::SyntaxContext, length: usize) -> SpanTag {
+        let mut data = 0u8;
+        data |= kind as u8;
+        if context.is_root() {
+            data |= 0b100;
+        }
+        let all_1s_len = (0xffu8 << 3) >> 3;
+        // strictly less than - all 1s pattern is a sentinel for storage being out of band.
+        if length < all_1s_len as usize {
+            data |= (length as u8) << 3;
+        } else {
+            data |= all_1s_len << 3;
+        }
+
+        SpanTag(data)
+    }
+
+    fn indirect(relative: bool) -> SpanTag {
+        let mut tag = SpanTag(SpanKind::Indirect as u8);
+        if relative {
+            tag.0 |= 0b100;
+        }
+        tag
+    }
+
+    fn kind(self) -> SpanKind {
+        let masked = self.0 & 0b11;
+        match masked {
+            0b00 => SpanKind::Local,
+            0b01 => SpanKind::Foreign,
+            0b10 => SpanKind::Partial,
+            0b11 => SpanKind::Indirect,
+            _ => unreachable!(),
+        }
+    }
+
+    fn is_relative_offset(self) -> bool {
+        debug_assert_eq!(self.kind(), SpanKind::Indirect);
+        self.0 & 0b100 != 0
+    }
+
+    fn context(self) -> Option<rustc_span::SyntaxContext> {
+        if self.0 & 0b100 != 0 { Some(rustc_span::SyntaxContext::root()) } else { None }
+    }
+
+    fn length(self) -> Option<rustc_span::BytePos> {
+        let all_1s_len = (0xffu8 << 3) >> 3;
+        let len = self.0 >> 3;
+        if len != all_1s_len { Some(rustc_span::BytePos(u32::from(len))) } else { None }
+    }
+}
 
 // Tags for encoding Symbol's
 const SYMBOL_STR: u8 = 0;