replicators: Add initial support for latin1 replication

mvzink · mvzink · commit edbf0afc5fec · 2025-03-27T23:29:17.000Z
Note that this only allows us to snapshot latin1 columns and read latin1 columns from the binlog. It simply causes the data to be decoded to utf8 before storage. This means that comparisons (cache lookup and ordering), retrieval, and functions operating on this data will not match upstream. This introduces a new `readyset_data::encoding::Encoding` type which maps from MySQL collation IDs to corresponding encoders. We are using `encoding_rs`, which implements the WHATWG Encoding Standard. This means it does not precisely match MySQL's behavior for all encodings (e.g. it uses a different replacement character than MySQL for invalid byte sequences for some encodings). We may want to replace it in the future, but it should be simple to swap it out within `readyset-data` to avoid updating all the callsites (of which more will soon be added). Release-Note-Core: As a first step toward supporting latin1 text columns in MySQL, we now allow replicating these columns but storing them re-encoded as utf8. Data will be retrieved as utf8, so comparisons and retrieval will not match upstream. Future work will improve support. Change-Id: I86c4fbf8da2361ebbfa8ad6224bbce8763ad8e2d Reviewed-on: https://gerrit.readyset.name/c/readyset/+/9101 Reviewed-by: Johnathan Davis <jcd@readyset.io> Tested-by: Buildkite CI
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -125,6 +125,7 @@ derive_more = { version = "1.0.0", features = [
     "display",
 ] }
 diff = "0.1.13"
+encoding_rs = "0.8.35"
 enum-display-derive = "0.1.1"
 enum-kinds = "0.5.1"
 enum_dispatch = "0.3.13"
diff --git a/readyset-data/Cargo.toml b/readyset-data/Cargo.toml
@@ -11,6 +11,7 @@ bit-vec = { workspace = true, features = ["serde"] }
 bytes = { workspace = true }
 chrono = { workspace = true, features = ["serde"] }
 chrono-tz = { workspace = true, features = ["serde"] }
+encoding_rs = { workspace = true }
 eui48 = { workspace = true }
 itertools = { workspace = true }
 lazy_static = { workspace = true }
diff --git a/readyset-data/src/encoding.rs b/readyset-data/src/encoding.rs
@@ -0,0 +1,169 @@
+use std::fmt;
+
+use encoding_rs::{UTF_8, WINDOWS_1252};
+use readyset_errors::ReadySetError;
+use readyset_errors::ReadySetResult;
+
+/// Supported character encodings for string data
+///
+/// [`Encoding::decode`] and [`Encoding::encode`] convert between bytes in one of these encodings
+/// and UTF-8 strings.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Encoding {
+    /// UTF-8
+    ///
+    /// Note, we don't distinguish between MySQL's default utf8mb4 and deprecated utf8mb3 (which
+    /// only supports the BMP).
+    Utf8,
+    /// latin1
+    ///
+    /// Converted using `encoding_rs::WINDOWS_1252` (i.e. CP1252 rather than strict ISO-8859-1).
+    Latin1,
+    /// Binary data (not interpreted as text)
+    Binary,
+}
+
+impl fmt::Display for Encoding {
+    /// Writes the lowercase name of the encoding: `utf8`, `latin1`, or `binary`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let name = match self {
+            Self::Utf8 => "utf8",
+            Self::Latin1 => "latin1",
+            Self::Binary => "binary",
+        };
+        f.write_str(name)
+    }
+}
+
+impl Encoding {
+    /// Returns the encoding for the character set that a MySQL collation ID belongs to.
+    ///
+    /// For reference, see [`mysql_common::collations::CollationId`].
+    ///
+    /// Collation IDs that are not listed here are treated as UTF-8, so data in a character set
+    /// this type does not know about is not rejected at this point.
+    pub fn from_mysql_collation_id(collation_id: u16) -> Self {
+        match collation_id {
+            // ascii, utf8mb3, utf8mb4
+            11 | 192..=247 | 255..=323 => Self::Utf8,
+            // latin1
+            5 | 8 | 15 | 31 | 47 | 48 | 49 | 94 => Self::Latin1,
+            // binary
+            63 => Self::Binary,
+
+            // Default to UTF-8 for other collations
+            _ => Self::Utf8,
+        }
+    }
+
+    /// Returns the `encoding_rs` codec for this encoding, or `None` for [`Encoding::Binary`],
+    /// which is not text and so has no codec.
+    fn get_encoding_rs(&self) -> Option<&'static encoding_rs::Encoding> {
+        match self {
+            Self::Utf8 => Some(UTF_8),
+            Self::Latin1 => Some(WINDOWS_1252),
+            Self::Binary => None,
+        }
+    }
+
+    /// Decode bytes from this encoding to a UTF-8 String
+    ///
+    /// The bytes are always interpreted in this encoding: a leading byte order mark is neither
+    /// used to pick a different encoding nor removed from the output. This matters for e.g.
+    /// latin1 text starting with `0xEF 0xBB 0xBF`, whose bytes look like a UTF-8 BOM.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ReadySetError::DecodingError`] if this is [`Encoding::Binary`] (binary data
+    /// can't be converted to a String), or if `bytes` contains a sequence that is not valid in
+    /// this encoding.
+    pub fn decode(&self, bytes: &[u8]) -> ReadySetResult<String> {
+        let encoding = self.get_encoding_rs().ok_or_else(|| ReadySetError::DecodingError {
+            encoding: self.to_string(),
+            message: "Cannot decode binary data to string".to_string(),
+        })?;
+
+        // `encoding_rs::Encoding::decode` sniffs for a BOM and may silently switch to UTF-8 or
+        // UTF-16, so use the variant that never does. It also returns `None` on malformed input
+        // instead of substituting replacement characters that we would reject anyway.
+        encoding
+            .decode_without_bom_handling_and_without_replacement(bytes)
+            .map(|decoded| decoded.into_owned())
+            .ok_or_else(|| ReadySetError::DecodingError {
+                encoding: self.to_string(),
+                message: "Some characters couldn't be decoded properly".to_string(),
+            })
+    }
+
+    /// Encode a UTF-8 string to bytes in this encoding
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ReadySetError::EncodingError`] if this is [`Encoding::Binary`], or if `string`
+    /// contains a character that cannot be represented in this encoding.
+    pub fn encode(&self, string: &str) -> ReadySetResult<Vec<u8>> {
+        let encoding = self.get_encoding_rs().ok_or_else(|| ReadySetError::EncodingError {
+            encoding: self.to_string(),
+            message: "Cannot encode string to binary".to_string(),
+        })?;
+
+        // On unmappable characters `encoding_rs` substitutes a replacement and sets the flag; we
+        // treat that as a failure rather than returning altered data.
+        let (encoded, _output_encoding, had_unmappable) = encoding.encode(string);
+        if had_unmappable {
+            return Err(ReadySetError::EncodingError {
+                encoding: self.to_string(),
+                message: "Some characters couldn't be encoded properly".to_string(),
+            });
+        }
+
+        Ok(encoded.into_owned())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_from_mysql_collation_id() {
+        assert_eq!(Encoding::from_mysql_collation_id(8), Encoding::Latin1);
+        assert_eq!(Encoding::from_mysql_collation_id(63), Encoding::Binary);
+        assert_eq!(Encoding::from_mysql_collation_id(255), Encoding::Utf8);
+        // Unrecognized collations fall back to UTF-8
+        assert_eq!(Encoding::from_mysql_collation_id(u16::MAX), Encoding::Utf8);
+    }
+
+    #[test]
+    fn test_latin1_to_utf8() {
+        // Test with ASCII characters (valid in both Latin1 and UTF-8)
+        let latin1_bytes = b"Hello World";
+        let result = Encoding::Latin1.decode(latin1_bytes).unwrap();
+        assert_eq!(result, "Hello World");
+
+        // Test with Latin1 characters that need conversion in UTF-8
+        // For example, 0xE9 in Latin1 is 'é'
+        let latin1_bytes = &[0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0xE9]; // "Hello é" in Latin1
+        let result = Encoding::Latin1.decode(latin1_bytes).unwrap();
+        assert_eq!(result, "Hello é");
+
+        // Test with all high-bit Latin1 characters (0x80-0xFF)
+        let latin1_high_bytes: Vec<u8> = (0x80..=0xFF).collect();
+        let result = Encoding::Latin1.decode(&latin1_high_bytes).unwrap();
+        // Make sure all characters were decoded (should be 128 chars for bytes 0x80-0xFF)
+        assert_eq!(result.chars().count(), 128);
+        // Characters 0xA0-0xFF in Latin1 map to Unicode code points 0xA0-0xFF
+        for (byte, decoded) in latin1_high_bytes.iter().zip(result.chars()).skip(0x20) {
+            assert_eq!(u32::from(decoded), u32::from(*byte));
+        }
+    }
+
+    #[test]
+    fn test_decode_does_not_sniff_bom() {
+        // Latin1 text whose bytes happen to start like a UTF-8 BOM must stay Latin1
+        let result = Encoding::Latin1.decode(&[0xEF, 0xBB, 0xBF, 0x41]).unwrap();
+        assert_eq!(result, "\u{EF}\u{BB}\u{BF}A");
+
+        // Latin1 text whose bytes happen to start like a UTF-16LE BOM must stay Latin1
+        let result = Encoding::Latin1.decode(&[0xFF, 0xFE, 0x41]).unwrap();
+        assert_eq!(result, "\u{FF}\u{FE}A");
+
+        // A BOM stored in UTF-8 data is part of the data and must not be stripped
+        let result = Encoding::Utf8.decode(&[0xEF, 0xBB, 0xBF, 0x41]).unwrap();
+        assert_eq!(result, "\u{FEFF}A");
+    }
+
+    #[test]
+    fn test_invalid_utf8_is_rejected() {
+        let result = Encoding::Utf8.decode(&[0x48, 0xFF]);
+        assert!(matches!(
+            result,
+            Err(ReadySetError::DecodingError { .. })
+        ));
+    }
+
+    #[test]
+    fn test_binary_is_not_text() {
+        assert!(matches!(
+            Encoding::Binary.decode(b"abc"),
+            Err(ReadySetError::DecodingError { .. })
+        ));
+        assert!(matches!(
+            Encoding::Binary.encode("abc"),
+            Err(ReadySetError::EncodingError { .. })
+        ));
+    }
+
+    #[test]
+    fn test_utf8_to_latin1() {
+        // Test with ASCII (should work fine)
+        let utf8_str = "Hello World";
+        let result = Encoding::Latin1.encode(utf8_str).unwrap();
+        assert_eq!(result, b"Hello World");
+
+        // Test with Latin1 characters
+        let utf8_str = "Hello é";
+        let result = Encoding::Latin1.encode(utf8_str).unwrap();
+        assert_eq!(result, &[0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0xE9]);
+
+        // Test with characters outside Latin1 range (should fail)
+        let utf8_str = "Hello 😊"; // Emoji is outside Latin1 range
+        let result = Encoding::Latin1.encode(utf8_str);
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            ReadySetError::EncodingError { encoding, .. } => {
+                assert_eq!(encoding, "latin1");
+            }
+            e => panic!("Unexpected error type: {:?}", e),
+        }
+    }
+}
diff --git a/readyset-data/src/lib.rs b/readyset-data/src/lib.rs
@@ -34,6 +34,7 @@ use uuid::Uuid;
 mod array;
 mod collation;
 pub mod dialect;
+pub mod encoding;
 mod r#enum;
 mod float;
 mod integer;
diff --git a/readyset-e2e-tests/Cargo.toml b/readyset-e2e-tests/Cargo.toml
@@ -8,8 +8,10 @@ edition = "2021"
 [dev-dependencies]
 chrono = { workspace = true }
 futures = { workspace = true }
+itertools = { workspace = true }
 mysql_async = { workspace = true }
 mysql_common = { workspace = true }
+pretty_assertions = { workspace = true }
 proptest = { workspace = true }
 readyset-adapter = { path = "../readyset-adapter" }
 readyset-client-test-helpers = { path = "../readyset-client-test-helpers", features = [
diff --git a/readyset-e2e-tests/tests/encoding_mysql.rs b/readyset-e2e-tests/tests/encoding_mysql.rs
@@ -0,0 +1,170 @@
+use mysql_async::prelude::Queryable;
+use pretty_assertions::assert_eq;
+use readyset_client_test_helpers::{
+    mysql_helpers::{self, MySQLAdapter},
+    sleep, TestBuilder,
+};
+use std::fmt::{Display, UpperHex};
+use test_utils::{serial, slow};
+
+/// At present, this tests that snapshotting and streaming replication of a varchar column with the
+/// specified character set results in the same utf8 encoded version of the data in Readyset. This
+/// means that we connect with relevant session variables configured for utf8 (i.e. `SET NAMES
+/// utf8mb4;`) so that MySQL will convert the data to utf8 before returning it to the client. This
+/// causes Readyset to return matching data iff it decodes the replicated (e.g. latin1) data to utf8
+/// before storing it.
+///
+/// In the future, this test can be extended for other boundary interfaces, e.g. also checking that
+/// `HEX(text)` returns the same values on both MySQL and Readyset; or that connecting to Readyset
+/// and requesting the original character set results in it being re-encoded from utf8 to the
+/// original before being returned to the client.
+///
+/// * `charset` is interpolated into the `CHARACTER SET` clause of both test tables.
+/// * `hex_len` is the zero-padded width of the uppercase hex string passed to `UNHEX()` for each
+///   value.
+/// * `range` yields the values to insert; each value is used both as the row `id` and, in hex, as
+///   the bytes of the `text` column.
+#[cfg(test)]
+async fn test_encoding_replication_inner<T>(
+    charset: &str,
+    hex_len: usize,
+    range: impl IntoIterator<Item = T>,
+) where
+    T: UpperHex + Display,
+{
+    readyset_tracing::init_test_logging();
+
+    // NOTE(review): the `println!`/`dbg!` calls throughout this function look like debugging
+    // aids — consider removing them or routing them through the test logger.
+    println!("starting");
+
+    let upstream_opts = mysql_helpers::upstream_config().db_name(Some("encoding_test"));
+    mysql_helpers::recreate_database("encoding_test").await;
+
+    let mut upstream_conn = mysql_async::Conn::new(upstream_opts).await.unwrap();
+
+    // This table is created and filled before Readyset starts, so it is picked up by the
+    // initial snapshot.
+    let create_snapshot_table = format!(
+        r#"
+            SET NAMES utf8mb4;
+            DROP TABLE IF EXISTS encoding_snapshot;
+            CREATE TABLE encoding_snapshot (
+                id INT NOT NULL PRIMARY KEY,
+                text VARCHAR(255) CHARACTER SET {}
+            );
+        "#,
+        charset
+    );
+    upstream_conn
+        .query_drop(create_snapshot_table)
+        .await
+        .unwrap();
+
+    println!("created table");
+
+    // Build one `(id, UNHEX('..'))` tuple per value in `range`: the value is the id, and its
+    // zero-padded uppercase hex form gives the raw bytes stored in the text column.
+    let insert_values: String = range
+        .into_iter()
+        .map(|i| format!("({i}, UNHEX('{i:0width$X}'))", width = hex_len))
+        .collect::<Vec<String>>()
+        .join(",");
+    upstream_conn
+        .query_drop(format!(
+            "INSERT INTO encoding_snapshot (id, text) VALUES {insert_values}"
+        ))
+        .await
+        .unwrap();
+
+    println!("inserted data");
+
+    // Read the rows back from upstream; these are the expected values for both the snapshot and
+    // the streaming comparison below.
+    let my_rows: Vec<(i64, Vec<u8>)> = upstream_conn
+        .query("SELECT id, text FROM encoding_snapshot ORDER BY id")
+        .await
+        .unwrap();
+    dbg!(&my_rows);
+
+    println!("starting readyset");
+
+    // Test snapshot replication
+    let (rs_opts, _handle, shutdown_tx) = TestBuilder::default()
+        .recreate_database(false)
+        .fallback_db("encoding_test".to_string())
+        .build::<MySQLAdapter>()
+        .await;
+
+    println!("started readyset");
+
+    sleep().await;
+
+    println!("slept");
+
+    // Connect to Readyset and query the snapshotted table; its rows must match upstream
+    let mut rs_conn = mysql_async::Conn::new(mysql_async::OptsBuilder::from_opts(rs_opts))
+        .await
+        .unwrap();
+
+    println!("connected");
+
+    let rs_snapshot_rows: Vec<(i64, Vec<u8>)> = rs_conn
+        .query("SELECT id, text FROM encoding_snapshot ORDER BY id")
+        .await
+        .unwrap();
+
+    dbg!(&rs_snapshot_rows);
+
+    assert_eq!(my_rows, rs_snapshot_rows);
+
+    // Test streaming replication: this table is created and filled while Readyset is running
+    let create_streaming_table = format!(
+        r#"
+            DROP TABLE IF EXISTS encoding_streaming;
+            CREATE TABLE encoding_streaming (
+                id INT NOT NULL PRIMARY KEY,
+                text VARCHAR(255) CHARACTER SET {}
+            );
+        "#,
+        charset
+    );
+    upstream_conn
+        .query_drop(create_streaming_table)
+        .await
+        .unwrap();
+
+    println!("created streaming table");
+
+    upstream_conn
+        .query_drop(format!(
+            "INSERT INTO encoding_streaming (id, text) VALUES {insert_values}"
+        ))
+        .await
+        .unwrap();
+
+    println!("inserted streaming data");
+
+    sleep().await;
+
+    println!("slept");
+
+    let rs_streaming_rows: Vec<(i64, Vec<u8>)> = rs_conn
+        .query("SELECT id, text FROM encoding_streaming ORDER BY id")
+        .await
+        .unwrap();
+
+    dbg!(&rs_streaming_rows);
+
+    // The same values were inserted into both tables, so the upstream rows read from the
+    // snapshot table are also the expected rows here.
+    assert_eq!(my_rows, rs_streaming_rows);
+
+    println!("shutting down");
+
+    shutdown_tx.shutdown().await;
+
+    println!("shutdown complete");
+}
+
+/// Declares an async test named `$test_name`, annotated with `#[tokio::test]`,
+/// `#[serial(mysql)]` and `#[slow]`, that awaits `test_encoding_replication_inner` with the given
+/// character set, hex width, and range of values.
+macro_rules! test_encoding_replication {
+    ($test_name:ident, $character_set:expr, $hex_width:expr, $values:expr) => {
+        #[tokio::test]
+        #[serial(mysql)]
+        #[slow]
+        async fn $test_name() {
+            test_encoding_replication_inner($character_set, $hex_width, $values).await;
+        }
+    };
+}
+
+// One replication test per character set. Arguments: test name, MySQL character set name, number
+// of hex digits per inserted value, and the range of values to insert. Only latin1 is exercised
+// above 0x7F; the other character sets are tested with 0x00-0x7F only.
+test_encoding_replication!(ascii, "ascii", 2, 0..=127);
+test_encoding_replication!(test_latin1, "latin1", 2, 0..=255);
+test_encoding_replication!(test_utf8mb4, "utf8mb4", 2, 0..=127);
+test_encoding_replication!(test_utf8mb3, "utf8mb3", 2, 0..=127);
diff --git a/readyset-errors/src/lib.rs b/readyset-errors/src/lib.rs
diff --git a/replicators/src/mysql_connector/connector.rs b/replicators/src/mysql_connector/connector.rs