diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 0000000..ddff440
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,2 @@
+[build]
+rustflags = ["-C", "target-cpu=native"]
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 1cd0722..b245e70 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -17,12 +17,12 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       # rust-version from Cargo.toml
-      - name: Install Rust 1.65.0
-        uses: dtolnay/rust-toolchain@1.65.0
+      - name: Install Rust 1.82.0
+        uses: dtolnay/rust-toolchain@1.82.0
       - name: Use predefined lockfile
         run: mv Cargo.lock.MSRV Cargo.lock
       - name: Build
-        run: cargo check --locked
+        run: cargo check
 
   build:
     runs-on: ubuntu-latest
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..d9cf66e
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "rust-analyzer.cargo.features": [
+        "libblur",
+        "rayon"
+    ]
+}
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index 37f1a26..bf0aab3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "ssimulacra2"
-version = "0.5.1"
+version = "0.6.0"
 edition = "2021"
 description = "Rust implementation of the SSIMULACRA2 metric"
 repository = "https://github.com/rust-av/ssimulacra2"
@@ -10,16 +10,17 @@ exclude = ["test_data"]
 license = "BSD-2-Clause"
 
 # When changing MSRV: Also update README and .github/workflows/rust.yml
-rust-version = "1.65.0"
+rust-version = "1.82.0"
 
 [features]
 default = ["rayon"]
 
 [dependencies]
-num-traits = "0.2.15"
-rayon = { version = "1.5.3", optional = true }
-thiserror = "2.0.9"
-yuvxyb = "0.4.1"
+libblur = { version = "0.17", optional = true }
+num-traits = "0.2.19"
+rayon = { version = "1.10", optional = true }
+thiserror = "2.0.12"
+yuvxyb = "0.4.2"
 
 [build-dependencies.yuvxyb-math]
 version = "0.1"
@@ -27,7 +28,7 @@ version = "0.1"
 [dev-dependencies]
 criterion = "0.5.0"
 image = "0.25.0"
-rand = "0.8.5"
+rand = "0.9.1"
 
 [[bench]]
 name = "benches"
diff --git a/benches/benches.rs b/benches/benches.rs
index a9d4316..44d4b85 100644
--- a/benches/benches.rs
+++ b/benches/benches.rs
@@ -2,9 +2,10 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use num_traits::clamp;
 use rand::Rng;
 use ssimulacra2::{
-    compute_frame_ssimulacra2, Blur, ColorPrimaries, Frame, MatrixCoefficients, Plane,
-    TransferCharacteristic, Yuv, YuvConfig,
+    compute_frame_ssimulacra2, xyb_to_planar, Blur, BlurOperator, ColorPrimaries, Frame,
+    MatrixCoefficients, Plane, TransferCharacteristic, Yuv, YuvConfig,
 };
+use yuvxyb::{LinearRgb, Xyb};
 
 fn make_yuv(
     ss: (u8, u8),
@@ -36,10 +37,10 @@ fn make_yuv(
             ),
         ],
     };
-    let mut rng = rand::thread_rng();
+    let mut rng = rand::rng();
     for (i, plane) in data.planes.iter_mut().enumerate() {
         for val in plane.data_origin_mut().iter_mut() {
-            *val = rng.gen_range(if full_range {
+            *val = rng.random_range(if full_range {
                 0..=255
             } else if i == 0 {
                 16..=235
@@ -64,7 +65,7 @@ fn make_yuv(
 }
 
 fn distort_yuv(input: &Yuv<u8>) -> Yuv<u8> {
-    let mut rng = rand::thread_rng();
+    let mut rng = rand::rng();
     let mut planes = [
         input.data()[0].clone(),
         input.data()[1].clone(),
@@ -72,7 +73,7 @@ fn distort_yuv(input: &Yuv<u8>) -> Yuv<u8> {
     ];
     for plane in &mut planes {
         for pix in plane.data_origin_mut() {
-            *pix = clamp(i16::from(*pix) + rng.gen_range(-16..=16), 0, 255) as u8;
+            *pix = clamp(i16::from(*pix) + rng.random_range(-16..=16), 0, 255) as u8;
         }
     }
     let data: Frame<u8> = Frame { planes };
@@ -121,10 +122,169 @@ fn bench_blur(c: &mut Criterion) {
 
         // Blur the image
         let mut blur = Blur::new(width, height);
+        let mut out = std::array::from_fn(|_| vec![0.0f32; width * height]);
 
-        b.iter(|| blur.blur(black_box(&image)))
+        b.iter(|| blur.blur(black_box(&image), black_box(&mut out)));
     });
 }
 
-criterion_group!(benches, bench_ssimulacra2, bench_blur);
+fn create_linearrgb_from_image(path: &str) -> LinearRgb {
+    let img = image::open(path).unwrap();
+    let img = img.to_rgb8();
+    let (width, height) = img.dimensions();
+
+    let mut data = vec![[0.0f32; 3]; (width * height) as usize];
+    for (i, pixel) in img.pixels().enumerate() {
+        data[i] = [
+            pixel[0] as f32 / 255.0,
+            pixel[1] as f32 / 255.0,
+            pixel[2] as f32 / 255.0,
+        ];
+    }
+
+    LinearRgb::new(data, width as usize, height as usize).expect("이미지 변환 실패")
+}
+
+// downscale_by_2
+fn bench_downscale_by_2(c: &mut Criterion) {
+    // let mut group = c.benchmark_group("downscale");
+    // group.measurement_time(std::time::Duration::from_secs_f32(9.2)); //Unable to complete 100 samples in 9.0s, so increased to 9.2s
+
+    c.bench_function("downscale_by_2", |b| {
+        // load the image
+        let image = create_linearrgb_from_image("test_data/tank_source.png");
+
+        b.iter(|| ssimulacra2::downscale_by_2(black_box(&image)))
+    });
+    // c.finish();
+}
+
+fn bench_image_multiply(c: &mut Criterion) {
+    c.bench_function("image_multiply", |b| {
+        let image = create_linearrgb_from_image("test_data/tank_source.png");
+        let width = image.width();
+        let height = image.height();
+
+        let img1 = xyb_to_planar(&Xyb::from(image.clone()));
+        let img2 = img1.clone();
+
+        let mut out = [
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+        ];
+
+        b.iter(|| {
+            ssimulacra2::image_multiply(black_box(&img1), black_box(&img2), black_box(&mut out))
+        })
+    });
+}
+
+fn bench_ssim_map(c: &mut Criterion) {
+    let mut group = c.benchmark_group("ssim_map");
+    group.measurement_time(std::time::Duration::from_secs(9));
+
+    group.bench_function("standard", |b| {
+        let source_image = create_linearrgb_from_image("test_data/tank_source.png");
+        let distorted_image = create_linearrgb_from_image("test_data/tank_distorted.png");
+        let width = source_image.width();
+        let height = source_image.height();
+
+        let mut img1_xyb = Xyb::from(source_image);
+        let mut img2_xyb = Xyb::from(distorted_image);
+
+        for pix in img1_xyb.data_mut().iter_mut() {
+            pix[2] = (pix[2] - pix[1]) + 0.55;
+            pix[0] = (pix[0]).mul_add(14.0, 0.42);
+            pix[1] += 0.01;
+        }
+        for pix in img2_xyb.data_mut().iter_mut() {
+            pix[2] = (pix[2] - pix[1]) + 0.55;
+            pix[0] = (pix[0]).mul_add(14.0, 0.42);
+            pix[1] += 0.01;
+        }
+
+        let img1_planar = xyb_to_planar(&img1_xyb);
+        let img2_planar = xyb_to_planar(&img2_xyb);
+
+        let mut blur = Blur::new(width, height);
+
+        let mut mul = [
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+        ];
+        let mut sigma1_sq = [
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+        ];
+        let mut sigma2_sq = [
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+        ];
+        let mut sigma12 = [
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+        ];
+        let mut mu1 = [
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+        ];
+        let mut mu2 = [
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+            vec![0.0f32; width * height],
+        ];
+
+        ssimulacra2::image_multiply(&img1_planar, &img1_planar, &mut mul);
+        blur.blur(&mul, &mut sigma1_sq).expect("blur failed");
+
+        ssimulacra2::image_multiply(&img2_planar, &img2_planar, &mut mul);
+        blur.blur(&mul, &mut sigma2_sq).expect("blur failed");
+
+        ssimulacra2::image_multiply(&img1_planar, &img2_planar, &mut mul);
+        blur.blur(&mul, &mut sigma12).expect("blur failed");
+
+        blur.blur(&img1_planar, &mut mu1).expect("blur failed");
+        blur.blur(&img2_planar, &mut mu2).expect("blur failed");
+
+        b.iter(|| {
+            ssimulacra2::ssim_map(
+                black_box(width),
+                black_box(height),
+                black_box(&mu1),
+                black_box(&mu2),
+                black_box(&sigma1_sq),
+                black_box(&sigma2_sq),
+                black_box(&sigma12),
+            )
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_xyb_to_planar(c: &mut Criterion) {
+    c.bench_function("xyb_to_planar", |b| {
+        let image = create_linearrgb_from_image("test_data/tank_source.png");
+        let xyb = Xyb::from(image.clone());
+
+        b.iter(|| xyb_to_planar(black_box(&xyb)))
+    });
+}
+
+// criterion_group!(benches, bench_ssimulacra2, bench_blur);
+criterion_group!(
+    benches,
+    bench_ssimulacra2,
+    bench_blur,
+    bench_downscale_by_2,
+    bench_image_multiply,
+    bench_ssim_map,
+    bench_xyb_to_planar
+);
 criterion_main!(benches);
diff --git a/build.rs b/build.rs
index 3fd6e2b..b96b20f 100644
--- a/build.rs
+++ b/build.rs
@@ -121,13 +121,13 @@ fn init_recursive_gaussian(out_path: &str) -> io::Result<()> {
 
     write_const_usize(&mut out_file, "RADIUS", radius as usize)?;
 
-    write_const_f32(&mut out_file, "VERT_MUL_IN_1", n2[0] as f32)?;
-    write_const_f32(&mut out_file, "VERT_MUL_IN_3", n2[1] as f32)?;
-    write_const_f32(&mut out_file, "VERT_MUL_IN_5", n2[2] as f32)?;
+    // write_const_f32(&mut out_file, "VERT_MUL_IN_1", n2[0] as f32)?;
+    // write_const_f32(&mut out_file, "VERT_MUL_IN_3", n2[1] as f32)?;
+    // write_const_f32(&mut out_file, "VERT_MUL_IN_5", n2[2] as f32)?;
 
-    write_const_f32(&mut out_file, "VERT_MUL_PREV_1", d1[0] as f32)?;
-    write_const_f32(&mut out_file, "VERT_MUL_PREV_3", d1[1] as f32)?;
-    write_const_f32(&mut out_file, "VERT_MUL_PREV_5", d1[2] as f32)?;
+    // write_const_f32(&mut out_file, "VERT_MUL_PREV_1", d1[0] as f32)?;
+    // write_const_f32(&mut out_file, "VERT_MUL_PREV_3", d1[1] as f32)?;
+    // write_const_f32(&mut out_file, "VERT_MUL_PREV_5", d1[2] as f32)?;
 
     write_const_f32(&mut out_file, "MUL_IN_1", mul_in[0])?;
     write_const_f32(&mut out_file, "MUL_IN_3", mul_in[4])?;
diff --git a/src/blur/gaussian.rs b/src/blur/gaussian.rs
deleted file mode 100644
index bd99067..0000000
--- a/src/blur/gaussian.rs
+++ /dev/null
@@ -1,186 +0,0 @@
-mod consts {
-    #![allow(clippy::unreadable_literal)]
-    include!(concat!(env!("OUT_DIR"), "/recursive_gaussian.rs"));
-}
-
-/// Implements "Recursive Implementation of the Gaussian Filter Using Truncated
-/// Cosine Functions" by Charalampidis [2016].
-pub struct RecursiveGaussian;
-
-impl RecursiveGaussian {
-    #[cfg(feature = "rayon")]
-    pub fn horizontal_pass(&self, input: &[f32], output: &mut [f32], width: usize) {
-        use rayon::iter::{IndexedParallelIterator, ParallelIterator};
-        use rayon::prelude::ParallelSliceMut;
-        use rayon::slice::ParallelSlice;
-
-        assert_eq!(input.len(), output.len());
-
-        input
-            .par_chunks_exact(width)
-            .zip(output.par_chunks_exact_mut(width))
-            .for_each(|(input, output)| self.horizontal_row(input, output, width));
-    }
-
-    #[cfg(not(feature = "rayon"))]
-    pub fn horizontal_pass(&self, input: &[f32], output: &mut [f32], width: usize) {
-        assert_eq!(input.len(), output.len());
-
-        for (input, output) in input
-            .chunks_exact(width)
-            .zip(output.chunks_exact_mut(width))
-        {
-            self.horizontal_row(input, output, width);
-        }
-    }
-
-    fn horizontal_row(&self, input: &[f32], output: &mut [f32], width: usize) {
-        let big_n = consts::RADIUS as isize;
-        let mut prev_1 = 0f32;
-        let mut prev_3 = 0f32;
-        let mut prev_5 = 0f32;
-        let mut prev2_1 = 0f32;
-        let mut prev2_3 = 0f32;
-        let mut prev2_5 = 0f32;
-
-        let mut n = (-big_n) + 1;
-        while n < width as isize {
-            let left = n - big_n - 1;
-            let right = n + big_n - 1;
-            let left_val = if left >= 0 {
-                // SAFETY: `left` can never be bigger than `width`
-                unsafe { *input.get_unchecked(left as usize) }
-            } else {
-                0f32
-            };
-            let right_val = if right < width as isize {
-                // SAFETY: this branch ensures that `right` is not bigger than `width`
-                unsafe { *input.get_unchecked(right as usize) }
-            } else {
-                0f32
-            };
-            let sum = left_val + right_val;
-
-            let mut out_1 = sum * consts::MUL_IN_1;
-            let mut out_3 = sum * consts::MUL_IN_3;
-            let mut out_5 = sum * consts::MUL_IN_5;
-
-            out_1 = consts::MUL_PREV2_1.mul_add(prev2_1, out_1);
-            out_3 = consts::MUL_PREV2_3.mul_add(prev2_3, out_3);
-            out_5 = consts::MUL_PREV2_5.mul_add(prev2_5, out_5);
-            prev2_1 = prev_1;
-            prev2_3 = prev_3;
-            prev2_5 = prev_5;
-
-            out_1 = consts::MUL_PREV_1.mul_add(prev_1, out_1);
-            out_3 = consts::MUL_PREV_3.mul_add(prev_3, out_3);
-            out_5 = consts::MUL_PREV_5.mul_add(prev_5, out_5);
-            prev_1 = out_1;
-            prev_3 = out_3;
-            prev_5 = out_5;
-
-            if n >= 0 {
-                // SAFETY: We know that this chunk of output is of size `width`,
-                // which `n` cannot be larger than.
-                unsafe {
-                    *output.get_unchecked_mut(n as usize) = out_1 + out_3 + out_5;
-                }
-            }
-
-            n += 1;
-        }
-    }
-
-    pub fn vertical_pass_chunked<const J: usize, const K: usize>(
-        &self,
-        input: &[f32],
-        output: &mut [f32],
-        width: usize,
-        height: usize,
-    ) {
-        assert!(J > K);
-        assert!(K > 0);
-
-        assert_eq!(input.len(), output.len());
-
-        let mut x = 0;
-        while x + J <= width {
-            self.vertical_pass::<J>(&input[x..], &mut output[x..], width, height);
-            x += J;
-        }
-
-        while x + K <= width {
-            self.vertical_pass::<K>(&input[x..], &mut output[x..], width, height);
-            x += K;
-        }
-
-        while x < width {
-            self.vertical_pass::<1>(&input[x..], &mut output[x..], width, height);
-            x += 1;
-        }
-    }
-
-    // Apply 1D vertical scan on COLUMNS elements at a time
-    pub fn vertical_pass<const COLUMNS: usize>(
-        &self,
-        input: &[f32],
-        output: &mut [f32],
-        width: usize,
-        height: usize,
-    ) {
-        assert_eq!(input.len(), output.len());
-
-        let big_n = consts::RADIUS as isize;
-
-        let zeroes = vec![0f32; COLUMNS];
-        let mut prev = vec![0f32; 3 * COLUMNS];
-        let mut prev2 = vec![0f32; 3 * COLUMNS];
-        let mut out = vec![0f32; 3 * COLUMNS];
-
-        let mut n = (-big_n) + 1;
-        while n < height as isize {
-            let top = n - big_n - 1;
-            let bottom = n + big_n - 1;
-            let top_row = if top >= 0 {
-                &input[top as usize * width..][..COLUMNS]
-            } else {
-                &zeroes
-            };
-
-            let bottom_row = if bottom < height as isize {
-                &input[bottom as usize * width..][..COLUMNS]
-            } else {
-                &zeroes
-            };
-
-            for i in 0..COLUMNS {
-                let sum = top_row[i] + bottom_row[i];
-
-                let i1 = i;
-                let i3 = i1 + COLUMNS;
-                let i5 = i3 + COLUMNS;
-
-                let out1 = prev[i1].mul_add(consts::VERT_MUL_PREV_1, prev2[i1]);
-                let out3 = prev[i3].mul_add(consts::VERT_MUL_PREV_3, prev2[i3]);
-                let out5 = prev[i5].mul_add(consts::VERT_MUL_PREV_5, prev2[i5]);
-
-                let out1 = sum.mul_add(consts::VERT_MUL_IN_1, -out1);
-                let out3 = sum.mul_add(consts::VERT_MUL_IN_3, -out3);
-                let out5 = sum.mul_add(consts::VERT_MUL_IN_5, -out5);
-
-                out[i1] = out1;
-                out[i3] = out3;
-                out[i5] = out5;
-
-                if n >= 0 {
-                    output[n as usize * width + i] = out1 + out3 + out5;
-                }
-            }
-
-            prev2.copy_from_slice(&prev);
-            prev.copy_from_slice(&out);
-
-            n += 1;
-        }
-    }
-}
diff --git a/src/blur/gaussian_impl/gaussian.rs b/src/blur/gaussian_impl/gaussian.rs
new file mode 100644
index 0000000..812c03a
--- /dev/null
+++ b/src/blur/gaussian_impl/gaussian.rs
@@ -0,0 +1,150 @@
+mod consts {
+    #![allow(clippy::unreadable_literal)]
+    include!(concat!(env!("OUT_DIR"), "/recursive_gaussian.rs"));
+}
+
+pub struct RecursiveGaussian;
+
+impl RecursiveGaussian {
+    #[inline(always)]
+    pub fn horizontal_pass(&self, input: &[f32], output: &mut [f32], width: usize) {
+        assert_eq!(input.len(), output.len());
+
+        #[cfg(feature = "rayon")]
+        {
+            use rayon::prelude::*;
+            input
+                .par_chunks_exact(width)
+                .zip(output.par_chunks_exact_mut(width))
+                .for_each(|(input, output)| self.horizontal_row(input, output, width));
+        }
+
+        #[cfg(not(feature = "rayon"))]
+        {
+            input
+                .chunks_exact(width)
+                .zip(output.chunks_exact_mut(width))
+                .for_each(|(input, output)| self.horizontal_row(input, output, width));
+        }
+    }
+
+    #[inline(always)]
+    fn horizontal_row(&self, input: &[f32], output: &mut [f32], width: usize) {
+        let big_n = consts::RADIUS as isize;
+        let [mul_in_1, mul_in_3, mul_in_5] = [consts::MUL_IN_1, consts::MUL_IN_3, consts::MUL_IN_5];
+        let [mul_prev_1, mul_prev_3, mul_prev_5] =
+            [consts::MUL_PREV_1, consts::MUL_PREV_3, consts::MUL_PREV_5];
+        let [mul_prev2_1, mul_prev2_3, mul_prev2_5] = [
+            consts::MUL_PREV2_1,
+            consts::MUL_PREV2_3,
+            consts::MUL_PREV2_5,
+        ];
+
+        let mut prev = [0f32; 6]; // [prev_1, prev_3, prev_5, prev2_1, prev2_3, prev2_5]
+
+        let mut n = (-big_n) + 1;
+        while n < width as isize {
+            let left = n - big_n - 1;
+            let right = n + big_n - 1;
+
+            let left_val = if left >= 0 {
+                unsafe { *input.get_unchecked(left as usize) }
+            } else {
+                0f32
+            };
+            let right_val = if right < width as isize {
+                unsafe { *input.get_unchecked(right as usize) }
+            } else {
+                0f32
+            };
+            let sum = left_val + right_val;
+
+            let mut out = [sum * mul_in_1, sum * mul_in_3, sum * mul_in_5];
+
+            out[0] = mul_prev2_1.mul_add(prev[3], out[0]);
+            out[1] = mul_prev2_3.mul_add(prev[4], out[1]);
+            out[2] = mul_prev2_5.mul_add(prev[5], out[2]);
+
+            prev[3] = prev[0];
+            prev[4] = prev[1];
+            prev[5] = prev[2];
+
+            out[0] = mul_prev_1.mul_add(prev[0], out[0]);
+            out[1] = mul_prev_3.mul_add(prev[1], out[1]);
+            out[2] = mul_prev_5.mul_add(prev[2], out[2]);
+
+            prev[0] = out[0];
+            prev[1] = out[1];
+            prev[2] = out[2];
+
+            if n >= 0 {
+                unsafe {
+                    *output.get_unchecked_mut(n as usize) = out[0] + out[1] + out[2];
+                }
+            }
+
+            n += 1;
+        }
+    }
+
+    #[inline(always)]
+    fn transpose(&self, input: &[f32], output: &mut [f32], width: usize, height: usize) {
+        assert_eq!(input.len(), width * height);
+        assert_eq!(output.len(), width * height);
+
+        for y in 0..height {
+            for x in 0..width {
+                output[x * height + y] = input[y * width + x];
+            }
+        }
+    }
+
+    #[inline(always)]
+    pub fn vertical_pass(
+        &self,
+        input: &[f32],
+        output: &mut [f32],
+        width: usize,
+        height: usize,
+        transposed_input: &mut Vec<f32>,
+        transposed_output: &mut Vec<f32>,
+    ) {
+        let size = width * height;
+        assert_eq!(input.len(), size);
+        assert_eq!(output.len(), size);
+
+        // Transpose the input data to make it easier to process in horizontal rows
+        self.transpose(input, transposed_input, width, height);
+
+        #[cfg(feature = "rayon")]
+        {
+            use rayon::prelude::*;
+
+            //Use par_chunks_exact_mut to divide the output buffer and calculate the corresponding input chunk for each chunk
+            let chunk_size = height;
+            transposed_output
+                .par_chunks_exact_mut(chunk_size)
+                .enumerate()
+                .for_each(|(index, output_chunk)| {
+                    let start = index * chunk_size;
+                    let input_chunk = &transposed_input[start..start + chunk_size];
+                    self.horizontal_row(input_chunk, output_chunk, height);
+                });
+        }
+
+        #[cfg(not(feature = "rayon"))]
+        {
+            for y in 0..width {
+                let start = y * height;
+                self.horizontal_row(
+                    &transposed_input[start..start + height],
+                    &mut transposed_output[start..start + height],
+                    height,
+                );
+            }
+        }
+
+        // Transpose the output data back to the original format
+        self.transpose(&transposed_output, output, height, width);
+    }
+}
diff --git a/src/blur/gaussian_impl/mod.rs b/src/blur/gaussian_impl/mod.rs
new file mode 100644
index 0000000..df148c6
--- /dev/null
+++ b/src/blur/gaussian_impl/mod.rs
@@ -0,0 +1,65 @@
+mod gaussian;
+use super::{BlurOperator, Ssimulacra2Error};
+
+pub struct GaussianBlur {
+    width: usize,
+    height: usize,
+    temp_buffer: Option<Vec<f32>>,
+    transposed_input: Option<Vec<f32>>,
+    transposed_output: Option<Vec<f32>>,
+}
+
+impl BlurOperator for GaussianBlur {
+    fn new(width: usize, height: usize) -> Self {
+        let size = width * height;
+        GaussianBlur {
+            width,
+            height,
+            temp_buffer: Some(vec![0.0f32; size]),
+            transposed_input: Some(vec![0.0f32; size]),
+            transposed_output: Some(vec![0.0f32; size]),
+        }
+    }
+
+    fn shrink_to(&mut self, width: usize, height: usize) {
+        self.width = width;
+        self.height = height;
+
+        if let Some(buffer) = &mut self.temp_buffer {
+            buffer.resize(width * height, 0.0);
+        }
+
+        if let Some(buffer) = &mut self.transposed_input {
+            buffer.resize(width * height, 0.0);
+        }
+
+        if let Some(buffer) = &mut self.transposed_output {
+            buffer.resize(width * height, 0.0);
+        }
+    }
+
+    fn blur(
+        &mut self,
+        img: &[Vec<f32>; 3],
+        out: &mut [Vec<f32>; 3],
+    ) -> Result<(), Ssimulacra2Error> {
+        let mut temp = self.temp_buffer.take().unwrap();
+        let kernel = gaussian::RecursiveGaussian;
+
+        for i in 0..3 {
+            kernel.horizontal_pass(&img[i], &mut temp, self.width);
+            kernel.vertical_pass(
+                &temp,
+                &mut out[i],
+                self.width,
+                self.height,
+                self.transposed_input.as_mut().unwrap(),
+                self.transposed_output.as_mut().unwrap(),
+            );
+        }
+
+        self.temp_buffer = Some(temp);
+
+        Ok(())
+    }
+}
diff --git a/src/blur/libblur_impl/mod.rs b/src/blur/libblur_impl/mod.rs
new file mode 100644
index 0000000..5fa2169
--- /dev/null
+++ b/src/blur/libblur_impl/mod.rs
@@ -0,0 +1,84 @@
+use super::{BlurOperator, Ssimulacra2Error};
+use libblur::{BlurImage, BlurImageMut, EdgeMode, FastBlurChannels, ThreadingPolicy};
+use std::borrow::Cow;
+
+pub struct LibBlur {
+    width: usize,
+    height: usize,
+}
+
+impl BlurOperator for LibBlur {
+    fn new(width: usize, height: usize) -> Self {
+        LibBlur { width, height }
+    }
+
+    fn shrink_to(&mut self, width: usize, height: usize) {
+        self.width = width;
+        self.height = height;
+    }
+
+    fn blur(
+        &mut self,
+        img: &[Vec<f32>; 3],
+        out: &mut [Vec<f32>; 3],
+    ) -> Result<(), Ssimulacra2Error> {
+        self.blur_plane(&img[0], &mut out[0])?;
+        self.blur_plane(&img[1], &mut out[1])?;
+        self.blur_plane(&img[2], &mut out[2])?;
+        Ok(())
+    }
+}
+
+impl LibBlur {
+    fn blur_plane(&mut self, plane: &[f32], out: &mut [f32]) -> Result<(), Ssimulacra2Error> {
+        //Set kernel size and sigma value - adjust as needed but not recommended to change
+        const KERNEL_SIZE: u32 = 11;
+        const SIGMA: f32 = 2.2943; // diff 0.000 / 0.932
+
+        // BlurImage creation
+        let src_image = BlurImage {
+            data: Cow::Borrowed(plane),
+            width: self.width as u32,
+            height: self.height as u32,
+            stride: self.width as u32, // stride == width
+            channels: FastBlurChannels::Plane,
+        };
+
+        // BlurImageMut creation
+        let mut dst_image = BlurImageMut::borrow(
+            out,
+            self.width as u32,
+            self.height as u32,
+            FastBlurChannels::Plane,
+        );
+
+        // gaussian_blur_f32 call
+        #[cfg(feature = "rayon")]
+        if let Err(e) = libblur::gaussian_blur_f32(
+            &src_image,
+            &mut dst_image,
+            KERNEL_SIZE,
+            SIGMA,
+            EdgeMode::Clamp,
+            ThreadingPolicy::Adaptive, // use rayon
+        ) {
+            eprintln!("Error in gaussian_blur_f32: {:?}", e);
+            return Err(Ssimulacra2Error::GaussianBlurError);
+        }
+
+        #[cfg(not(feature = "rayon"))]
+        if let Err(e) = libblur::gaussian_blur_f32(
+            &src_image,
+            &mut dst_image,
+            KERNEL_SIZE,
+            SIGMA,
+            EdgeMode::Clamp,
+            ThreadingPolicy::Single, // no rayon
+        ) {
+            eprintln!("Error in gaussian_blur_f32: {:?}", e);
+            return Err(Ssimulacra2Error::GaussianBlurError);
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/blur/mod.rs b/src/blur/mod.rs
index cadca76..f68b0e4 100644
--- a/src/blur/mod.rs
+++ b/src/blur/mod.rs
@@ -1,60 +1,32 @@
-mod gaussian;
+#[cfg(not(feature = "libblur"))]
+mod gaussian_impl;
+#[cfg(feature = "libblur")]
+mod libblur_impl;
+use crate::Ssimulacra2Error;
 
-use gaussian::RecursiveGaussian;
-
-/// Structure handling image blur.
+/// Trait handling image blur.
 ///
-/// This struct contains the necessary buffers and the kernel used for blurring
+/// This trait contains the necessary buffers and the kernel used for blurring
 /// (currently a recursive approximation of the Gaussian filter).
 ///
 /// Note that the width and height of the image passed to [blur][Self::blur] needs to exactly
 /// match the width and height of this instance. If you reduce the image size (e.g. via
 /// downscaling), [`shrink_to`][Self::shrink_to] can be used to resize the internal buffers.
-pub struct Blur {
-    kernel: RecursiveGaussian,
-    temp: Vec<f32>,
-    width: usize,
-    height: usize,
-}
-
-impl Blur {
+pub trait BlurOperator {
     /// Create a new [Blur] for images of the given width and height.
-    /// This pre-allocates the necessary buffers.
-    #[must_use]
-    pub fn new(width: usize, height: usize) -> Self {
-        Blur {
-            kernel: RecursiveGaussian,
-            temp: vec![0.0f32; width * height],
-            width,
-            height,
-        }
-    }
-
+    fn new(width: usize, height: usize) -> Self;
     /// Truncates the internal buffers to fit images of the given width and height.
-    ///
-    /// This will [truncate][Vec::truncate] the internal buffers
-    /// without affecting the allocated memory.
-    pub fn shrink_to(&mut self, width: usize, height: usize) {
-        self.temp.truncate(width * height);
-        self.width = width;
-        self.height = height;
-    }
+    fn shrink_to(&mut self, width: usize, height: usize);
+    /// Blur the given image using libblur's gaussian_blur_f32.
+    fn blur(
+        &mut self,
+        img: &[Vec<f32>; 3],
+        out: &mut [Vec<f32>; 3],
+    ) -> Result<(), Ssimulacra2Error>;
+}
 
-    /// Blur the given image.
-    pub fn blur(&mut self, img: &[Vec<f32>; 3]) -> [Vec<f32>; 3] {
-        [
-            self.blur_plane(&img[0]),
-            self.blur_plane(&img[1]),
-            self.blur_plane(&img[2]),
-        ]
-    }
+#[cfg(not(feature = "libblur"))]
+pub use gaussian_impl::GaussianBlur as Blur;
 
-    fn blur_plane(&mut self, plane: &[f32]) -> Vec<f32> {
-        let mut out = vec![0f32; self.width * self.height];
-        self.kernel
-            .horizontal_pass(plane, &mut self.temp, self.width);
-        self.kernel
-            .vertical_pass_chunked::<128, 32>(&self.temp, &mut out, self.width, self.height);
-        out
-    }
-}
+#[cfg(feature = "libblur")]
+pub use libblur_impl::LibBlur as Blur;
diff --git a/src/lib.rs b/src/lib.rs
index 62b4026..a0f0df9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,6 @@
 mod blur;
 
-pub use blur::Blur;
+pub use blur::{Blur, BlurOperator};
 pub use yuvxyb::{CastFromPrimitive, Frame, LinearRgb, Pixel, Plane, Rgb, Xyb, Yuv};
 pub use yuvxyb::{ColorPrimaries, MatrixCoefficients, TransferCharacteristic, YuvConfig};
 
@@ -25,6 +25,12 @@ pub enum Ssimulacra2Error {
     /// This is not currently supported by the SSIMULACRA2 metric.
     #[error("Images must be at least 8x8 pixels")]
     InvalidImageSize,
+
+    #[error("Error in gaussian_blur_f32")]
+    GaussianBlurError,
+
+    #[error("Error in fast_image_resize")]
+    ResizeError,
 }
 
 /// Computes the SSIMULACRA2 score for a given input frame and the distorted
@@ -34,6 +40,7 @@ pub enum Ssimulacra2Error {
 /// - If the source and distorted image width and height do not match
 /// - If the source or distorted image cannot be converted to XYB successfully
 /// - If the image is smaller than 8x8 pixels
+#[inline(always)]
 pub fn compute_frame_ssimulacra2<T, U>(source: T, distorted: U) -> Result<f64, Ssimulacra2Error>
 where
     LinearRgb: TryFrom<T> + TryFrom<U>,
@@ -57,11 +64,39 @@ where
     let mut width = img1.width();
     let mut height = img1.height();
 
+    let initial_size = width * height;
     let mut mul = [
-        vec![0.0f32; width * height],
-        vec![0.0f32; width * height],
-        vec![0.0f32; width * height],
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+    ];
+
+    let mut sigma1_sq = [
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+    ];
+    let mut sigma2_sq = [
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+    ];
+    let mut sigma12 = [
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+    ];
+    let mut mu1 = [
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
     ];
+    let mut mu2 = [
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+        vec![0.0f32; initial_size],
+    ];
+
     let mut blur = Blur::new(width, height);
     let mut msssim = Msssim::default();
 
@@ -75,37 +110,55 @@ where
             img2 = downscale_by_2(&img2);
             width = img1.width();
             height = img2.height();
+
+            for c in &mut mul {
+                c.truncate(width * height);
+            }
+            for c in &mut sigma1_sq {
+                c.truncate(width * height);
+            }
+            for c in &mut sigma2_sq {
+                c.truncate(width * height);
+            }
+            for c in &mut sigma12 {
+                c.truncate(width * height);
+            }
+            for c in &mut mu1 {
+                c.truncate(width * height);
+            }
+            for c in &mut mu2 {
+                c.truncate(width * height);
+            }
+
+            blur.shrink_to(width, height);
         }
-        for c in &mut mul {
-            c.truncate(width * height);
-        }
-        blur.shrink_to(width, height);
 
-        let mut img1 = Xyb::from(img1.clone());
-        let mut img2 = Xyb::from(img2.clone());
+        // "Convert to XYB format"
+        let mut img1_xyb = Xyb::from(img1.clone());
+        let mut img2_xyb = Xyb::from(img2.clone());
 
-        make_positive_xyb(&mut img1);
-        make_positive_xyb(&mut img2);
+        make_positive_xyb(&mut img1_xyb);
+        make_positive_xyb(&mut img2_xyb);
 
-        // SSIMULACRA2 works with the data in a planar format,
-        // so we need to convert to that.
-        let img1 = xyb_to_planar(&img1);
-        let img2 = xyb_to_planar(&img2);
+        // "Convert to planar format"
+        let img1_planar = xyb_to_planar(&img1_xyb);
+        let img2_planar = xyb_to_planar(&img2_xyb);
 
-        image_multiply(&img1, &img1, &mut mul);
-        let sigma1_sq = blur.blur(&mul);
+        image_multiply(&img1_planar, &img1_planar, &mut mul);
+        blur.blur(&mul, &mut sigma1_sq)?;
 
-        image_multiply(&img2, &img2, &mut mul);
-        let sigma2_sq = blur.blur(&mul);
+        image_multiply(&img2_planar, &img2_planar, &mut mul);
+        blur.blur(&mul, &mut sigma2_sq)?;
 
-        image_multiply(&img1, &img2, &mut mul);
-        let sigma12 = blur.blur(&mul);
+        image_multiply(&img1_planar, &img2_planar, &mut mul);
+        blur.blur(&mul, &mut sigma12)?;
 
-        let mu1 = blur.blur(&img1);
-        let mu2 = blur.blur(&img2);
+        blur.blur(&img1_planar, &mut mu1)?;
+        blur.blur(&img2_planar, &mut mu2)?;
 
         let avg_ssim = ssim_map(width, height, &mu1, &mu2, &sigma1_sq, &sigma2_sq, &sigma12);
-        let avg_edgediff = edge_diff_map(width, height, &img1, &mu1, &img2, &mu2);
+        let avg_edgediff = edge_diff_map(width, height, &img1_planar, &mu1, &img2_planar, &mu2);
+
         msssim.scales.push(MsssimScale {
             avg_ssim,
             avg_edgediff,
@@ -126,6 +179,7 @@ where
 //  B: 0.272295..0.938012
 // The maximum pixel-wise difference has to be <= 1 for the ssim formula to make
 // sense.
+#[inline(always)]
 fn make_positive_xyb(xyb: &mut Xyb) {
     for pix in xyb.data_mut().iter_mut() {
         pix[2] = (pix[2] - pix[1]) + 0.55;
@@ -134,7 +188,8 @@ fn make_positive_xyb(xyb: &mut Xyb) {
     }
 }
 
-fn xyb_to_planar(xyb: &Xyb) -> [Vec<f32>; 3] {
+#[inline(always)]
+pub fn xyb_to_planar(xyb: &Xyb) -> [Vec<f32>; 3] {
     let mut out1 = vec![0.0f32; xyb.width() * xyb.height()];
     let mut out2 = vec![0.0f32; xyb.width() * xyb.height()];
     let mut out3 = vec![0.0f32; xyb.width() * xyb.height()];
@@ -154,7 +209,9 @@ fn xyb_to_planar(xyb: &Xyb) -> [Vec<f32>; 3] {
     [out1, out2, out3]
 }
 
-fn image_multiply(img1: &[Vec<f32>; 3], img2: &[Vec<f32>; 3], out: &mut [Vec<f32>; 3]) {
+#[inline(always)]
+// 2.3625ms
+pub fn image_multiply(img1: &[Vec<f32>; 3], img2: &[Vec<f32>; 3], out: &mut [Vec<f32>; 3]) {
     for ((plane1, plane2), out_plane) in img1.iter().zip(img2.iter()).zip(out.iter_mut()) {
         for ((&p1, &p2), o) in plane1.iter().zip(plane2.iter()).zip(out_plane.iter_mut()) {
             *o = p1 * p2;
@@ -162,7 +219,9 @@ fn image_multiply(img1: &[Vec<f32>; 3], img2: &[Vec<f32>; 3], out: &mut [Vec<f32
     }
 }
 
-fn downscale_by_2(in_data: &LinearRgb) -> LinearRgb {
+#[inline(always)]
+/// 1.7866ms
+pub fn downscale_by_2(in_data: &LinearRgb) -> LinearRgb {
     const SCALE: usize = 2;
     let in_w = in_data.width();
     let in_h = in_data.height();
@@ -170,23 +229,69 @@ fn downscale_by_2(in_data: &LinearRgb) -> LinearRgb {
     let out_h = (in_h + SCALE - 1) / SCALE;
     let mut out_data = vec![[0.0f32; 3]; out_w * out_h];
     let normalize = 1f32 / (SCALE * SCALE) as f32;
+    let in_data = in_data.data();
+
+    // "Process the inner area without boundaries first"
+    let safe_h = in_h - (in_h % SCALE);
+    let safe_w = in_w - (in_w % SCALE);
+
+    for oy in 0..(safe_h / SCALE) {
+        let y_base = oy * SCALE;
+        for ox in 0..(safe_w / SCALE) {
+            let x_base = ox * SCALE;
+            let out_idx = oy * out_w + ox;
+
+            // "Unroll the loop to access all pixels directly"
+            let p00 = in_data[y_base * in_w + x_base];
+            let p01 = in_data[y_base * in_w + (x_base + 1)];
+            let p10 = in_data[(y_base + 1) * in_w + x_base];
+            let p11 = in_data[(y_base + 1) * in_w + (x_base + 1)];
+
+            out_data[out_idx][0] = (p00[0] + p01[0] + p10[0] + p11[0]) * normalize;
+            out_data[out_idx][1] = (p00[1] + p01[1] + p10[1] + p11[1]) * normalize;
+            out_data[out_idx][2] = (p00[2] + p01[2] + p10[2] + p11[2]) * normalize;
+        }
+    }
 
-    let in_data = &in_data.data();
-    for oy in 0..out_h {
-        for ox in 0..out_w {
-            for c in 0..3 {
-                let mut sum = 0f32;
-                for iy in 0..SCALE {
-                    for ix in 0..SCALE {
-                        let x = (ox * SCALE + ix).min(in_w - 1);
-                        let y = (oy * SCALE + iy).min(in_h - 1);
-                        let in_pix = in_data[y * in_w + x];
-
-                        sum += in_pix[c];
+    if safe_w < in_w || safe_h < in_h {
+        for oy in 0..out_h {
+            let y_start = oy * SCALE;
+            for ox in 0..out_w {
+                if oy < safe_h / SCALE && ox < safe_w / SCALE {
+                    continue;
+                }
+
+                let x_start = ox * SCALE;
+                let out_idx = oy * out_w + ox;
+                let mut r = 0.0;
+                let mut g = 0.0;
+                let mut b = 0.0;
+                let mut count = 0;
+
+                for dy in 0..SCALE {
+                    let y = y_start + dy;
+                    if y >= in_h {
+                        continue;
+                    }
+
+                    for dx in 0..SCALE {
+                        let x = x_start + dx;
+                        if x >= in_w {
+                            continue;
+                        }
+
+                        let pixel = in_data[y * in_w + x];
+                        r += pixel[0];
+                        g += pixel[1];
+                        b += pixel[2];
+                        count += 1;
                     }
                 }
-                let out_pix = &mut out_data[oy * out_w + ox];
-                out_pix[c] = sum * normalize;
+
+                let scale = if count > 0 { 1.0 / count as f32 } else { 0.0 };
+                out_data[out_idx][0] = r * scale;
+                out_data[out_idx][1] = g * scale;
+                out_data[out_idx][2] = b * scale;
             }
         }
     }
@@ -194,7 +299,9 @@ fn downscale_by_2(in_data: &LinearRgb) -> LinearRgb {
     LinearRgb::new(out_data, out_w, out_h).expect("Resolution and data size match")
 }
 
-fn ssim_map(
+#[inline(always)]
+// 4.5360ms
+pub fn ssim_map(
     width: usize,
     height: usize,
     m1: &[Vec<f32>; 3],
@@ -203,56 +310,93 @@ fn ssim_map(
     s22: &[Vec<f32>; 3],
     s12: &[Vec<f32>; 3],
 ) -> [f64; 3 * 2] {
-    const C2: f32 = 0.0009f32;
-
     let one_per_pixels = 1.0f64 / (width * height) as f64;
     let mut plane_averages = [0f64; 3 * 2];
 
-    for c in 0..3 {
-        let mut sum1 = [0.0f64; 2];
-        for (row_m1, (row_m2, (row_s11, (row_s22, row_s12)))) in m1[c].chunks_exact(width).zip(
-            m2[c].chunks_exact(width).zip(
-                s11[c]
-                    .chunks_exact(width)
-                    .zip(s22[c].chunks_exact(width).zip(s12[c].chunks_exact(width))),
-            ),
-        ) {
-            for x in 0..width {
-                let mu1 = row_m1[x];
-                let mu2 = row_m2[x];
-                let mu11 = mu1 * mu1;
-                let mu22 = mu2 * mu2;
-                let mu12 = mu1 * mu2;
-                let mu_diff = mu1 - mu2;
-
-                // Correction applied compared to the original SSIM formula, which has:
-                //   luma_err = 2 * mu1 * mu2 / (mu1^2 + mu2^2)
-                //            = 1 - (mu1 - mu2)^2 / (mu1^2 + mu2^2)
-                // The denominator causes error in the darks (low mu1 and mu2) to weigh
-                // more than error in the brights (high mu1 and mu2). This would make
-                // sense if values correspond to linear luma. However, the actual values
-                // are either gamma-compressed luma (which supposedly is already
-                // perceptually uniform) or chroma (where weighing green more than red
-                // or blue more than yellow does not make any sense at all). So it is
-                // better to simply drop this denominator.
-                let num_m = mu_diff.mul_add(-mu_diff, 1.0f32);
-                let num_s = 2f32.mul_add(row_s12[x] - mu12, C2);
-                let denom_s = (row_s11[x] - mu11) + (row_s22[x] - mu22) + C2;
-                // Use 1 - SSIM' so it becomes an error score instead of a quality
-                // index. This makes it make sense to compute an L_4 norm.
-                let mut d = 1.0f64 - f64::from((num_m * num_s) / denom_s);
-                d = d.max(0.0);
-                sum1[0] += d;
-                sum1[1] += d.powi(4);
-            }
+    #[cfg(feature = "rayon")]
+    {
+        use rayon::prelude::*;
+        let results: Vec<(usize, [f64; 2])> = (0..3)
+            .into_par_iter()
+            .map(|c| {
+                let sums = compute_channel_sums(width, c, m1, m2, s11, s22, s12);
+                (c, sums)
+            })
+            .collect();
+
+        for (c, sums) in results {
+            plane_averages[c * 2] = one_per_pixels * sums[0];
+            plane_averages[c * 2 + 1] = (one_per_pixels * sums[1]).sqrt().sqrt();
+        }
+    }
+    #[cfg(not(feature = "rayon"))]
+    {
+        for c in 0..3 {
+            let sums = compute_channel_sums(width, c, m1, m2, s11, s22, s12);
+            plane_averages[c * 2] = one_per_pixels * sums[0];
+            plane_averages[c * 2 + 1] = (one_per_pixels * sums[1]).sqrt().sqrt();
         }
-        plane_averages[c * 2] = one_per_pixels * sum1[0];
-        plane_averages[c * 2 + 1] = (one_per_pixels * sum1[1]).sqrt().sqrt();
     }
 
     plane_averages
 }
 
+#[inline(always)]
+fn compute_channel_sums(
+    width: usize,
+    c: usize,
+    m1: &[Vec<f32>; 3],
+    m2: &[Vec<f32>; 3],
+    s11: &[Vec<f32>; 3],
+    s22: &[Vec<f32>; 3],
+    s12: &[Vec<f32>; 3],
+) -> [f64; 2] {
+    const C2: f32 = 0.0009f32;
+    let mut sums = [0.0f64; 2];
+
+    for (row_idx, (row_m1, row_m2)) in m1[c]
+        .chunks_exact(width)
+        .zip(m2[c].chunks_exact(width))
+        .enumerate()
+    {
+        let row_offset = row_idx * width;
+        for x in 0..width {
+            let mu1 = row_m1[x];
+            let mu2 = row_m2[x];
+            let mu_diff = mu1 - mu2;
+
+            // Correction applied compared to the original SSIM formula, which has:
+            //   luma_err = 2 * mu1 * mu2 / (mu1^2 + mu2^2)
+            //            = 1 - (mu1 - mu2)^2 / (mu1^2 + mu2^2)
+            // The denominator causes error in the darks (low mu1 and mu2) to weigh
+            // more than error in the brights (high mu1 and mu2). This would make
+            // sense if values correspond to linear luma. However, the actual values
+            // are either gamma-compressed luma (which supposedly is already
+            // perceptually uniform) or chroma (where weighing green more than red
+            // or blue more than yellow does not make any sense at all). So it is
+            // better to simply drop this denominator.
+            let num_m = mu_diff.mul_add(-mu_diff, 1.0);
+            let mu12 = mu1 * mu2;
+            let sigma12 = s12[c][row_offset + x] - mu12;
+            let num_s = 2.0f32.mul_add(sigma12, C2);
+            let sigma1_sq = s11[c][row_offset + x] - mu1 * mu1;
+            let sigma2_sq = s22[c][row_offset + x] - mu2 * mu2;
+            let denom_s = sigma1_sq + sigma2_sq + C2;
+
+            let ssim = (num_m * num_s) / denom_s;
+            let err = (1.0 - f64::from(ssim)).max(0.0);
+
+            sums[0] += err;
+            let err_sq = err * err;
+            sums[1] += err_sq * err_sq;
+        }
+    }
+
+    sums
+}
+
+#[inline(always)]
+// 12.492ms ssimulacra2 bench
 fn edge_diff_map(
     width: usize,
     height: usize,
@@ -265,34 +409,46 @@ fn edge_diff_map(
     let mut plane_averages = [0f64; 3 * 4];
 
     for c in 0..3 {
-        let mut sum1 = [0.0f64; 4];
+        // cache
+        let mut artifact_sum = 0.0f64;
+        let mut artifact_pow4_sum = 0.0f64;
+        let mut detail_lost_sum = 0.0f64;
+        let mut detail_lost_pow4_sum = 0.0f64;
+
         for (row1, (row2, (rowm1, rowm2))) in img1[c].chunks_exact(width).zip(
             img2[c]
                 .chunks_exact(width)
                 .zip(mu1[c].chunks_exact(width).zip(mu2[c].chunks_exact(width))),
         ) {
             for x in 0..width {
-                let d1: f64 = (1.0 + f64::from((row2[x] - rowm2[x]).abs()))
-                    / (1.0 + f64::from((row1[x] - rowm1[x]).abs()))
-                    - 1.0;
+                let diff1 = f64::from((row1[x] - rowm1[x]).abs());
+                let diff2 = f64::from((row2[x] - rowm2[x]).abs());
+
+                let d1 = (1.0 + diff2) / (1.0 + diff1) - 1.0;
 
                 // d1 > 0: distorted has an edge where original is smooth
                 //         (indicating ringing, color banding, blockiness, etc)
-                let artifact = d1.max(0.0);
-                sum1[0] += artifact;
-                sum1[1] += artifact.powi(4);
+                let artifact = 0.5 * (d1 + d1.abs()); // if d1 is positive, d1; if negative, 0
+                artifact_sum += artifact;
+
+                let artifact_squared = artifact * artifact;
+                artifact_pow4_sum += artifact_squared * artifact_squared;
 
                 // d1 < 0: original has an edge where distorted is smooth
                 //         (indicating smoothing, blurring, smearing, etc)
-                let detail_lost = (-d1).max(0.0);
-                sum1[2] += detail_lost;
-                sum1[3] += detail_lost.powi(4);
+                let detail_lost = 0.5 * (-d1 + d1.abs()); // if d1 is negative, -d1; if positive, 0
+                detail_lost_sum += detail_lost;
+                let detail_lost_squared = detail_lost * detail_lost;
+                detail_lost_pow4_sum += detail_lost_squared * detail_lost_squared;
             }
         }
-        plane_averages[c * 4] = one_per_pixels * sum1[0];
-        plane_averages[c * 4 + 1] = (one_per_pixels * sum1[1]).sqrt().sqrt();
-        plane_averages[c * 4 + 2] = one_per_pixels * sum1[2];
-        plane_averages[c * 4 + 3] = (one_per_pixels * sum1[3]).sqrt().sqrt();
+
+        // Calculate the averages for the current channel
+        let base_idx = c * 4;
+        plane_averages[base_idx] = one_per_pixels * artifact_sum;
+        plane_averages[base_idx + 1] = (one_per_pixels * artifact_pow4_sum).sqrt().sqrt();
+        plane_averages[base_idx + 2] = one_per_pixels * detail_lost_sum;
+        plane_averages[base_idx + 3] = (one_per_pixels * detail_lost_pow4_sum).sqrt().sqrt();
     }
 
     plane_averages
@@ -341,6 +497,7 @@ impl Msssim {
     //    KADID-10k: 0.6175 | 0.8133 | 0.8030
     //    KonFiG(F): 0.7668 | 0.9194 | 0.9136
     #[allow(clippy::too_many_lines)]
+    #[inline(always)]
     pub fn score(&self) -> f64 {
         const WEIGHT: [f64; 108] = [
             0.0,
@@ -489,7 +646,7 @@ impl Msssim {
 
 #[cfg(test)]
 mod tests {
-    use std::path::PathBuf;
+    use std::{path::PathBuf, time::Instant};
 
     use super::*;
     use yuvxyb::Rgb;
@@ -540,8 +697,15 @@ mod tests {
             .unwrap(),
         )
         .unwrap();
+        let start = Instant::now();
         let result = compute_frame_ssimulacra2(source_data, distorted_data).unwrap();
+        let elapsed = start.elapsed();
+        println!("Elapsed time: {:?}", elapsed);
         let expected = 17.398_505_f64;
+
+        println!("Result: {result:.6}");
+        println!("Expected: {expected:.6}");
+        println!("Diff: {:.3}", (result - expected).abs());
         assert!(
             // SOMETHING is WEIRD with Github CI where it gives different results across DIFFERENT
             // RUNS
@@ -549,4 +713,67 @@ mod tests {
             "Result {result:.6} not equal to expected {expected:.6}",
         );
     }
+
+    #[test]
+    fn test2_ssimulacra2() {
+        let source = image::open(
+            PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+                .join("test_data")
+                .join("test_image_4.png"),
+        )
+        .unwrap();
+        let distorted = image::open(
+            PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+                .join("test_data")
+                .join("test_image_4.jpg"),
+        )
+        .unwrap();
+
+        let source_data = source
+            .to_rgb32f()
+            .chunks_exact(3)
+            .map(|chunk| [chunk[0], chunk[1], chunk[2]])
+            .collect::<Vec<_>>();
+        let source_data = Xyb::try_from(
+            Rgb::new(
+                source_data,
+                source.width() as usize,
+                source.height() as usize,
+                TransferCharacteristic::SRGB,
+                ColorPrimaries::BT709,
+            )
+            .unwrap(),
+        )
+        .unwrap();
+        let distorted_data = distorted
+            .to_rgb32f()
+            .chunks_exact(3)
+            .map(|chunk| [chunk[0], chunk[1], chunk[2]])
+            .collect::<Vec<_>>();
+        let distorted_data = Xyb::try_from(
+            Rgb::new(
+                distorted_data,
+                distorted.width() as usize,
+                distorted.height() as usize,
+                TransferCharacteristic::SRGB,
+                ColorPrimaries::BT709,
+            )
+            .unwrap(),
+        )
+        .unwrap();
+        let start = Instant::now();
+        let result = compute_frame_ssimulacra2(source_data, distorted_data).unwrap();
+        let elapsed = start.elapsed();
+        println!("Elapsed time: {:?}", elapsed);
+        let expected = 84.031_931_f64;
+        println!("Result: {result:.6}");
+        println!("Expected: {expected:.6}");
+        println!("Diff: {:.3}", (result - expected).abs());
+        assert!(
+            // SOMETHING is WEIRD with Github CI where it gives different results across DIFFERENT
+            // RUNS
+            (result - expected).abs() < 1.0f64,
+            "Result {result:.6} not equal to expected {expected:.6}",
+        );
+    }
 }
diff --git a/test_data/test_image_4.jpg b/test_data/test_image_4.jpg
new file mode 100644
index 0000000..a9c3e14
Binary files /dev/null and b/test_data/test_image_4.jpg differ
diff --git a/test_data/test_image_4.png b/test_data/test_image_4.png
new file mode 100644
index 0000000..3bb0764
Binary files /dev/null and b/test_data/test_image_4.png differ