perf: improvements to scenecut detection speed

shssoichiro · shssoichiro · commit d555b7efcde7 · 2025-09-18T23:09:12.000-04:00
- Actually enable SIMD for scenecut detection. Oops.
- Always use the VapoursynthDecoder instead of piping through vspipe for
  Vapoursynth-based inputs, to reduce memory bandwidth overhead.
- Use the default cachepath behavior (place the cache file next to the
  video). This avoids needing to re-cache if av1an's input hash changes,
  and for some reason is just faster.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/av1an-core/Cargo.toml b/av1an-core/Cargo.toml
@@ -22,6 +22,7 @@ av-decoders = { version = "0.4.0", features = ["vapoursynth"] }
 av-format = "0.7.0"
 av-ivf = "0.5.0"
 av-scenechange = { version = "0.17.3", default-features = false, features = [
+    "asm",
     "vapoursynth",
 ] }
 av1-grain = { version = "0.2.4", default-features = false, features = [
@@ -160,7 +161,6 @@ ref_option_ref = "warn"
 # Disabled due to https://github.com/rust-lang/rust-clippy/issues/14697
 # self_named_module_files = "warn"
 semicolon_if_nothing_returned = "warn"
-string_to_string = "warn"
 tests_outside_test_module = "warn"
 transmute_ptr_to_ptr = "warn"
 unused_peekable = "warn"
diff --git a/av1an-core/src/context.rs b/av1an-core/src/context.rs
@@ -922,9 +922,14 @@ impl Av1anContext {
             passes: overrides.as_ref().map_or(self.args.passes, |ovr| ovr.passes),
             encoder: overrides.as_ref().map_or(self.args.encoder, |ovr| ovr.encoder),
             noise_size: self.args.photon_noise_size,
-            target_quality: overrides.as_ref().map_or(self.args.target_quality.clone(), |ovr| {
-                ovr.target_quality.clone().map_or(self.args.target_quality.clone(), |tq| tq)
-            }),
+            target_quality: overrides.as_ref().map_or_else(
+                || self.args.target_quality.clone(),
+                |ovr| {
+                    ovr.target_quality
+                        .clone()
+                        .map_or_else(|| self.args.target_quality.clone(), |tq| tq)
+                },
+            ),
             tq_cq: None,
             ignore_frame_mismatch: self.args.ignore_frame_mismatch,
         };
@@ -1038,8 +1043,8 @@ impl Av1anContext {
             noise_size: scene.zone_overrides.as_ref().map_or(self.args.photon_noise_size, |ovr| {
                 (ovr.photon_noise_width, ovr.photon_noise_height)
             }),
-            target_quality: scene.zone_overrides.as_ref().map_or(
-                self.args.target_quality.clone(),
+            target_quality: scene.zone_overrides.as_ref().map_or_else(
+                || self.args.target_quality.clone(),
                 |ovr| {
                     ovr.target_quality.clone().unwrap_or_else(|| self.args.target_quality.clone())
                 },
@@ -1275,9 +1280,14 @@ impl Av1anContext {
             passes: overrides.as_ref().map_or(self.args.passes, |ovr| ovr.passes),
             encoder: overrides.as_ref().map_or(self.args.encoder, |ovr| ovr.encoder),
             noise_size: self.args.photon_noise_size,
-            target_quality: overrides.as_ref().map_or(self.args.target_quality.clone(), |ovr| {
-                ovr.target_quality.clone().map_or(self.args.target_quality.clone(), |tq| tq)
-            }),
+            target_quality: overrides.as_ref().map_or_else(
+                || self.args.target_quality.clone(),
+                |ovr| {
+                    ovr.target_quality
+                        .clone()
+                        .map_or_else(|| self.args.target_quality.clone(), |tq| tq)
+                },
+            ),
             tq_cq: None,
             ignore_frame_mismatch: self.args.ignore_frame_mismatch,
         };
diff --git a/av1an-core/src/loadscript.vpy b/av1an-core/src/loadscript.vpy
@@ -1,19 +1,9 @@
 import os
-import vapoursynth
 from vapoursynth import core
 
-# Set cache size to 1GB
-core.max_cache_size = 1024
-
 source = os.environ.get("AV1AN_SOURCE", None)
 chunk_method = os.environ.get("AV1AN_CHUNK_METHOD", None)
-cache_file = os.environ.get("AV1AN_CACHE_FILE", None)
-
-# Scene Detection
-perform_scene_detection = os.environ.get("AV1AN_PERFORM_SCENE_DETECTION", None)
-downscale_height = os.environ.get("AV1AN_DOWNSCALE_HEIGHT", None)
-pixel_format = os.environ.get("AV1AN_PIXEL_FORMAT", None)
-scaler = os.environ.get("AV1AN_SCALER", None)
+perform_scene_detection = globals().get("AV1AN_PERFORM_SCENE_DETECTION", None)
 
 # Default valid chunk methods
 VALID_CHUNK_METHODS: list[str] = ["lsmash", "ffms2", "dgdecnv", "bestsource"]
@@ -33,61 +23,22 @@ if not os.path.exists(source):
 # Import video
 match (chunk_method):  # type: ignore
     case "lsmash":
-        video = core.lsmas.LWLibavSource(source, cachefile=cache_file)
+        video = core.lsmas.LWLibavSource(source)
     case "ffms2":
-        video = core.ffms2.Source(source, cachefile=cache_file)
+        video = core.ffms2.Source(source)
     case "dgdecnv":
         video = core.dgdecodenv.DGSource(source)
     case "bestsource":
-        # Different versions of BestSource have different behaviors on Windows
-        # Versions R1 and older support absolute paths just as Av1an originally expected (with .json extension)
-        # Versions R8 and newer support absolute paths for cache files, but require setting cachemode to 4
-        # Versions since ~R2 attempt to create a path stemming from CWD but using the path of the source and also appends the track index and a .bsindex extension, which is unexpected for Av1an
-        # Unfortunately, BestSource is not keeping the reported version number updated properly so we cannot reliably determine if it supports absolute paths or not
-        # At best, we can wrap an attempt in a try/except block as previous versions of BestSource should throw an exception if an invalid cachemode value is provided
-        try:
-            video = core.bs.VideoSource(source, cachepath=cache_file, cachemode=4)
-        except Exception:
-            # Installed BestSource version does not support absolute paths, fallback to default behavior
-            video = core.bs.VideoSource(source, cachepath=cache_file)
-
-# Scene Detection
-# Bicubic is based on FFmpeg defaults. See https://ffmpeg.org/ffmpeg-scaler.html#toc-Scaler-Options
-scaler_function = core.resize.Bicubic
-# Map scaler parameter to equivalent vapoursynth scaler. See https://www.vapoursynth.com/doc/functions/video/resize.html#resize
-if scaler is not None:
-    match (scaler.lower()):
-        case "fast_bilinear" | "bilinear": scaler_function = core.resize.Bilinear
-        case "neighbor": scaler_function = core.resize.Point
-        case _: scaler_function = core.resize.Bicubic
-
-# Map pixel format to equivalent vapoursynth pixel format (Needs expansion)
-if pixel_format is not None:
-    match (pixel_format.lower()):
-        case "yuv420p": pixel_format = vapoursynth.YUV420P8
-        case "yuv420p10le": pixel_format = vapoursynth.YUV420P10
-        case _: pixel_format = None
+        # By default, bestsource stores its index files in a user-specific directory.
+        # For consistency, this cachepath setting makes it store the index file next to
+        # the video, as all the other source filters do.
+        video = core.bs.VideoSource(source, cachepath="/")
 
-# Apply Scene Detection Processing
-if perform_scene_detection is not None:
-    if downscale_height is not None or pixel_format is not None:
-        # Ensure downscale_height is not greater than video height
-        if downscale_height is not None:
-            try:
-                downscale_height = int(downscale_height)
-            finally:
-                downscale_height = min(downscale_height, video.height)
-        # Maintain aspect ratio and ensure width is divisible by 2
-        video = scaler_function(
-            video,
-            width=int(((video.width / video.height) * int(downscale_height)) // 2 * 2) if downscale_height is not None else video.width,
-            height=int(downscale_height or video.height),
-            format=pixel_format,
-        )
-else:
-    # Limit to one thread when encoding
+if perform_scene_detection is None:
+    # Limit decoder resources when encoding since we will have multiple workers running
     core.num_threads = 1
+    core.max_cache_size = 1024
 
 
 # Output video
-video.set_output()
+video.set_output()
diff --git a/av1an-core/src/scene_detect.rs b/av1an-core/src/scene_detect.rs
@@ -241,16 +241,15 @@ fn build_decoder(
         clip_info.format_info.as_bit_depth()?
     };
 
-    let decoder = if input.is_vapoursynth() {
+    let decoder = if input.is_vapoursynth() || input.is_vapoursynth_script() {
         // VapoursynthDecoder is the only reliable method for downscaling user-provided
-        // scripts
+        // scripts, and for our generated scripts, it is faster than piping.
 
         // Must use from_file in order to set the CWD to the
         // directory of the user-provided VapourSynth script
-        let mut vs_decoder = VapoursynthDecoder::from_file(
-            input.as_vapoursynth_path(),
-            input.as_vspipe_args_hashmap()?,
-        )?;
+        let mut args_map = input.as_vspipe_args_hashmap()?;
+        args_map.insert("AV1AN_PERFORM_SCENE_DETECTION".into(), "1".into());
+        let mut vs_decoder = VapoursynthDecoder::from_file(input.as_script_path(), args_map)?;
 
         if sc_downscale_height.is_some() || sc_pix_format.is_some() {
             let downscale_height = sc_downscale_height.map(|dh| dh as u32);
@@ -286,38 +285,6 @@ fn build_decoder(
         }
 
         Decoder::from_decoder_impl(DecoderImpl::Vapoursynth(vs_decoder))?
-    } else if input.is_vapoursynth_script() {
-        // User provides a video input but is using a Vapoursynth-based chunk method.
-        // This may be slower than using ffmpeg but by using the same source filter,
-        // we ensure consistency in decoding.
-        let mut command = Command::new("vspipe");
-
-        if let Some(downscale_height) = sc_downscale_height {
-            command.env("AV1AN_DOWNSCALE_HEIGHT", downscale_height.to_string());
-        }
-        if let Some(pixel_format) = sc_pix_format {
-            command.env("AV1AN_PIXEL_FORMAT", format!("{pixel_format:?}"));
-        }
-
-        command
-            .arg("-c")
-            .arg("y4m")
-            .arg(input.as_script_path())
-            .arg("-")
-            .env("AV1AN_PERFORM_SCENE_DETECTION", "true")
-            .env("AV1AN_SCALER", sc_scaler)
-            .stdin(Stdio::null())
-            .stdout(Stdio::piped())
-            .stderr(Stdio::null());
-        // Append vspipe python arguments to the environment if there are any
-        for arg in input.as_vspipe_args_vec()? {
-            command.args(["-a", &arg]);
-        }
-
-        let y4m_decoder = Y4mDecoder::new(Box::new(
-            command.spawn()?.stdout.expect("vspipe should have stdout"),
-        ) as Box<dyn Read>)?;
-        Decoder::from_decoder_impl(DecoderImpl::Y4m(y4m_decoder))?
     } else {
         // FFmpeg is faster if the user provides video input
         let path = input.as_path();
diff --git a/av1an-core/src/vapoursynth.rs b/av1an-core/src/vapoursynth.rs
@@ -829,10 +829,6 @@ pub fn generate_loadscript_text(
         .replace(
             "chunk_method = os.environ.get(\"AV1AN_CHUNK_METHOD\", None)",
             &format!("chunk_method = {chunk_method_lower:?}"),
-        )
-        .replace(
-            "cache_file = os.environ.get(\"AV1AN_CACHE_FILE\", None)",
-            &format!("cache_file = {cache_file:?}"),
         );
 
     if let Some(scene_detection_downscale_height) = scene_detection_downscale_height {
diff --git a/av1an/Cargo.toml b/av1an/Cargo.toml
@@ -117,7 +117,6 @@ ref_option_ref = "warn"
 # Disabled due to https://github.com/rust-lang/rust-clippy/issues/14697
 # self_named_module_files = "warn"
 semicolon_if_nothing_returned = "warn"
-string_to_string = "warn"
 tests_outside_test_module = "warn"
 transmute_ptr_to_ptr = "warn"
 unused_peekable = "warn"