-
-
Notifications
You must be signed in to change notification settings - Fork 983
core: Optimize CPU bitmap blending and copy operations #23006
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
jarca0123
wants to merge
1
commit into
ruffle-rs:master
Choose a base branch
from
jarca0123:optimize-cpu-bitmap-blending
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+139
−45
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1284,6 +1284,66 @@ | |
| } | ||
| } | ||
|
|
||
| /// Blend a single pixel (src-over, premultiplied alpha, two-lane u32 trick). | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a difference between this method and |
||
| #[inline(always)] | ||
| fn blend_pixel(d: &mut u32, s: u32) { | ||
| // Color layout is [r, g, b, a] in memory. | ||
| // On little-endian: u32 bits = 0xAABBGGRR, so alpha is bits 24..31. | ||
| // On big-endian: u32 bits = 0xRRGGBBAA, so alpha is bits 0..7. | ||
| #[cfg(target_endian = "little")] | ||
| let inv_a = 255 - (s >> 24); | ||
| #[cfg(target_endian = "big")] | ||
| let inv_a = 255 - (s & 0xFF); | ||
|
|
||
| let d_val = *d; | ||
|
|
||
| // Split into two lanes: bytes 0,2 and bytes 1,3. | ||
| let d02 = d_val & 0x00FF_00FF; | ||
| let d13 = (d_val >> 8) & 0x00FF_00FF; | ||
|
|
||
| // Multiply each lane by inv_alpha. | ||
| let m02 = d02 * inv_a; | ||
| let m13 = d13 * inv_a; | ||
|
|
||
| // div255 on both sub-lanes: (x + 1 + (x >> 8)) >> 8 | ||
| let m02 = ((m02 + 0x0001_0001 + ((m02 >> 8) & 0x00FF_00FF)) >> 8) & 0x00FF_00FF; | ||
| let m13 = ((m13 + 0x0001_0001 + ((m13 >> 8) & 0x00FF_00FF)) >> 8) & 0x00FF_00FF; | ||
|
|
||
| // Add src channels (same lane trick avoids cross-byte carries). | ||
| let s02 = s & 0x00FF_00FF; | ||
| let s13 = (s >> 8) & 0x00FF_00FF; | ||
| let o02 = (m02 + s02) & 0x00FF_00FF; | ||
| let o13 = ((m13 + s13) & 0x00FF_00FF) << 8; | ||
|
|
||
| *d = o02 | o13; | ||
| } | ||
|
|
||
| /// Blend `src` over `dst` in-place (premultiplied alpha, src-over). | ||
| fn blend_row(dst: &mut [Color], src: &[Color]) { | ||
| debug_assert_eq!(dst.len(), src.len()); | ||
| let dst_u32: &mut [u32] = bytemuck::cast_slice_mut(dst); | ||
| let src_u32: &[u32] = bytemuck::cast_slice(src); | ||
|
|
||
| let (dst_chunks, dst_remainder) = dst_u32.split_at_mut(dst_u32.len() & !3); | ||
| let (src_chunks, src_remainder) = src_u32.split_at(src_u32.len() & !3); | ||
|
|
||
| // Process 4 pixels per iteration (unrolled for autovectorization). | ||
| for (d4, s4) in dst_chunks | ||
| .chunks_exact_mut(4) | ||
| .zip(src_chunks.chunks_exact(4)) | ||
| { | ||
| blend_pixel(&mut d4[0], s4[0]); | ||
| blend_pixel(&mut d4[1], s4[1]); | ||
| blend_pixel(&mut d4[2], s4[2]); | ||
| blend_pixel(&mut d4[3], s4[3]); | ||
| } | ||
|
|
||
| // Handle remainder (0-3 pixels). | ||
| for (d, &s) in dst_remainder.iter_mut().zip(src_remainder.iter()) { | ||
| blend_pixel(d, s); | ||
| } | ||
| } | ||
|
|
||
| fn copy_on_cpu<'gc>( | ||
| context: &Mutation<'gc>, | ||
| renderer: &mut dyn RenderBackend, | ||
|
|
@@ -1302,20 +1362,36 @@ | |
| return; | ||
| } | ||
|
|
||
| let width = dest_region.width() as usize; | ||
| let height = dest_region.height() as usize; | ||
|
|
||
| if source.ptr_eq(dest) { | ||
| let dest = dest.sync(renderer); | ||
| let mut write = dest.borrow_mut(context); | ||
|
|
||
| for y in 0..dest_region.height() { | ||
| for x in 0..dest_region.width() { | ||
| let mut color = | ||
| write.get_pixel32_raw(source_region.x_min + x, source_region.y_min + y); | ||
| if blend { | ||
| color = write | ||
| .get_pixel32_raw(dest_region.x_min + x, dest_region.y_min + y) | ||
| .blend_over(&color); | ||
| } | ||
| write.set_pixel32_raw(dest_region.x_min + x, dest_region.y_min + y, color); | ||
| let stride = write.width() as usize; | ||
|
|
||
| if blend { | ||
| // Self-blend: need a temporary copy of source rows to avoid aliasing | ||
| let mut row_buf = vec![Color::default(); width]; | ||
| for y in 0..height as u32 { | ||
| let src_start = | ||
| (source_region.x_min + (source_region.y_min + y) * stride as u32) as usize; | ||
| row_buf.copy_from_slice(&write.raw_pixels()[src_start..src_start + width]); | ||
|
|
||
| let dst_start = | ||
| (dest_region.x_min + (dest_region.y_min + y) * stride as u32) as usize; | ||
| let dst_row = &mut write.raw_pixels_mut()[dst_start..dst_start + width]; | ||
| blend_row(dst_row, &row_buf); | ||
| } | ||
| } else { | ||
| // Self-copy without blending: use copy_within per row | ||
| let pixels = write.raw_pixels_mut(); | ||
| for y in 0..height as u32 { | ||
| let src_start = | ||
| (source_region.x_min + (source_region.y_min + y) * stride as u32) as usize; | ||
| let dst_start = | ||
| (dest_region.x_min + (dest_region.y_min + y) * stride as u32) as usize; | ||
| pixels.copy_within(src_start..src_start + width, dst_start); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -1325,53 +1401,63 @@ | |
| let mut dest_write = dest.borrow_mut(context); | ||
| let source_read = source.read_area(source_region, renderer); | ||
|
|
||
| let src_stride = source_read.width() as usize; | ||
| let dst_stride = dest_write.width() as usize; | ||
|
|
||
| if !blend && (dest_write.transparency() || !source_read.transparency()) { | ||
| // Copying (not blending) anything to a transparent texture, | ||
| // or copying an opaque texture to an opaque texture, | ||
| // means we can skip alpha premultiplication | ||
| // Straight copy (no blending, no alpha fixup needed) | ||
|
|
||
| if dest_region == source_region | ||
| && dest_region.width() == dest_write.width() | ||
| && dest_region.height() == dest_write.height() | ||
| && dest_region.width() == source_read.width() | ||
| && dest_region.height() == source_read.height() | ||
| { | ||
| // Copying an entire texture that's the same size and type? Just replace the whole thing | ||
| // Full-image copy | ||
| dest_write | ||
| .raw_pixels_mut() | ||
| .copy_from_slice(source_read.raw_pixels()); | ||
| } else { | ||
| for y in 0..dest_region.height() { | ||
| for x in 0..dest_region.width() { | ||
| let color = source_read | ||
| .get_pixel32_raw(source_region.x_min + x, source_region.y_min + y); | ||
| dest_write.set_pixel32_raw( | ||
| dest_region.x_min + x, | ||
| dest_region.y_min + y, | ||
| color, | ||
| ); | ||
| } | ||
| // Row-by-row memcpy | ||
| let src_pixels = source_read.raw_pixels(); | ||
| let dst_pixels = dest_write.raw_pixels_mut(); | ||
| for y in 0..height as u32 { | ||
| let src_start = (source_region.x_min | ||
| + (source_region.y_min + y) * src_stride as u32) | ||
| as usize; | ||
| let dst_start = | ||
| (dest_region.x_min + (dest_region.y_min + y) * dst_stride as u32) as usize; | ||
| dst_pixels[dst_start..dst_start + width] | ||
| .copy_from_slice(&src_pixels[src_start..src_start + width]); | ||
| } | ||
| } | ||
| } else { | ||
| // Copying (not blending) a transparent texture to an opaque texture, | ||
| // or blending anything to anything | ||
|
|
||
| // Blending or transparent-to-opaque copy | ||
| let opaque = !dest_write.transparency(); | ||
| let src_pixels = source_read.raw_pixels(); | ||
| let dst_pixels = dest_write.raw_pixels_mut(); | ||
|
|
||
| for y in 0..dest_region.height() { | ||
| for x in 0..dest_region.width() { | ||
| let mut color = source_read | ||
| .get_pixel32_raw(source_region.x_min + x, source_region.y_min + y); | ||
| if blend { | ||
| color = dest_write | ||
| .get_pixel32_raw(dest_region.x_min + x, dest_region.y_min + y) | ||
| .blend_over(&color); | ||
| } | ||
| for y in 0..height as u32 { | ||
| let src_start = | ||
| (source_region.x_min + (source_region.y_min + y) * src_stride as u32) as usize; | ||
| let dst_start = | ||
| (dest_region.x_min + (dest_region.y_min + y) * dst_stride as u32) as usize; | ||
| let src_row = &src_pixels[src_start..src_start + width]; | ||
| let dst_row = &mut dst_pixels[dst_start..dst_start + width]; | ||
|
|
||
| if blend { | ||
| blend_row(dst_row, src_row); | ||
| if opaque { | ||
| color = color.with_alpha(255); | ||
| for d in dst_row.iter_mut() { | ||
| *d = d.with_alpha(255); | ||
| } | ||
| } | ||
| } else { | ||
| // transparent-to-opaque: copy and force alpha to 255 | ||
| dst_row.copy_from_slice(src_row); | ||
| for d in dst_row.iter_mut() { | ||
| *d = d.with_alpha(255); | ||
| } | ||
| dest_write.set_pixel32_raw(dest_region.x_min + x, dest_region.y_min + y, color); | ||
| } | ||
| } | ||
| } | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you think this is better? It produces more complicated assembly:
I would imagine the compiler knows how to optimize a division by 255, why is it an issue? Which architecture are you optimizing for?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am optimizing for no specific architecture in mind, I want to take every target into account. However, this truly is an oversight that I am reverting.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you really want to extract a function, you can do it in such a way that will prevent us from casting integers in every line, e.g.