Skip to content

Extra memcpy when compiling with rustc_codegen_gcc #56

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
jrmuizel opened this issue Aug 4, 2021 · 8 comments
Open

Extra memcpy when compiling with rustc_codegen_gcc #56

jrmuizel opened this issue Aug 4, 2021 · 8 comments
Labels

Comments

@jrmuizel
Copy link

jrmuizel commented Aug 4, 2021

When compiling the following there's an extra memcpy that LLVM is able to optimize away:

use std::mem;

struct SV {
    disc: usize,
    data: [usize; 40],
    capacity: usize,
}

impl SV {
    fn new() -> SV {
        SV { data: unsafe { mem::uninitialized() },
            disc: 0,
            capacity: 0 }
    }
}

pub struct L {
    a: SV,
    b: SV
}

pub struct Allocation<T> {
    f: *mut T,
}

impl<T> Allocation<T> {
    pub fn init(self, value: T) {
        use std::ptr;
        unsafe {
        ptr::write(self.f, value);
        }
    }
}

#[inline(never)]
pub fn foo(a: Allocation<L>) {
    a.init(L {
        a: SV::new(),
        b: SV::new()
    });
}

See: https://rust.godbolt.org/z/1d843PPnx

This test case is derived from rust-lang/rust#58082 which links to https://bugs.llvm.org/show_bug.cgi?id=40574 which suggests that GCC should be able to optimize something like this.

@antoyo
Copy link
Contributor

antoyo commented Aug 4, 2021

It might be because the sysroot is not being compiled with optimizations yet.

This commit in that PR was meant to allow compiling the sysroot with optimizations, but is breaking a hack. I have yet to look into fixing this.

@antoyo
Copy link
Contributor

antoyo commented Aug 4, 2021

That does not seem to be the problem.

Here's the GIMPLE for future reference:

void _ZN9test_rust3foo17h9f0abfb59652170fE (signed long * param0)
{
  struct struct undefined;
  signed long dummyValueThatShouldNeverBeUsed;
  struct struct undefined;
  signed long dummyValueThatShouldNeverBeUsed;
  struct struct undefined;
  signed long dummyValueThatShouldNeverBeUsed;
  struct struct undefined;
  signed long * stack_var_4;
  struct  stack_var_3;
  struct  stack_var_2;
  struct  stack_var_1;
  struct struct undefined;
  struct struct undefined;

  try
    {
      start:
      stack_var_4.3_1 = &stack_var_4;
      MEM[(signed long * *)stack_var_4.3_1] = param0;
      _ZN9test_rust2SV3new17h38b8a107f2f9bb1aE (&stack_var_2);
      dummyValueThatShouldNeverBeUsed = 0;
      goto bb1;
      bb1:
      _ZN9test_rust2SV3new17h38b8a107f2f9bb1aE (&stack_var_3);
      dummyValueThatShouldNeverBeUsed = 0;
      goto bb2;
      bb2:
      memcpy (&stack_var_1, &stack_var_2, 336);
      memcpy (&stack_var_1.field_3, &stack_var_3, 336);
      _ZN9test_rust19Allocation_LT_T_GT_4init17h88034cc4566a9510E (param0, &stack_var_1);
      dummyValueThatShouldNeverBeUsed = 0;
      goto bb3;
      bb3:
      return;
    }
  finally
    {
      undefined = {CLOBBER};
      undefined = {CLOBBER};
      undefined = {CLOBBER};
      undefined = {CLOBBER};
      stack_var_4 = {CLOBBER};
      stack_var_3 = {CLOBBER};
      stack_var_2 = {CLOBBER};
      stack_var_1 = {CLOBBER};
      undefined = {CLOBBER};
      undefined = {CLOBBER};
    }
}

while with g++ and your C++ example, it gives:

void __GIMPLE (struct Allocation a, double g)
{
  struct L s;

  try
    {
      # DEBUG BEGIN_STMT
      s.a = SV::make (); [return slot optimization]
      s.b = SV::make (); [return slot optimization]
      # DEBUG BEGIN_STMT
      Allocation<L>::init (&a, s);
    }
  finally
    {
      s = _Literal (struct L) {CLOBBER};
    }
}

@antoyo antoyo added the perf label Oct 31, 2021
@bjorn3
Copy link
Member

bjorn3 commented Jan 25, 2022

I think this is caused by bad handling of mem::uninitialized(). I reduced it to

#![allow(deprecated)]

#[no_mangle]
unsafe extern "C" fn new_uninit() -> [usize; 40] {
    std::mem::uninitialized()
}

https://rust.godbolt.org/z/z6T68Yx7c

For some reason the assembly contains two globals full of garbage:

global_1_example_67fcd5eb_cgu_0:
        .quad   global_0_example_67fcd5eb_cgu_0
        .byte   15
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   5
        .byte   0
        .byte   0
        .byte   0
        .byte   5
        .byte   0
        .byte   0
        .byte   0
global_0_example_67fcd5eb_cgu_0:
        .byte   47
        .byte   97
        .byte   112
        .byte   112
        .byte   47
        .byte   101
        .byte   120
        .byte   97
        .byte   109
        .byte   112
        .byte   108
        .byte   101
        .byte   46
        .byte   114
        .byte   115

Using MaybeUninit::uninit() however results in just a ret instruction:

#[no_mangle]
unsafe extern "C" fn new_uninit() -> std::mem::MaybeUninit<[usize; 40]> {
    std::mem::MaybeUninit::uninit()
}

https://rust.godbolt.org/z/3os7jG3ro

new_uninit:
        ret

@bjorn3
Copy link
Member

bjorn3 commented Jan 25, 2022

Right, mem::uninitialized() is marked with #[track_caller], so those globals are the caller location. Still doesn't explain the unoptimized body of mem::uninitialized() though.

@antoyo
Copy link
Contributor

antoyo commented Jan 25, 2022

Maybe it's just because inlining is not implemented yet?

@bjorn3
Copy link
Member

bjorn3 commented Jan 25, 2022

Even without inlining the mem::uninitialized call mem::uninitialized itself should result in a ret instruction and nothing more.

@antoyo
Copy link
Contributor

antoyo commented Nov 23, 2023

The example in the original post now seems good locally (with a sysroot compiled in release mode, which is not the case on Compiler Explorer, which is currently broken, so it doesn't have the latest version).

The mem::uninitialized() is still not optimized, though. Here's the GIMPLE for future reference:

__attribute__((visibility ("default")))
void new_uninit (size_t[40] * param0)
{
  void * undefined;
  long int * stack_var_2;
  long int * stack_var_1;
  struct  stack_var_2;
  struct  stack_var_1;

  try
    {
      start:
      stack_var_1.0_1 = &stack_var_1;
      MEM[(struct  * *)stack_var_1.0_1] = &stack_var_1;
      stack_var_2.1_2 = &stack_var_2;
      MEM[(long int * *)stack_var_2.1_2] = &stack_var_1;
      memset (&stack_var_1, 1, 320);
      memcpy (&stack_var_2, &stack_var_1, 320);
      memcpy (param0, &stack_var_2, 320);
      return;
    }
  finally
    {
      stack_var_2 = {CLOBBER(eol)};
      stack_var_1 = {CLOBBER(eol)};
      stack_var_2 = {CLOBBER(eol)};
      stack_var_1 = {CLOBBER(eol)};
    }
}

@antoyo
Copy link
Contributor

antoyo commented Nov 23, 2023

Or is it actually optimized? There are no call intructions or jumps in the cg_gcc version while cg_llvm calls memset.
So, it seems the GCC version is unrolled.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

3 participants