In the previous post we used Go benchmarks to measure the performance difference in total-age computation between an array of structs and a struct of arrays. As expected, parallel arrays — being cache- and prefetch-friendly — delivered a significant performance boost depending on the size of the metadata competing with the age field. The one thing we couldn't check is the impact of vectorization, since the Go compiler still hasn't embraced SIMD instructions.
So to verify our hypothesis, this time we'll use Rust, since its compiler uses LLVM under the hood. The following constants control the number of users and the metadata size:
// Number of users in every benchmark fixture.
const USER_COUNT: usize = 10000;
// Bytes of per-user metadata; varied (0, 4, 32, 64) across benchmark runs.
const METADATA_SIZE: usize = 32;
The metadata is just an array of METADATA_SIZE bytes:
type Metadata = [u8; METADATA_SIZE];
Array of structs is implemented as follows:
/// A single user record: the array-of-structs layout interleaves
/// `metadata` with `age`, so scanning ages also drags metadata
/// through the cache.
#[derive(Clone, Copy)]
struct User {
    metadata: Metadata,
    age: u8,
}

impl User {
    /// Creates a user with the given age and zeroed metadata.
    fn with_age(age: u8) -> Self {
        User {
            metadata: [0; METADATA_SIZE],
            // Field-init shorthand; `age: age` is redundant (clippy: redundant_field_names).
            age,
        }
    }
}
struct Users([User; USER_COUNT]);
and parallel array:
// Struct-of-arrays (parallel arrays) layout: all ages are contiguous,
// so summing them touches no metadata bytes at all.
struct ParallelUsers {
    metadata: [Metadata; USER_COUNT],
    ages: [u8; USER_COUNT],
}
For benchmarking we'll still be using code that computes the total age of all users:
impl Users {
    /// Sums the ages of all users, widened to `u64` so the total
    /// cannot overflow.
    fn total_age(&self) -> u64 {
        self.0
            .iter()
            .fold(0u64, |sum, user| sum + u64::from(user.age))
    }
}
impl ParallelUsers {
    /// Sums every entry of the contiguous `ages` array, widened to
    /// `u64` so the total cannot overflow.
    fn total_age(&self) -> u64 {
        self.ages
            .iter()
            .fold(0u64, |sum, &age| sum + u64::from(age))
    }
}
When the metadata size is set to 0, both implementations produce efficient vectorized code:
playground::Users::total_age: # @playground::Users::total_age
# %bb.0:
pxor %xmm0, %xmm0
movl $6, %eax
pxor %xmm2, %xmm2
pxor %xmm1, %xmm1
.LBB1_1: # =>This Inner Loop Header: Depth=1
movzwl -6(%rdi,%rax), %ecx
movd %ecx, %xmm4
movzwl -4(%rdi,%rax), %ecx
movd %ecx, %xmm3
punpcklbw %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
punpcklwd %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
punpckldq %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
paddq %xmm2, %xmm4
punpcklbw %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
punpcklwd %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
punpckldq %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
paddq %xmm1, %xmm3
movzwl -2(%rdi,%rax), %ecx
movd %ecx, %xmm2
movzwl (%rdi,%rax), %ecx
movd %ecx, %xmm1
punpcklbw %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
punpcklwd %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
punpckldq %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
paddq %xmm4, %xmm2
punpcklbw %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
punpcklwd %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
punpckldq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
paddq %xmm3, %xmm1
addq $8, %rax
cmpq $10006, %rax # imm = 0x2716
jne .LBB1_1
# %bb.2:
paddq %xmm2, %xmm1
pshufd $238, %xmm1, %xmm0 # xmm0 = xmm1[2,3,2,3]
paddq %xmm1, %xmm0
movq %xmm0, %rax
retq
...
playground::ParallelUsers::total_age: # @playground::ParallelUsers::total_age
# %bb.0:
pxor %xmm0, %xmm0
movl $6, %eax
pxor %xmm2, %xmm2
pxor %xmm1, %xmm1
.LBB2_1: # =>This Inner Loop Header: Depth=1
movzwl -6(%rdi,%rax), %ecx
movd %ecx, %xmm4
movzwl -4(%rdi,%rax), %ecx
movd %ecx, %xmm3
punpcklbw %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
punpcklwd %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
punpckldq %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
paddq %xmm2, %xmm4
punpcklbw %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
punpcklwd %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
punpckldq %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
paddq %xmm1, %xmm3
movzwl -2(%rdi,%rax), %ecx
movd %ecx, %xmm2
movzwl (%rdi,%rax), %ecx
movd %ecx, %xmm1
punpcklbw %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
punpcklwd %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
punpckldq %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
paddq %xmm4, %xmm2
punpcklbw %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
punpcklwd %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
punpckldq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
paddq %xmm3, %xmm1
addq $8, %rax
cmpq $10006, %rax # imm = 0x2716
jne .LBB2_1
# %bb.2:
paddq %xmm2, %xmm1
pshufd $238, %xmm1, %xmm0 # xmm0 = xmm1[2,3,2,3]
paddq %xmm1, %xmm0
movq %xmm0, %rax
retq
Now it's time to use benchmarks to see how they perform:
#[cfg(test)]
mod tests {
    use super::*;
    use rand::Rng;
    use test::Bencher;

    /// Benchmarks total-age computation over the array-of-structs layout.
    #[bench]
    fn bench_total_user_age(b: &mut Bencher) {
        // Arrays of non-Default types need a seed value; overwrite every
        // slot with a random age so the sum isn't a compile-time constant.
        let mut user_arr = [User::with_age(0); USER_COUNT];
        for user in user_arr.iter_mut() {
            *user = User::with_age(rand::random());
        }
        let users = Users(user_arr);
        b.iter(|| users.total_age());
    }

    /// Benchmarks total-age computation over the parallel-arrays layout.
    #[bench]
    fn bench_total_parallel_user_age(b: &mut Bencher) {
        let mut ages = [0; USER_COUNT];
        rand::thread_rng().fill(&mut ages[..]);
        let users = ParallelUsers {
            metadata: [[0; METADATA_SIZE]; USER_COUNT],
            // Field-init shorthand; `ages: ages` is redundant.
            ages,
        };
        b.iter(|| users.total_age());
    }
}
As expected, with no metadata the performance results are very close:
test tests::bench_total_parallel_user_age ... bench: 2,805 ns/iter (+/- 277)
test tests::bench_total_user_age ... bench: 2,491 ns/iter (+/- 435)
As soon as we increase the metadata size (to 4, for example), vectorization is no longer possible for the array of structs:
playground::Users::total_age: # @playground::Users::total_age
# %bb.0:
movl $49, %ecx
xorl %eax, %eax
.LBB1_1: # =>This Inner Loop Header: Depth=1
movzbl -45(%rdi,%rcx), %edx
addq %rax, %rdx
movzbl -40(%rdi,%rcx), %eax
addq %rdx, %rax
movzbl -35(%rdi,%rcx), %edx
addq %rax, %rdx
movzbl -30(%rdi,%rcx), %eax
addq %rdx, %rax
movzbl -25(%rdi,%rcx), %edx
addq %rax, %rdx
movzbl -20(%rdi,%rcx), %eax
addq %rdx, %rax
movzbl -15(%rdi,%rcx), %edx
addq %rax, %rdx
movzbl -10(%rdi,%rcx), %eax
addq %rdx, %rax
movzbl -5(%rdi,%rcx), %edx
addq %rax, %rdx
movzbl (%rdi,%rcx), %eax
addq %rdx, %rax
addq $50, %rcx
cmpq $50049, %rcx # imm = 0xC381
jne .LBB1_1
# %bb.2:
retq
but the performance results are still fairly close:
test tests::bench_total_parallel_user_age ... bench: 2,505 ns/iter (+/- 577)
test tests::bench_total_user_age ... bench: 2,563 ns/iter (+/- 467)
As soon as the cache waste reaches 32 bytes (half of the cache line), performance drops by ~2X:
test tests::bench_total_parallel_user_age ... bench: 2,752 ns/iter (+/- 509)
test tests::bench_total_user_age ... bench: 4,693 ns/iter (+/- 739)
and by ~5X — similar to the Go results — once the metadata size reaches 64:
test tests::bench_total_parallel_user_age ... bench: 2,720 ns/iter (+/- 532)
test tests::bench_total_user_age ... bench: 10,257 ns/iter (+/- 2,194)
So what are the conclusions?
- even though the parallel-array structure enabled vectorization, vectorization itself did not have a significant performance impact
- memory bandwidth and cache efficiency played the critical role in performance