In the previous post we used Go benchmarks to measure the performance difference in total-age computation between an array of structs and a struct of arrays. As expected, parallel arrays — being cache- and prefetch-friendly — delivered a significant performance boost depending on the size of the metadata competing with the age field. The one thing we couldn't check is the impact of vectorization, since the Go compiler still hasn't embraced SIMD instructions.
So to verify our hypothesis, this time we'll use Rust, since its compiler uses LLVM under the hood. The following constants control the number of users and the metadata size:
// Number of users in every benchmark fixture.
const USER_COUNT: usize = 10000;
// Bytes of per-user metadata; varied (0, 4, 32, 64) across benchmark runs.
const METADATA_SIZE: usize = 32;
The metadata is just an array of METADATA_SIZE bytes:
type Metadata = [u8; METADATA_SIZE];
Array of structs is implemented as follows:
/// A single user record: the array-of-structs layout interleaves
/// `metadata` with `age`, so scanning ages also drags metadata
/// through the cache.
#[derive(Clone, Copy)]
struct User {
    metadata: Metadata,
    age: u8,
}

impl User {
    /// Creates a user with the given age and zeroed metadata.
    fn with_age(age: u8) -> Self {
        User {
            metadata: [0; METADATA_SIZE],
            // Field-init shorthand; `age: age` is redundant (clippy: redundant_field_names).
            age,
        }
    }
}
struct Users([User; USER_COUNT]);
and parallel array:
// Struct-of-arrays (parallel arrays) layout: all ages are contiguous,
// so summing them touches no metadata bytes at all.
struct ParallelUsers {
    metadata: [Metadata; USER_COUNT],
    ages: [u8; USER_COUNT],
}
For benchmarking we'll still be using code that computes the total age of all users:
impl Users {
    /// Sums the ages of all users, widened to `u64` so the total
    /// cannot overflow.
    fn total_age(&self) -> u64 {
        self.0
            .iter()
            .fold(0u64, |sum, user| sum + u64::from(user.age))
    }
}
impl ParallelUsers {
    /// Sums every entry of the contiguous `ages` array, widened to
    /// `u64` so the total cannot overflow.
    fn total_age(&self) -> u64 {
        self.ages
            .iter()
            .fold(0u64, |sum, &age| sum + u64::from(age))
    }
}
When the metadata size is set to 0, both implementations produce efficient vectorized code:
playground::Users::total_age: # @playground::Users::total_age
# %bb.0:
pxor %xmm0, %xmm0
movl $6, %eax
pxor %xmm2, %xmm2
pxor %xmm1, %xmm1
.LBB1_1: # =>This Inner Loop Header: Depth=1
movzwl -6(%rdi,%rax), %ecx
movd %ecx, %xmm4
movzwl -4(%rdi,%rax), %ecx
movd %ecx, %xmm3
punpcklbw %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
punpcklwd %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
punpckldq %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
paddq %xmm2, %xmm4
punpcklbw %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
punpcklwd %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
punpckldq %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
paddq %xmm1, %xmm3
movzwl -2(%rdi,%rax), %ecx
movd %ecx, %xmm2
movzwl (%rdi,%rax), %ecx
movd %ecx, %xmm1
punpcklbw %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
punpcklwd %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
punpckldq %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
paddq %xmm4, %xmm2
punpcklbw %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
punpcklwd %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
punpckldq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
paddq %xmm3, %xmm1
addq $8, %rax
cmpq $10006, %rax # imm = 0x2716
jne .LBB1_1
# %bb.2:
paddq %xmm2, %xmm1
pshufd $238, %xmm1, %xmm0 # xmm0 = xmm1[2,3,2,3]
paddq %xmm1, %xmm0
movq %xmm0, %rax
retq
...
playground::ParallelUsers::total_age: # @playground::ParallelUsers::total_age
# %bb.0:
pxor %xmm0, %xmm0
movl $6, %eax
pxor %xmm2, %xmm2
pxor %xmm1, %xmm1
.LBB2_1: # =>This Inner Loop Header: Depth=1
movzwl -6(%rdi,%rax), %ecx
movd %ecx, %xmm4
movzwl -4(%rdi,%rax), %ecx
movd %ecx, %xmm3
punpcklbw %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
punpcklwd %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
punpckldq %xmm0, %xmm4 # xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
paddq %xmm2, %xmm4
punpcklbw %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
punpcklwd %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
punpckldq %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
paddq %xmm1, %xmm3
movzwl -2(%rdi,%rax), %ecx
movd %ecx, %xmm2
movzwl (%rdi,%rax), %ecx
movd %ecx, %xmm1
punpcklbw %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
punpcklwd %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
punpckldq %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
paddq %xmm4, %xmm2
punpcklbw %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
punpcklwd %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
punpckldq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
paddq %xmm3, %xmm1
addq $8, %rax
cmpq $10006, %rax # imm = 0x2716
jne .LBB2_1
# %bb.2:
paddq %xmm2, %xmm1
pshufd $238, %xmm1, %xmm0 # xmm0 = xmm1[2,3,2,3]
paddq %xmm1, %xmm0
movq %xmm0, %rax
retq
Now it's time to use benchmarks to see how they perform:
#[cfg(test)]
mod tests {
    use super::*;
    use rand::Rng;
    use test::Bencher;

    /// Benchmarks total-age computation over the array-of-structs layout.
    #[bench]
    fn bench_total_user_age(b: &mut Bencher) {
        // Arrays of non-Default types need a seed value; overwrite every
        // slot with a random age so the sum isn't a compile-time constant.
        let mut user_arr = [User::with_age(0); USER_COUNT];
        for user in user_arr.iter_mut() {
            *user = User::with_age(rand::random());
        }
        let users = Users(user_arr);
        b.iter(|| users.total_age());
    }

    /// Benchmarks total-age computation over the parallel-arrays layout.
    #[bench]
    fn bench_total_parallel_user_age(b: &mut Bencher) {
        let mut ages = [0; USER_COUNT];
        rand::thread_rng().fill(&mut ages[..]);
        let users = ParallelUsers {
            metadata: [[0; METADATA_SIZE]; USER_COUNT],
            // Field-init shorthand; `ages: ages` is redundant.
            ages,
        };
        b.iter(|| users.total_age());
    }
}
As expected, with no metadata the performance results are very close:
test tests::bench_total_parallel_user_age ... bench: 2,805 ns/iter (+/- 277)
test tests::bench_total_user_age ... bench: 2,491 ns/iter (+/- 435)
As soon as we increase the metadata size (to 4, for example), vectorization is no longer possible for the array of structs:
playground::Users::total_age: # @playground::Users::total_age
# %bb.0:
movl $49, %ecx
xorl %eax, %eax
.LBB1_1: # =>This Inner Loop Header: Depth=1
movzbl -45(%rdi,%rcx), %edx
addq %rax, %rdx
movzbl -40(%rdi,%rcx), %eax
addq %rdx, %rax
movzbl -35(%rdi,%rcx), %edx
addq %rax, %rdx
movzbl -30(%rdi,%rcx), %eax
addq %rdx, %rax
movzbl -25(%rdi,%rcx), %edx
addq %rax, %rdx
movzbl -20(%rdi,%rcx), %eax
addq %rdx, %rax
movzbl -15(%rdi,%rcx), %edx
addq %rax, %rdx
movzbl -10(%rdi,%rcx), %eax
addq %rdx, %rax
movzbl -5(%rdi,%rcx), %edx
addq %rax, %rdx
movzbl (%rdi,%rcx), %eax
addq %rdx, %rax
addq $50, %rcx
cmpq $50049, %rcx # imm = 0xC381
jne .LBB1_1
# %bb.2:
retq
but the performance results are still fairly close:
test tests::bench_total_parallel_user_age ... bench: 2,505 ns/iter (+/- 577)
test tests::bench_total_user_age ... bench: 2,563 ns/iter (+/- 467)
As soon as the cache waste reaches 32 bytes (half of the cache line), performance drops by ~2X:
test tests::bench_total_parallel_user_age ... bench: 2,752 ns/iter (+/- 509)
test tests::bench_total_user_age ... bench: 4,693 ns/iter (+/- 739)
and by ~5X — similar to the Go results — once the metadata size reaches 64:
test tests::bench_total_parallel_user_age ... bench: 2,720 ns/iter (+/- 532)
test tests::bench_total_user_age ... bench: 10,257 ns/iter (+/- 2,194)
So what are the conclusions?
- even though the parallel-array structure enabled vectorization, vectorization itself did not have a significant performance impact
- memory bandwidth and cache efficiency played the critical role in performance