# arraybinops

An example implementation of array element-wise binary operations in Rust.

The goal is to produce efficient code where possible while avoiding any memory safety issues.
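
The `Array(lhs) + rhs` pattern used in the example below relies on a newtype wrapper that implements the operator traits element-wise. As a rough illustration, here is a minimal sketch of how such a wrapper could be written in safe Rust; the `Array` type and the `Add` impl shown here are assumptions for illustration and may differ from the actual implementation in this crate.

```rust
use std::ops::Add;

// Hypothetical sketch of an `Array` newtype (the crate's real definition
// may differ): a wrapper over `[T; N]` that provides element-wise operators.
pub struct Array<T, const N: usize>(pub [T; N]);

impl<T: Add<Output = T> + Copy, const N: usize> Add<[T; N]> for Array<T, N> {
    type Output = [T; N];

    // Element-wise addition in safe Rust. Because `N` is a compile-time
    // constant, the optimizer can fully unroll and vectorize this,
    // producing straight-line SIMD code like the assembly shown below.
    fn add(self, rhs: [T; N]) -> [T; N] {
        std::array::from_fn(|i| self.0[i] + rhs[i])
    }
}
```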

## Example ASM

Given the following Rust code:

```rust
pub fn add_i64x32(lhs: [i64; 32], rhs: [i64; 32]) -> [i64; 32] {
    Array(lhs) + rhs
}
```

the compiler emits the following assembly, which performs 16 i64x2 add operations as a fully unrolled loop, avoiding branching.

```asm
add_i64x32:
	sub rsp, 72
	mov rax, rdi
	movdqu xmm1, xmmword ptr [rsi]
	movdqu xmm3, xmmword ptr [rsi + 16]
	movdqu xmm5, xmmword ptr [rsi + 32]
	movdqu xmm7, xmmword ptr [rsi + 48]
	movdqu xmm15, xmmword ptr [rsi + 64]
	movdqu xmm8, xmmword ptr [rsi + 80]
	movdqu xmm9, xmmword ptr [rsi + 96]
	movdqu xmm10, xmmword ptr [rsi + 112]
	movdqu xmm14, xmmword ptr [rsi + 128]
	movdqu xmm13, xmmword ptr [rsi + 144]
	movdqu xmm12, xmmword ptr [rsi + 160]
	movdqu xmm11, xmmword ptr [rsi + 176]
	movups xmm0, xmmword ptr [rsi + 192]
	movaps xmmword ptr [rsp], xmm0
	movdqu xmm2, xmmword ptr [rsi + 208]
	movups xmm0, xmmword ptr [rsi + 224]
	movaps xmmword ptr [rsp + 48], xmm0
	movdqu xmm0, xmmword ptr [rdx]
	paddq xmm0, xmm1
	movdqa xmmword ptr [rsp + 32], xmm0
	movdqu xmm0, xmmword ptr [rdx + 16]
	paddq xmm0, xmm3
	movdqa xmmword ptr [rsp + 16], xmm0
	movdqu xmm4, xmmword ptr [rdx + 32]
	paddq xmm4, xmm5
	movdqu xmm6, xmmword ptr [rdx + 48]
	paddq xmm6, xmm7
	movdqu xmm1, xmmword ptr [rdx + 64]
	paddq xmm1, xmm15
	movdqu xmm15, xmmword ptr [rdx + 80]
	paddq xmm15, xmm8
	movdqu xmm8, xmmword ptr [rdx + 96]
	paddq xmm8, xmm9
	movdqu xmm9, xmmword ptr [rdx + 112]
	paddq xmm9, xmm10
	movdqu xmm10, xmmword ptr [rdx + 128]
	paddq xmm10, xmm14
	movdqu xmm14, xmmword ptr [rdx + 144]
	paddq xmm14, xmm13
	movdqu xmm13, xmmword ptr [rdx + 160]
	paddq xmm13, xmm12
	movdqu xmm12, xmmword ptr [rdx + 176]
	paddq xmm12, xmm11
	movdqu xmm3, xmmword ptr [rdx + 192]
	paddq xmm3, xmmword ptr [rsp]
	movdqu xmm7, xmmword ptr [rdx + 208]
	paddq xmm7, xmm2
	movdqu xmm5, xmmword ptr [rdx + 224]
	paddq xmm5, xmmword ptr [rsp + 48]
	movdqu xmm11, xmmword ptr [rsi + 240]
	movdqu xmm0, xmmword ptr [rdx + 240]
	paddq xmm0, xmm11
	movaps xmm2, xmmword ptr [rsp + 32]
	movups xmmword ptr [rdi], xmm2
	movaps xmm2, xmmword ptr [rsp + 16]
	movups xmmword ptr [rdi + 16], xmm2
	movdqu xmmword ptr [rdi + 32], xmm4
	movdqu xmmword ptr [rdi + 48], xmm6
	movdqu xmmword ptr [rdi + 64], xmm1
	movdqu xmmword ptr [rdi + 80], xmm15
	movdqu xmmword ptr [rdi + 96], xmm8
	movdqu xmmword ptr [rdi + 112], xmm9
	movdqu xmmword ptr [rdi + 128], xmm10
	movdqu xmmword ptr [rdi + 144], xmm14
	movdqu xmmword ptr [rdi + 160], xmm13
	movdqu xmmword ptr [rdi + 176], xmm12
	movdqu xmmword ptr [rdi + 192], xmm3
	movdqu xmmword ptr [rdi + 208], xmm7
	movdqu xmmword ptr [rdi + 224], xmm5
	movdqu xmmword ptr [rdi + 240], xmm0
	add rsp, 72
	ret
```