How to Make Your Compiler Love You

Take a look at the seemingly innocuous function in Enhedron/Examples/Add.cpp.

// Copyright 2014 Enhedron Ltd

#include <cstddef>

namespace Enhedron { namespace Examples { namespace Impl {

using std::size_t;

//! Add each element of source to destination
void add(double* source, double* destination, size_t size) {
    for (size_t index = 0; index < size; index++) {
        destination[index] += source[index];
    }
}

}}}

Then take a look at the generated assembly (compiled with gcc 4.8.2, -O3 -mavx; the full build commands are in the Appendix)!

	.file	"Add.cpp"
	.text
	.p2align 4,,15
	.globl	_ZN8Enhedron8Examples4Impl3addEPdS2_m
	.type	_ZN8Enhedron8Examples4Impl3addEPdS2_m, @function
_ZN8Enhedron8Examples4Impl3addEPdS2_m:
.LFB0:
	.cfi_startproc
	testq	%rdx, %rdx
	je	.L21
	leaq	32(%rsi), %rax
	cmpq	%rax, %rdi
	leaq	32(%rdi), %rax
	setae	%cl
	cmpq	%rax, %rsi
	setae	%al
	orb	%al, %cl
	je	.L3
	cmpq	$6, %rdx
	jbe	.L3
	movq	%rdx, %r9
	xorl	%eax, %eax
	xorl	%ecx, %ecx
	shrq	$2, %r9
	leaq	0(,%r9,4), %r8
.L9:
	vmovupd	(%rdi,%rax), %xmm0
	addq	$1, %rcx
	vmovupd	(%rsi,%rax), %xmm1
	vinsertf128	$0x1, 16(%rdi,%rax), %ymm0, %ymm0
	vinsertf128	$0x1, 16(%rsi,%rax), %ymm1, %ymm1
	vaddpd	%ymm0, %ymm1, %ymm0
	vmovupd	%xmm0, (%rsi,%rax)
	vextractf128	$0x1, %ymm0, 16(%rsi,%rax)
	addq	$32, %rax
	cmpq	%r9, %rcx
	jb	.L9
	cmpq	%r8, %rdx
	je	.L20
	leaq	(%rsi,%r8,8), %rax
	vmovsd	(%rax), %xmm0
	vaddsd	(%rdi,%r8,8), %xmm0, %xmm0
	vmovsd	%xmm0, (%rax)
	leaq	1(%r8), %rax
	cmpq	%rax, %rdx
	jbe	.L20
	leaq	(%rsi,%rax,8), %rcx
	addq	$2, %r8
	cmpq	%r8, %rdx
	vmovsd	(%rcx), %xmm0
	vaddsd	(%rdi,%rax,8), %xmm0, %xmm0
	vmovsd	%xmm0, (%rcx)
	jbe	.L20
	leaq	(%rsi,%r8,8), %rax
	vmovsd	(%rax), %xmm0
	vaddsd	(%rdi,%r8,8), %xmm0, %xmm0
	vmovsd	%xmm0, (%rax)
	vzeroupper
	ret
	.p2align 4,,10
	.p2align 3
.L20:
	vzeroupper
.L21:
	rep ret
	.p2align 4,,10
	.p2align 3
.L3:
	xorl	%eax, %eax
	.p2align 4,,10
	.p2align 3
.L11:
	vmovsd	(%rsi,%rax,8), %xmm0
	vaddsd	(%rdi,%rax,8), %xmm0, %xmm0
	vmovsd	%xmm0, (%rsi,%rax,8)
	addq	$1, %rax
	cmpq	%rdx, %rax
	jne	.L11
	rep ret
	.cfi_endproc
.LFE0:
	.size	_ZN8Enhedron8Examples4Impl3addEPdS2_m, .-_ZN8Enhedron8Examples4Impl3addEPdS2_m
	.ident	"GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2"
	.section	.note.GNU-stack,"",@progbits

What’s going on here? The problem is that the compiler can’t make any assumptions about the arguments we provide, so it hedges: it tests for a zero size, checks at runtime whether the two arrays are at least 32 bytes apart (the setae/orb sequence at the top), and keeps a plain scalar loop at .L3 as a fallback. Let’s take a look at each assumption that would make the compiler’s life easier, one by one; a C++ rendering of the compiled logic follows the list.

  1. source and destination are aligned on 32 byte boundaries.
  2. size is > 0.
  3. size is divisible by 4 (4 * sizeof(double) == 32).
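
To make the shape of that listing concrete, here’s a rough C++ rendering of the decision logic gcc baked into add. This is a sketch reconstructed from the assembly above, not code from the repository, and addAsCompiled is a hypothetical name.

#include <cstddef>

// Rough rendering of the code gcc emitted for add(): the vector path
// only runs once a runtime check has proved it safe (rdi = source,
// rsi = destination, rdx = size in the listing above).
void addAsCompiled(double* source, double* destination, std::size_t size) {
    if (size == 0) {                                  // testq %rdx, %rdx
        return;
    }

    // gcc only needs one vector width (32 bytes == 4 doubles) of
    // separation for 4-wide chunks to be safe.
    bool safeDistance = source >= destination + 4     // setae %cl
                     || destination >= source + 4;    // setae %al / orb

    if (safeDistance && size > 6) {                   // cmpq $6, %rdx
        std::size_t vectorEnd = size - size % 4;

        // .L9: the real code does 4 doubles per iteration with vaddpd.
        for (std::size_t index = 0; index < vectorEnd; ++index) {
            destination[index] += source[index];
        }

        // Scalar tail of up to 3 elements, done with vaddsd.
        for (std::size_t index = vectorEnd; index < size; ++index) {
            destination[index] += source[index];
        }
    } else {
        // .L3: plain scalar fallback loop.
        for (std::size_t index = 0; index < size; ++index) {
            destination[index] += source[index];
        }
    }
}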

The code in Enhedron/Examples/AddAligned.cpp tells the compiler that it can make these assumptions.

// Copyright 2014 Enhedron Ltd

#include <cstddef>
#include <cassert>

#include "Enhedron/Util/Optimization.h"
#include "Enhedron/Util/Math.h"

namespace Enhedron { namespace Examples { namespace Impl {

using std::size_t;

using Util::assumeAligned;
using Util::isDivisible;

//! Add each element of source to destination
//! @param source Must be aligned to 32 bytes.
//! @param destination Must be aligned to 32 bytes.
//! @param size Must be non-zero and divisible by (32 / sizeof(double)).
void addAligned(double* source, double* destination, size_t size) {
    static constexpr size_t alignment = 32;
    static constexpr size_t doublesPerAlignedBlock = alignment / sizeof(double);

    static_assert(
        isDivisible(alignment, sizeof(double)),
        "alignment must be divisible by sizeof(double)"
    );
    assert(size != 0 && "size must be non-zero");
    assert(
        isDivisible(size, doublesPerAlignedBlock) &&
        "size must be divisible by (32 / sizeof(double))"
    );

    auto destinationEnd = destination + size;
    
    do {
        auto alignedSource = assumeAligned<double, alignment>(source);
        auto alignedDestination = assumeAligned<double, alignment>(destination);

        for (size_t index = 0; index < doublesPerAlignedBlock; ++index) {
            alignedDestination[index] += alignedSource[index];
        }

        destination += doublesPerAlignedBlock;
        source += doublesPerAlignedBlock;
    } while (destination != destinationEnd);
}

}}}

This is the generated assembly.

	.file	"AddAligned.cpp"
	.text
	.p2align 4,,15
	.globl	_ZN8Enhedron8Examples4Impl10addAlignedEPdS2_m
	.type	_ZN8Enhedron8Examples4Impl10addAlignedEPdS2_m, @function
_ZN8Enhedron8Examples4Impl10addAlignedEPdS2_m:
.LFB4:
	.cfi_startproc
	leaq	(%rsi,%rdx,8), %rax
	.p2align 4,,10
	.p2align 3
.L3:
	vmovsd	(%rsi), %xmm0
	addq	$32, %rsi
	addq	$32, %rdi
	vaddsd	-32(%rdi), %xmm0, %xmm0
	vmovsd	%xmm0, -32(%rsi)
	vmovsd	-24(%rsi), %xmm0
	vaddsd	-24(%rdi), %xmm0, %xmm0
	vmovsd	%xmm0, -24(%rsi)
	vmovsd	-16(%rsi), %xmm0
	vaddsd	-16(%rdi), %xmm0, %xmm0
	vmovsd	%xmm0, -16(%rsi)
	vmovsd	-8(%rsi), %xmm0
	vaddsd	-8(%rdi), %xmm0, %xmm0
	vmovsd	%xmm0, -8(%rsi)
	cmpq	%rax, %rsi
	jne	.L3
	rep ret
	.cfi_endproc
.LFE4:
	.size	_ZN8Enhedron8Examples4Impl10addAlignedEPdS2_m, .-_ZN8Enhedron8Examples4Impl10addAlignedEPdS2_m
	.ident	"GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2"
	.section	.note.GNU-stack,"",@progbits

Notice that the runtime checks are gone, but the loop body is still scalar (vmovsd/vaddsd), just unrolled four times, so this won’t have much of an effect for large arrays. There’s one final assumption that will really help our compiler out, which is that source and destination do not overlap. As things stand, we could legally use our add function to perform a prefix sum with add(data, data + 1, size - 1), and the compiler has to generate code that preserves those semantics.
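
To see why that matters, here’s a minimal standalone sketch (not part of the example repository) demonstrating that add(data, data + 1, size - 1) really does compute a running sum: each iteration depends on the one before it, so adding four elements at a time would change the result.

#include <cstddef>
#include <iostream>

// Same loop as add() above, repeated so this snippet is self-contained.
static void add(double* source, double* destination, std::size_t size) {
    for (std::size_t index = 0; index < size; index++) {
        destination[index] += source[index];
    }
}

int main() {
    double data[] = { 1.0, 2.0, 3.0, 4.0 };

    // destination[index] += source[index] becomes data[index + 1] += data[index],
    // so each element accumulates the sum of everything before it.
    add(data, data + 1, 4 - 1);

    for (double value : data) {
        std::cout << value << ' ';  // prints: 1 3 6 10
    }
    std::cout << '\n';
}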

The code in Enhedron/Examples/AddStrict.cpp tells the compiler that it can assume source and destination don’t overlap, using a Restrict template alias on the arguments (see the Appendix).

// Copyright 2014 Enhedron Ltd

#include <cstddef>
#include <cassert>

#include "Enhedron/Util/Optimization.h"
#include "Enhedron/Util/Math.h"

namespace Enhedron { namespace Examples { namespace Impl {

using std::size_t;

using Util::Restrict;
using Util::assumeAligned;
using Util::isDivisible;

//! Add each element of source to destination
//! @param source Must not overlap with \p destination and must be aligned to 32 bytes.
//! @param destination Must not overlap with \p source and must be aligned to 32 bytes.
//! @param size Must be non-zero and divisible by (32 / sizeof(double)).
void addStrict(Restrict<double*> source, Restrict<double*> destination, size_t size) {
    static constexpr size_t alignment = 32;
    static constexpr size_t doublesPerAlignedBlock = alignment / sizeof(double);

    static_assert(
        isDivisible(alignment, sizeof(double)),
        "alignment must be divisible by sizeof(double)"
    );
    assert(size != 0 && "size must be non-zero");
    assert(
        isDivisible(size, doublesPerAlignedBlock) &&
        "size must be divisible by (32 / sizeof(double))"
    );

    auto destinationEnd = destination + size;
    
    assert(source >= destinationEnd || source + size <= destination);

    do {
        auto alignedSource = assumeAligned<double, alignment>(source);
        auto alignedDestination = assumeAligned<double, alignment>(destination);

        for (size_t index = 0; index < doublesPerAlignedBlock; ++index) {
            alignedDestination[index] += alignedSource[index];
        }

        destination += doublesPerAlignedBlock;
        source += doublesPerAlignedBlock;
    } while (destination != destinationEnd);
}

}}}

This is the generated assembly.

	.file	"AddStrict.cpp"
	.text
	.p2align 4,,15
	.globl	_ZN8Enhedron8Examples4Impl9addStrictEPdS2_m
	.type	_ZN8Enhedron8Examples4Impl9addStrictEPdS2_m, @function
_ZN8Enhedron8Examples4Impl9addStrictEPdS2_m:
.LFB4:
	.cfi_startproc
	leaq	(%rsi,%rdx,8), %rax
	.p2align 4,,10
	.p2align 3
.L3:
	vmovapd	(%rsi), %ymm0
	addq	$32, %rsi
	addq	$32, %rdi
	vaddpd	-32(%rdi), %ymm0, %ymm0
	vmovapd	%ymm0, -32(%rsi)
	cmpq	%rax, %rsi
	jne	.L3
	vzeroupper
	ret
	.cfi_endproc
.LFE4:
	.size	_ZN8Enhedron8Examples4Impl9addStrictEPdS2_m, .-_ZN8Enhedron8Examples4Impl9addStrictEPdS2_m
	.ident	"GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2"
	.section	.note.GNU-stack,"",@progbits

Now gcc was able to vectorize our code properly: a single vmovapd/vaddpd pair processes four doubles per iteration, using aligned 256 bit AVX loads and stores, with no runtime checks and no scalar tail.
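
One practical caveat for callers: in C++11, heap memory from plain new is not guaranteed to be 32 byte aligned, so arrays destined for addAligned or addStrict need an aligned allocator. Here’s a minimal sketch using posix_memalign; allocateAligned is a hypothetical helper, not part of the repository.

#include <cstddef>
#include <cstdlib>
#include <new>

// Hypothetical helper: allocate a 32-byte-aligned array of doubles.
// posix_memalign is POSIX-specific; release the memory with std::free().
inline double* allocateAligned(std::size_t count) {
    void* memory = nullptr;

    if (posix_memalign(&memory, 32, count * sizeof(double)) != 0) {
        throw std::bad_alloc();
    }

    return static_cast<double*>(memory);
}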

Appendix

Here are the support files we used, followed by build instructions.

Enhedron/Util/Optimization.h

// Copyright 2014 Enhedron Ltd

#pragma once

#include <cstddef>

namespace Enhedron { namespace Util {
namespace Impl {
    using std::size_t;

    //! Alias that applies gcc's __restrict extension to a pointer type:
    //! Restrict<double*> is double* __restrict.
    template<typename Value>
    using Restrict = Value __restrict;

    //! Promise gcc that value points offset bytes past an alignment byte
    //! boundary (offset defaults to 0, i.e. fully aligned). Wraps
    //! __builtin_assume_aligned, which returns its argument as void*.
    template<typename Value, size_t alignment, size_t offset = 0>
    Value* assumeAligned(Value* value) {
        return static_cast<Value*>(__builtin_assume_aligned(value, alignment, offset));
    }
}

using Impl::Restrict;
using Impl::assumeAligned;
}}
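
Restrict is just a template alias that attaches gcc’s __restrict qualifier to whatever pointer type it’s given; the two declarations in this hypothetical snippet are equivalent.

#include <cstddef>

#include "Enhedron/Util/Optimization.h"

using Enhedron::Util::Restrict;

// Restrict<double*> expands to double* __restrict, so these two
// declarations say exactly the same thing to the compiler.
void scale(Restrict<double*> data, double factor, std::size_t size);
void scaleEquivalent(double* __restrict data, double factor, std::size_t size);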

Enhedron/Util/Math.h

// Copyright 2014 Enhedron Ltd

#pragma once

#include <type_traits>

namespace Enhedron { namespace Util {
namespace Impl {
    using std::is_integral;

    //! True iff numerator is an exact multiple of denominator
    //! (i.e. numerator % denominator == 0). Usable at compile time.
    template<typename Numerator, typename Denominator>
    constexpr bool isDivisible(Numerator numerator, Denominator denominator) {
        static_assert(is_integral<Numerator>::value, "Numerator must be an integral type");
        static_assert(is_integral<Denominator>::value, "Denominator must be an integral type");

        return denominator * (numerator / denominator) == numerator;
    }
}

using Impl::isDivisible;
}}
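
Because isDivisible is constexpr, the same helper serves the compile time static_assert and the runtime assert in the examples above. A quick standalone check (hypothetical snippet):

#include "Enhedron/Util/Math.h"

// Compile time use: a false condition here fails the build.
static_assert(Enhedron::Util::isDivisible(32, 8), "32 is divisible by 8");
static_assert(!Enhedron::Util::isDivisible(33, 8), "33 is not divisible by 8");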

Enhedron/Examples/Main.cpp

#include <cstddef>
#include <iostream>

#include "Enhedron/Util/Optimization.h"

namespace Enhedron { namespace Examples { namespace Impl {
    using std::cout;
    using std::size_t;

    using Util::Restrict;

    void add(double* source, double* destination, size_t size);
    void addAligned(double* source, double* destination, size_t size);
    void addStrict(Restrict<double*> source, Restrict<double*> destination, size_t size);

    void main() {
        static constexpr size_t size = 256;

        // addAligned and addStrict require 32 byte alignment; plain
        // arrays are only guaranteed alignof(double), so ask for it
        // explicitly.
        alignas(32) double source[size];
        alignas(32) double destination[size];

        for (size_t index = 0; index < size; ++index) {
            source[index] = index;
            destination[index] = index;
        }

        add(source, destination, size);
        addAligned(source, destination, size);
        addStrict(source, destination, size);

        for (size_t index = 0; index < size; ++index) {
            cout << destination[index] << '\n';
        }
    }
}

using Impl::main;
}}

int main() {
    Enhedron::Examples::main();
    
    return 0;
}

To build the examples with gcc 4.8.2 on Linux:

g++ -std=c++11 -mavx -O3 -I. -DNDEBUG Enhedron/Examples/*.cpp -o add

To generate the assembly:

g++ -std=c++11 -mavx -O3 -S -I. -DNDEBUG Enhedron/Examples/*.cpp
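
If you want gcc to report which loops it vectorized (and why it skipped the ones it didn’t), gcc 4.8 also accepts the tree vectorizer’s verbose flag, for example:

g++ -std=c++11 -mavx -O3 -S -I. -DNDEBUG -ftree-vectorizer-verbose=2 Enhedron/Examples/*.cpp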