Julia Dedicated

using BenchmarkTools, InteractiveUtils
versioninfo()

2.6s

global N = 100

0.2s

0.8s

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/

# based on Oleg Mazurov's Java Implementation and Jeremy Zerfas' C implementation
# transliterated by Hamza Yusuf Çakır
global const preferred_num_blocks = 12

struct Fannkuch
    n::Int64
    blocksz::Int64
    maxflips::Vector{Int32}
    chksums::Vector{Int32}

    function Fannkuch(n, nthreads)
        nfact = factorial(n)

        blocksz = nfact ÷ (nfact < preferred_num_blocks ? 1 : preferred_num_blocks)
        maxflips = zeros(Int32,nthreads)
        chksums = zeros(Int32, nthreads)

        new(n, blocksz, maxflips, chksums)
    end
end

struct Perm
    p::Vector{Int8}
    pp::Vector{Int8}
    count::Vector{Int32}

    function Perm(n)
        p = zeros(Int8, n)
        pp = zeros(Int8, n)
        count = zeros(Int32, n)

        new(p, pp, count)
    end
end

Base.@propagate_inbounds function first_permutation(perm::Perm, idx)
    p = perm.p
    pp = perm.pp

    for i = 2:length(p)
        p[i] = i - 1
    end

    for i = length(p):-1:2
        ifact = factorial(i-1)
        d = idx ÷ ifact
        perm.count[i] = d
        idx = idx % ifact

        for j = 1:i
            pp[j] = p[j]
        end

        for j = 1:i
            p[j] = j+d <= i ? pp[j+d] : pp[j+d-i]
        end
    end
end

Base.@propagate_inbounds function next_permutation(perm::Perm)
    p = perm.p
    count = perm.count

    first = p[2]
    p[2]  = p[1]
    p[1]  = first

    i = 2
    while (count[i] + 1) >= i
        count[i] = 0
        i += 1

        next = p[1] = p[2]

        for j = 1:i-1
            p[j] = p[j+1]
        end

        p[i] = first
        first = next
    end
    count[i] += 1
end

Base.@propagate_inbounds @inline function count_flips(perm::Perm)
    p = perm.p
    pp = perm.pp

    flips = 1
    first = p[1] + 1
    if p[first] != 0

        copyto!(pp, 2, p, 2, length(p) - 1)

        while true
            flips += 1

            new_first = pp[first]
            pp[first] = first - 1

            if first > 3
                lo = 2; hi = first-1

                # see the note in Jeremy Zerfas' C implementation
                for k = 1:15
                    t = pp[lo]
                    pp[lo] = pp[hi]
                    pp[hi] = t

                    !((lo + 3) <= hi) && break
                    lo += 1
                    hi -= 1
                end
            end

            first = new_first + 1
            pp[first] == 0 && break
        end
    end

    return flips
end

Base.@propagate_inbounds function run_task(f::Fannkuch, perm::Perm, idxmin, idxmax)

    maxflips = 1
    chksum = 0

    let
        i = idxmin
        while true
            if perm.p[1] != 0
                flips = count_flips(perm)
                maxflips = max(maxflips, flips)
                chksum += iseven(i) ? flips : -flips
            end
            if i == idxmax
                break
            end
            i += 1
            next_permutation(perm)
        end
    end
    id = Threads.threadid()
    f.maxflips[id] = max(f.maxflips[id], maxflips)
    f.chksums[id] += chksum
end

function runf(f::Fannkuch)
    factn = factorial(f.n)

    Threads.@threads for idxmin=0:f.blocksz:factn-1
        perm = Perm(f.n)
        first_permutation(perm, idxmin)
        idxmax = idxmin + f.blocksz - 1
        @inbounds run_task(f, perm, idxmin, idxmax)
    end
end

function fannkuch(n = 12)
    f = Fannkuch(n,Threads.nthreads())
    runf(f)
    # reduce results
    chk = sum(f.chksums)
    res = maximum(f.maxflips)
    (chk, res)
end

0.8s

0.4s

#=
The Computer Language Benchmarks Game
 https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
 https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
 modified for Julia 1.0 by Simon Danisch
=#
const zerov8 = ntuple(x-> 0f0, 8)

# Calculate mandelbrot set for one Vec8 into one byte
Base.@propagate_inbounds function mand8(cr, ci)
    Zr = zerov8
    Zi = zerov8
    Tr = zerov8
    Ti = zerov8
    t = zerov8
    for i in 0:49
        Zi = 2f0 .* Zr .* Zi .+ ci
        Zr = Tr .- Ti .+ cr
        Tr = Zr .* Zr
        Ti = Zi .* Zi
        t = Tr .+ Ti
        all(x-> x > 4f0, t) && break
    end
    byte = UInt8(0)
    t[1] <= 4f0 && (byte |= 0x80)
    t[2] <= 4f0 && (byte |= 0x40)
    t[3] <= 4f0 && (byte |= 0x20)
    t[4] <= 4f0 && (byte |= 0x10)
    t[5] <= 4f0 && (byte |= 0x08)
    t[6] <= 4f0 && (byte |= 0x04)
    t[7] <= 4f0 && (byte |= 0x02)
    t[8] <= 4f0 && (byte |= 0x01)
    return byte
end

function mandel_inner(rows, ci, y, N, xvals)
    @simd for x in 1:8:N
        @inbounds begin
            cr = ntuple(i-> xvals[x + (i - 1)], 8)
            rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
        end
    end
end

function mandel(n = 16000)
    inv_ = 2.0 / n
    N = n
    xvals = zeros(Float32, n)
    yvals = zeros(Float32, n)
    Threads.@threads for i in 0:(N-1)
        @inbounds xvals[i + 1] = i * inv_ - 1.5
        @inbounds yvals[i + 1] = i * inv_ - 1.0
    end
    rows = zeros(UInt8, n*N÷8)
    Threads.@threads for y in 1:N
        @inbounds ci = yvals[y]
        mandel_inner(rows, ci, y, N, xvals)
    end
end

0.4s

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/

# contributed by Jarret Revels and Alex Arslan
# based on the Java version

module NBody

using Printf
using LinearAlgebra

# Constants
const solar_mass = 4 * pi * pi
const days_per_year = 365.24

# Use a 4-tuple here to get better SIMD instructions.
# This is a space-time tradeoff, but this benchmark is well within L1 cache limits.
struct Vec3
    x::NTuple{4, Float64}
end
Vec3(x, y, z) = Vec3((x,y,z,0.0))
Base.:/(v::Vec3, n::Number) = Vec3(1/n .* v.x)
Base.:*(v::Vec3, n::Number) = Vec3(n .* v.x)
Base.:-(v1::Vec3, v2::Vec3) = Vec3(v1.x .- v2.x)
Base.:+(v1::Vec3, v2::Vec3) = Vec3(v1.x .+ v2.x)
# Todo, prettify
squarednorm(v1::Vec3) = v1.x[1]^2 + v1.x[2]^2 + v1.x[3]^2
Base.muladd(x::Vec3, y::Number, z::Vec3) = Vec3(muladd.(x.x, y, z.x))

# A heavenly body in the system
mutable struct Body
    pos::Vec3
    v::Vec3
    mass::Float64
end

function offset_momentum!(b::Body, p::Vec3)
    b.v -= p / solar_mass
end

function init_sun!(bodies::Vector{Body})
    p = Vec3(0.0, 0.0, 0.0)
    for b in bodies
        p += b.v * b.mass
    end
    offset_momentum!(bodies[1], p)
end

function advance(bodies::Vector{Body}, dt::Number)
    @inbounds for i = 1:length(bodies)
        bi = bodies[i]
        for j = i+1:length(bodies)
            bj = bodies[j]
            delta = bi.pos - bj.pos
            dsq = squarednorm(delta)
            distance = sqrt(dsq)
            mag = dt / (dsq * distance)
            bi.v = muladd(delta, -(bj.mass * mag), bi.v)
            bj.v = muladd(delta, (bi.mass * mag), bj.v)
        end
    end

    for b in bodies
        b.pos = muladd(b.v, dt, b.pos)
    end
end

function energy(bodies::Vector{Body})
    e = 0.0
    @inbounds for i = 1:length(bodies)
        bi = bodies[i]
        e += 0.5 * bi.mass * squarednorm(bi.v)
        for j = i+1:length(bodies)
            bj = bodies[j]
            delta = bi.pos - bj.pos
            distance = sqrt(squarednorm(delta))
            dinv = 1.0 / distance
            e = muladd((bi.mass * bj.mass), -dinv, e)
        end
    end
    return e
end


function perf_nbody(N::Int=1000)
    jupiter = Body( Vec3(4.84143144246472090e+00,                  # pos[1] = x
                         -1.16032004402742839e+00,                 # pos[2] = y
                         -1.03622044471123109e-01),                # pos[3] = z
                   Vec3(1.66007664274403694e-03 * days_per_year,   # v[1] = vx
                        7.69901118419740425e-03 * days_per_year,   # v[2] = vy
                        -6.90460016972063023e-05 * days_per_year), # v[3] = vz
                   9.54791938424326609e-04 * solar_mass)       # mass

    saturn = Body(Vec3(8.34336671824457987e+00,
                       4.12479856412430479e+00,
                      -4.03523417114321381e-01),
                  Vec3(-2.76742510726862411e-03 * days_per_year,
                       4.99852801234917238e-03 * days_per_year,
                       2.30417297573763929e-05 * days_per_year),
                  2.85885980666130812e-04 * solar_mass)

    uranus = Body(Vec3(1.28943695621391310e+01,
                    -1.51111514016986312e+01,
                    -2.23307578892655734e-01),
                  Vec3(2.96460137564761618e-03 * days_per_year,
                       2.37847173959480950e-03 * days_per_year,
                       -2.96589568540237556e-05 * days_per_year),
                  4.36624404335156298e-05 * solar_mass)

    neptune = Body(Vec3(1.53796971148509165e+01,
                        -2.59193146099879641e+01,
                        1.79258772950371181e-01),
                   Vec3(2.68067772490389322e-03 * days_per_year,
                        1.62824170038242295e-03 * days_per_year,
                       -9.51592254519715870e-05 * days_per_year),
                   5.15138902046611451e-05 * solar_mass)

    sun = Body(Vec3(0.0, 0.0, 0.0), Vec3(0.0, 0.0, 0.0), solar_mass)

    bodies = [sun, jupiter, saturn, uranus, neptune]

    init_sun!(bodies)
    for i = 1:N
        advance(bodies, 0.01)
    end
end

end # module
nbody(n) = NBody.perf_nbody(n)

0.4s

using BenchmarkTools
julia_results = Dict{String, Vector{BenchmarkTools.Trial}}()
for (name, n) in (
    nbody => 50000000,
		mandel => 16000,
		fannkuch => 12,
  )
  julia_results[string(name)] = map(1:N) do i
     @benchmark $(name)($n)
  end
end

10969.7s

# for small function benchmark take double
julia_results["sin"] = map(1:(N * 2)) do x
  @benchmark sin($(Ref(1.0))[])
end

259.0s

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <emmintrin.h>
#include <time.h>


long numDigits(long n)
{
    long len = 0;
    while(n)
    {
        n=n/10;
        len++;
    }
    return len;
}

inline long vec_nle(__m128d *v, double f)
{
    return (v[0][0] <= f ||
        v[0][1] <= f ||
        v[1][0] <= f ||
        v[1][1] <= f ||
        v[2][0] <= f ||
        v[2][1] <= f ||
        v[3][0] <= f ||
        v[3][1] <= f) ? 0 : -1;
}

inline void clrPixels_nle(__m128d *v, double f, unsigned long * pix8)
{
    if(!(v[0][0] <= f)) *pix8 &= 0x7f;
    if(!(v[0][1] <= f)) *pix8 &= 0xbf;
    if(!(v[1][0] <= f)) *pix8 &= 0xdf;
    if(!(v[1][1] <= f)) *pix8 &= 0xef;
    if(!(v[2][0] <= f)) *pix8 &= 0xf7;
    if(!(v[2][1] <= f)) *pix8 &= 0xfb;
    if(!(v[3][0] <= f)) *pix8 &= 0xfd;
    if(!(v[3][1] <= f)) *pix8 &= 0xfe;
}

inline void calcSum(__m128d *r, __m128d *i, __m128d *sum, __m128d *init_r, __m128d init_i)
{
    for(long pair=0; pair<4; pair++)
    {
        __m128d r2 = r[pair] * r[pair];
        __m128d i2 = i[pair] * i[pair];
        __m128d ri = r[pair] * i[pair];

        sum[pair] = r2 + i2;

        r[pair]=r2 - i2 + init_r[pair];
        i[pair]=ri + ri + init_i;
    }
}

inline unsigned long mand8(__m128d *init_r, __m128d init_i)
{
    __m128d r[4], i[4], sum[4];
    for(long pair=0; pair<4; pair++)
    {
        r[pair]=init_r[pair];
        i[pair]=init_i;
    }

    unsigned long pix8 = 0xff;

    for (long j = 0; j < 6; j++)
    {
        for(long k=0; k<8; k++)
            calcSum(r, i, sum, init_r, init_i);

        if (vec_nle(sum, 4.0))
        {
            pix8 = 0x00;
            break;
        }
    }
    if (pix8)
    {
        calcSum(r, i, sum, init_r, init_i);
        calcSum(r, i, sum, init_r, init_i);
        clrPixels_nle(sum, 4.0, &pix8);
    }

    return pix8;
}

unsigned long mand64(__m128d *init_r, __m128d init_i)
{
    unsigned long pix64 = 0;

    for(long byte=0; byte<8; byte++)
    {
        unsigned long pix8 = mand8(init_r, init_i);

        pix64 = (pix64 >> 8) | (pix8 << 56);
        init_r += 4;
    }

    return pix64;
}



void benchmark(int n)
{

    // get width/height from arguments
    long wid_ht = n;
    wid_ht = (wid_ht+7) & ~7;

    // allocate memory for header and pixels
    long headerLength = numDigits(wid_ht)*2+5;
    long pad = ((headerLength + 7) & ~7) - headerLength; // pad aligns pixels on 8
    long dataLength = headerLength + wid_ht*(wid_ht>>3);
    unsigned char * const buffer = malloc(pad + dataLength);
    unsigned char * const header = buffer + pad;
    unsigned char * const pixels = header + headerLength;


    // generate the bitmap header

    // sprintf((char *)header, "P4\n%ld %ld\n", wid_ht, wid_ht);


    // calculate initial values, store in r0, i0

    __m128d r0[wid_ht/2];
    double i0[wid_ht];

    for(long xy=0; xy<wid_ht; xy+=2)
    {
        r0[xy>>1] = 2.0 / wid_ht * (__m128d){xy,  xy+1} - 1.5;
        i0[xy]    = 2.0 / wid_ht *  xy    - 1.0;
        i0[xy+1]  = 2.0 / wid_ht * (xy+1) - 1.0;
    }


    // generate the bitmap

    long use8 = wid_ht%64;
    if (use8)
    {
        // process 8 pixels (one byte) at a time
        #pragma omp parallel for schedule(guided)
        for(long y=0; y<wid_ht; y++)
        {
            __m128d init_i = (__m128d){i0[y], i0[y]};
            long rowstart = y*wid_ht/8;
            for(long x=0; x<wid_ht; x+=8)
            {
                pixels[rowstart + x/8] = mand8(r0+x/2, init_i);
            }
        }
    }
    else
    {
        // process 64 pixels (8 bytes) at a time
        #pragma omp parallel for schedule(guided)
        for(long y=0; y<wid_ht; y++)
        {
            __m128d init_i = (__m128d){i0[y], i0[y]};
            long rowstart = y*wid_ht/64;
            for(long x=0; x<wid_ht; x+=64)
            {
                ((unsigned long *)pixels)[rowstart + x/64] = mand64(r0+x/2, init_i);
            }
        }
    }

    // write the data

    // long ret = ret = write(STDOUT_FILENO, header, dataLength);


    free(buffer);
                                                
}
extern void benchmark(int);

mandel.c

// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// Contributed by Jeremy Zerfas
// Based on the Ada program by Jonathan Parker and Georg Bauhaus which in turn
// was based on code by Dave Fladebo, Eckehard Berns, Heiner Marxen, Hongwei Xi,
// and The Anh Tran and also the Java program by Oleg Mazurov.

// This value controls how many blocks the workload is broken up into (as long
// as the value is less than or equal to the factorial of the argument to this
// program) in order to allow the blocks to be processed in parallel if
// possible. PREFERRED_NUMBER_OF_BLOCKS_TO_USE should be some number which
// divides evenly into all factorials larger than it. It should also be around
// 2-8 times the amount of threads you want to use in order to create enough
// blocks to more evenly distribute the workload amongst the threads.
#define PREFERRED_NUMBER_OF_BLOCKS_TO_USE 12

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>


// intptr_t should be the native integer type on most sane systems.
typedef intptr_t intnative_t;


void benchmark(int n){

   // Create and initialize factorial_Lookup_Table.
   intnative_t factorial_Lookup_Table[n+1];
   factorial_Lookup_Table[0]=1;
   for(intnative_t i=0; ++i<=n;)
      factorial_Lookup_Table[i]=i*factorial_Lookup_Table[i-1];

   // Determine the block_Size to use. If n! is less than
   // PREFERRED_NUMBER_OF_BLOCKS_TO_USE then just use a single block to prevent
   // block_Size from being set to 0. This also causes smaller values of n to
   // be computed serially which is faster and uses less resources for small
   // values of n.
   const intnative_t block_Size=factorial_Lookup_Table[n]/
     (factorial_Lookup_Table[n]<PREFERRED_NUMBER_OF_BLOCKS_TO_USE ?
     1 : PREFERRED_NUMBER_OF_BLOCKS_TO_USE);

   intnative_t maximum_Flip_Count=0, checksum=0;

   // Iterate over each block.
   #pragma omp parallel for \
     reduction(max:maximum_Flip_Count) reduction(+:checksum)
   for(intnative_t initial_Permutation_Index_For_Block=0;
     initial_Permutation_Index_For_Block<factorial_Lookup_Table[n];
     initial_Permutation_Index_For_Block+=block_Size){

      intnative_t count[n];
      int8_t temp_Permutation[n], current_Permutation[n];


      // Initialize count and current_Permutation.
      count[0]=0;
      for(intnative_t i=0; i<n; ++i)
         current_Permutation[i]=i;
      for(intnative_t i=n-1,
        permutation_Index=initial_Permutation_Index_For_Block; i>0; --i){
         const intnative_t d=permutation_Index/factorial_Lookup_Table[i];
         permutation_Index=permutation_Index%factorial_Lookup_Table[i];
         count[i]=d;

         for(intnative_t j=0; j<n; ++j)
            temp_Permutation[j]=current_Permutation[j];
         for(intnative_t j=0; j<=i; ++j)
            current_Permutation[j]= j+d<=i ?
              temp_Permutation[j+d] : temp_Permutation[j+d-i-1];
      }


      // Iterate over each permutation in the block.
      const intnative_t last_Permutation_Index_In_Block=
        initial_Permutation_Index_For_Block+block_Size-1;
      for(intnative_t permutation_Index=initial_Permutation_Index_For_Block; ;
        ++permutation_Index){

         // If the first value in the current_Permutation is not 1 (0) then
         // we will need to do at least one flip for the current_Permutation.
         if(current_Permutation[0]>0){

            // Make a copy of current_Permutation[] to work on. Note that we
            // don't need to copy the first value since that will be stored
            // in a separate variable since it gets used a lot.
            for(intnative_t i=0; ++i<n;)
               temp_Permutation[i]=current_Permutation[i];

            intnative_t flip_Count=1;

            // Flip temp_Permutation until the element at the first_Value
            // index is 1 (0).
            for(intnative_t first_Value=current_Permutation[0];
              temp_Permutation[first_Value]>0; ++flip_Count){

               // Record the new_First_Value and restore the old
               // first_Value at its new flipped position.
               const int8_t new_First_Value=temp_Permutation[first_Value];
               temp_Permutation[first_Value]=first_Value;

               // If first_Value is greater than 3 (2) then we are flipping
               // a series of four or more values so we will also need to
               // flip additional elements in the middle of the
               // temp_Permutation.
               if(first_Value>2){
                  intnative_t low_Index=1, high_Index=first_Value-1;
                  // Note that this loop is written so that it will run at
                  // most 16 times so that compilers will be more willing
                  // to unroll it. Consequently this won't work right when
                  // n is greater than 35. This would probably be the
                  // least of your concerns since 21! won't fit into 64
                  // bit integers and even if it did you probably wouldn't
                  // want to run this program with a value that large
                  // since it would take thousands of years to do on a
                  // modern desktop computer. ;-)
                  do{
                     const int8_t temp=temp_Permutation[high_Index];
                     temp_Permutation[high_Index]=
                       temp_Permutation[low_Index];
                     temp_Permutation[low_Index]=temp;
                  }while(low_Index+++3<=high_Index-- && low_Index<16);
               }

               // Update first_Value to new_First_Value that we recorded
               // earlier.
               first_Value=new_First_Value;
            }


            // Update the checksum.
            if(permutation_Index%2==0)
               checksum+=flip_Count;
            else
               checksum-=flip_Count;

            // Update maximum_Flip_Count if necessary.
            if(flip_Count>maximum_Flip_Count)
               maximum_Flip_Count=flip_Count;
         }


         // Break out of the loop when we get to the
         // last_Permutation_Index_In_Block.
         if(permutation_Index>=last_Permutation_Index_In_Block)
            break;

         // Generate the next permutation.
         int8_t first_Value=current_Permutation[1];
         current_Permutation[1]=current_Permutation[0];
         current_Permutation[0]=first_Value;
         for(intnative_t i=1; ++count[i]>i;){
            count[i++]=0;
            const int8_t new_First_Value=current_Permutation[0]=
              current_Permutation[1];

            for(intnative_t j=0; ++j<i;)
               current_Permutation[j]=current_Permutation[j+1];

            current_Permutation[i]=first_Value;
            first_Value=new_First_Value;
         }
      }
   }
}

extern void benchmark(int);

fannkuch.c

/* The Computer Language Benchmarks Game
   https://salsa.debian.org/benchmarksgame-team/benchmarksgame/

   contributed by Mark C. Lewis
   modified slightly by Chad Whipkey
   converted from java to c++,added sse support, by Branimir Maksimovic
   converted from c++ to c, by Alexey Medvedchikov
*/

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <immintrin.h>
#include <time.h>


#define PI 3.141592653589793
#define SOLAR_MASS ( 4 * PI * PI )
#define DAYS_PER_YEAR 365.24

struct body {
   double x[3], fill, v[3], mass;
};

static struct body solar_bodies[] = {
   /* sun */
   {
      .x = { 0., 0., 0. },
      .v = { 0., 0., 0. },
      .mass = SOLAR_MASS
   },
   /* jupiter */
   {
      .x = { 4.84143144246472090e+00,
         -1.16032004402742839e+00,
         -1.03622044471123109e-01 },
      .v = { 1.66007664274403694e-03 * DAYS_PER_YEAR,
         7.69901118419740425e-03 * DAYS_PER_YEAR,
         -6.90460016972063023e-05 * DAYS_PER_YEAR },
      .mass = 9.54791938424326609e-04 * SOLAR_MASS
   },
   /* saturn */
   {
      .x = { 8.34336671824457987e+00,
         4.12479856412430479e+00,
         -4.03523417114321381e-01 },
      .v = { -2.76742510726862411e-03 * DAYS_PER_YEAR,
         4.99852801234917238e-03 * DAYS_PER_YEAR,
         2.30417297573763929e-05 * DAYS_PER_YEAR },
      .mass = 2.85885980666130812e-04 * SOLAR_MASS
   },
   /* uranus */
   {
      .x = { 1.28943695621391310e+01,
         -1.51111514016986312e+01,
         -2.23307578892655734e-01 },
      .v = { 2.96460137564761618e-03 * DAYS_PER_YEAR,
         2.37847173959480950e-03 * DAYS_PER_YEAR,
         -2.96589568540237556e-05 * DAYS_PER_YEAR },
      .mass = 4.36624404335156298e-05 * SOLAR_MASS
   },
   /* neptune */
   {
      .x = { 1.53796971148509165e+01,
         -2.59193146099879641e+01,
         1.79258772950371181e-01 },
      .v = { 2.68067772490389322e-03 * DAYS_PER_YEAR,
         1.62824170038242295e-03 * DAYS_PER_YEAR,
         -9.51592254519715870e-05 * DAYS_PER_YEAR },
      .mass = 5.15138902046611451e-05 * SOLAR_MASS
   }
};

static const int BODIES_SIZE = sizeof(solar_bodies) / sizeof(solar_bodies[0]);

void offset_momentum(struct body *bodies, unsigned int nbodies)
{
   unsigned int i, k;
   for (i = 0; i < nbodies; ++i)
      for (k = 0; k < 3; ++k)
         bodies[0].v[k] -= bodies[i].v[k] * bodies[i].mass
            / SOLAR_MASS;
}

void bodies_advance(struct body *bodies, unsigned int nbodies, double dt)
{
   unsigned int N = (nbodies - 1) * nbodies / 2;
   static struct {
      double dx[3], fill;
   } r[1000];
   static __attribute__((aligned(16))) double mag[1000];
   unsigned int i, j, k, m;
   __m128d dx[3], dsquared, distance, dmag;

   for(k = 0, i = 0; i < nbodies - 1; ++i)
      for(j = i + 1; j < nbodies; ++j, ++k)
         for ( m = 0; m < 3; ++m)
            r[k].dx[m] = bodies[i].x[m] - bodies[j].x[m];

   for (i = 0; i < N; i += 2) {
      for (m = 0; m < 3; ++m) {
         dx[m] = _mm_loadl_pd(dx[m], &r[i].dx[m]);
         dx[m] = _mm_loadh_pd(dx[m], &r[i+1].dx[m]);
      }

      dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
      distance = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dsquared)));

      for (j = 0; j < 2; ++j)
         distance = distance * _mm_set1_pd(1.5)
            - ((_mm_set1_pd(0.5) * dsquared) * distance)
            * (distance * distance);

      dmag = _mm_set1_pd(dt) / (dsquared) * distance;
      _mm_store_pd(&mag[i], dmag);
   }

   for (i = 0, k = 0; i < nbodies - 1; ++i)
      for ( j = i + 1; j < nbodies; ++j, ++k)
         for ( m = 0; m < 3; ++m) {
            bodies[i].v[m] -= r[k].dx[m] * bodies[j].mass
               * mag[k];
            bodies[j].v[m] += r[k].dx[m] * bodies[i].mass
               * mag[k];
         }

   for (i = 0; i < nbodies; ++i)
      for ( m = 0; m < 3; ++m)
         bodies[i].x[m] += dt * bodies[i].v[m];
}

double bodies_energy(struct body *bodies, unsigned int nbodies) {
   double dx[3], distance, e = 0.0;
   unsigned int i, j, k;

   for (i=0; i < nbodies; ++i) {
      e += bodies[i].mass * ( bodies[i].v[0] * bodies[i].v[0]
         + bodies[i].v[1] * bodies[i].v[1]
         + bodies[i].v[2] * bodies[i].v[2] ) / 2.;

      for (j=i+1; j < nbodies; ++j) {
         for (k = 0; k < 3; ++k)
            dx[k] = bodies[i].x[k] - bodies[j].x[k];

         distance = sqrt(dx[0] * dx[0] + dx[1] * dx[1]
            + dx[2] * dx[2]);
         e -= (bodies[i].mass * bodies[j].mass) / distance;
      }
   }
   return e;
}

void benchmark(int n)
{
   int i;
   offset_momentum(solar_bodies, BODIES_SIZE);
   for (i = 0; i < n; ++i){
     bodies_advance(solar_bodies, BODIES_SIZE, 0.01);
   }
}

extern void benchmark(int n);

nbody.c

run(`gcc -c -fpic -Wall -O3 -fomit-frame-pointer -march=native -mno-fma -fno-finite-math-only -fopenmp mandel.c -o mandel.o`)
run(`gcc -c -fpic -Wall -O3 -fomit-frame-pointer -march=native -mfpmath=sse -msse3 nbody.c -o nbody.o -lm`)
run(`gcc -c -fpic -Wall -O3 -fomit-frame-pointer -march=native -fopenmp fannkuch.c -o fannkuch.o`)
for name in ("fannkuch", "nbody", "mandel")
  run(`gcc -shared -mno-fma -fno-finite-math-only -mfpmath=sse -msse3 -fopenmp -fomit-frame-pointer -march=native -O3 -o $name.so $name.o`)
  cname = Symbol("c_$name")
  @eval $cname(n) = ccall((:benchmark, $("/$name.so")), Cvoid, (Cint,), n)
end

0.9s

c_results = Dict{String, Vector{BenchmarkTools.Trial}}()
for (name, n) in (
    c_nbody => 50000000,
		c_mandel => 16000,
		c_fannkuch => 12,
  )
  c_results[replace(string(name), "c_" => "")] = map(1:N) do i
     @benchmark $(name)($n)
  end
end

11065.8s

using Serialization
open("results/result.jls", "w") do io
  serialize(io, Dict(
      "julia" => julia_results,
      "c" => c_results
   ))
end

1.1s

result.jls