Simon Danisch / Nov 29 2018

# Julia Shared

using BenchmarkTools, InteractiveUtils
versioninfo()
global N = 100
0.9s
Language:Julia
# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/

# based on Oleg Mazurov's Java Implementation and Jeremy Zerfas' C implementation
# transliterated by Hamza Yusuf Çakır
global const preferred_num_blocks = 12

struct Fannkuch
n::Int64
blocksz::Int64
maxflips::Vector{Int32}
chksums::Vector{Int32}

nfact = factorial(n)

blocksz = nfact ÷ (nfact < preferred_num_blocks ? 1 : preferred_num_blocks)

new(n, blocksz, maxflips, chksums)
end
end

struct Perm
p::Vector{Int8}
pp::Vector{Int8}
count::Vector{Int32}

function Perm(n)
p = zeros(Int8, n)
pp = zeros(Int8, n)
count = zeros(Int32, n)

new(p, pp, count)
end
end

Base.@propagate_inbounds function first_permutation(perm::Perm, idx)
p = perm.p
pp = perm.pp

for i = 2:length(p)
p[i] = i - 1
end

for i = length(p):-1:2
ifact = factorial(i-1)
d = idx ÷ ifact
perm.count[i] = d
idx = idx % ifact

for j = 1:i
pp[j] = p[j]
end

for j = 1:i
p[j] = j+d <= i ? pp[j+d] : pp[j+d-i]
end
end
end

Base.@propagate_inbounds function next_permutation(perm::Perm)
p = perm.p
count = perm.count

first = p[2]
p[2]  = p[1]
p[1]  = first

i = 2
while (count[i] + 1) >= i
count[i] = 0
i += 1

next = p[1] = p[2]

for j = 1:i-1
p[j] = p[j+1]
end

p[i] = first
first = next
end
count[i] += 1
end

Base.@propagate_inbounds @inline function count_flips(perm::Perm)
p = perm.p
pp = perm.pp

flips = 1
first = p[1] + 1
if p[first] != 0

copyto!(pp, 2, p, 2, length(p) - 1)

while true
flips += 1

new_first = pp[first]
pp[first] = first - 1

if first > 3
lo = 2; hi = first-1

# see the note in Jeremy Zerfas' C implementation
for k = 1:15
t = pp[lo]
pp[lo] = pp[hi]
pp[hi] = t

!((lo + 3) <= hi) && break
lo += 1
hi -= 1
end
end

first = new_first + 1
pp[first] == 0 && break
end
end

return flips
end

Base.@propagate_inbounds function run_task(f::Fannkuch, perm::Perm, idxmin, idxmax)

maxflips = 1
chksum = 0

let
i = idxmin
while true
if perm.p[1] != 0
flips = count_flips(perm)
maxflips = max(maxflips, flips)
chksum += iseven(i) ? flips : -flips
end
if i == idxmax
break
end
i += 1
next_permutation(perm)
end
end
f.maxflips[id] = max(f.maxflips[id], maxflips)
f.chksums[id] += chksum
end

function runf(f::Fannkuch)
factn = factorial(f.n)

perm = Perm(f.n)
first_permutation(perm, idxmin)
idxmax = idxmin + f.blocksz - 1
end
end

function fannkuch(n = 12)
runf(f)
# reduce results
chk = sum(f.chksums)
res = maximum(f.maxflips)
(chk, res)
end
@time @benchmark fannkuch($12) 0.5s Language:Julia #= The Computer Language Benchmarks Game https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn: https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html modified for Julia 1.0 by Simon Danisch =# const zerov8 = ntuple(x-> 0f0, 8) # Calculate mandelbrot set for one Vec8 into one byte Base.@propagate_inbounds function mand8(cr, ci) Zr = zerov8 Zi = zerov8 Tr = zerov8 Ti = zerov8 t = zerov8 for i in 0:49 Zi = 2f0 .* Zr .* Zi .+ ci Zr = Tr .- Ti .+ cr Tr = Zr .* Zr Ti = Zi .* Zi t = Tr .+ Ti all(x-> x > 4f0, t) && break end byte = UInt8(0) t[1] <= 4f0 && (byte |= 0x80) t[2] <= 4f0 && (byte |= 0x40) t[3] <= 4f0 && (byte |= 0x20) t[4] <= 4f0 && (byte |= 0x10) t[5] <= 4f0 && (byte |= 0x08) t[6] <= 4f0 && (byte |= 0x04) t[7] <= 4f0 && (byte |= 0x02) t[8] <= 4f0 && (byte |= 0x01) return byte end function mandel_inner(rows, ci, y, N, xvals) @simd for x in 1:8:N @inbounds begin cr = ntuple(i-> xvals[x + (i - 1)], 8) rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci) end end end function mandel(n = 16000) inv_ = 2.0 / n N = n xvals = zeros(Float32, n) yvals = zeros(Float32, n) Threads.@threads for i in 0:(N-1) @inbounds xvals[i + 1] = i * inv_ - 1.5 @inbounds yvals[i + 1] = i * inv_ - 1.0 end rows = zeros(UInt8, n*N÷8) Threads.@threads for y in 1:N @inbounds ci = yvals[y] mandel_inner(rows, ci, y, N, xvals) end end 0.4s Language:Julia # The Computer Language Benchmarks Game # https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ # contributed by Jarret Revels and Alex Arslan # based on the Java version module NBody using Printf using LinearAlgebra # Constants const solar_mass = 4 * pi * pi const days_per_year = 365.24 # Use a 4-tuple here to get better SIMD instructions. # This is a space-time tradeoff, but this benchmark is well within L1 cache limits. struct Vec3 x::NTuple{4, Float64} end Vec3(x, y, z) = Vec3((x,y,z,0.0)) Base.:/(v::Vec3, n::Number) = Vec3(1/n .* v.x) Base.:*(v::Vec3, n::Number) = Vec3(n .* v.x) Base.:-(v1::Vec3, v2::Vec3) = Vec3(v1.x .- v2.x) Base.:+(v1::Vec3, v2::Vec3) = Vec3(v1.x .+ v2.x) # Todo, prettify squarednorm(v1::Vec3) = v1.x[1]^2 + v1.x[2]^2 + v1.x[3]^2 Base.muladd(x::Vec3, y::Number, z::Vec3) = Vec3(muladd.(x.x, y, z.x)) # A heavenly body in the system mutable struct Body pos::Vec3 v::Vec3 mass::Float64 end function offset_momentum!(b::Body, p::Vec3) b.v -= p / solar_mass end function init_sun!(bodies::Vector{Body}) p = Vec3(0.0, 0.0, 0.0) for b in bodies p += b.v * b.mass end offset_momentum!(bodies[1], p) end function advance(bodies::Vector{Body}, dt::Number) @inbounds for i = 1:length(bodies) bi = bodies[i] for j = i+1:length(bodies) bj = bodies[j] delta = bi.pos - bj.pos dsq = squarednorm(delta) distance = sqrt(dsq) mag = dt / (dsq * distance) bi.v = muladd(delta, -(bj.mass * mag), bi.v) bj.v = muladd(delta, (bi.mass * mag), bj.v) end end for b in bodies b.pos = muladd(b.v, dt, b.pos) end end function energy(bodies::Vector{Body}) e = 0.0 @inbounds for i = 1:length(bodies) bi = bodies[i] e += 0.5 * bi.mass * squarednorm(bi.v) for j = i+1:length(bodies) bj = bodies[j] delta = bi.pos - bj.pos distance = sqrt(squarednorm(delta)) dinv = 1.0 / distance e = muladd((bi.mass * bj.mass), -dinv, e) end end return e end function perf_nbody(N::Int=1000) jupiter = Body( Vec3(4.84143144246472090e+00, # pos[1] = x -1.16032004402742839e+00, # pos[2] = y -1.03622044471123109e-01), # pos[3] = z Vec3(1.66007664274403694e-03 * days_per_year, # v[1] = vx 7.69901118419740425e-03 * days_per_year, # v[2] = vy -6.90460016972063023e-05 * days_per_year), # v[3] = vz 9.54791938424326609e-04 * solar_mass) # mass saturn = Body(Vec3(8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01), Vec3(-2.76742510726862411e-03 * days_per_year, 4.99852801234917238e-03 * days_per_year, 2.30417297573763929e-05 * days_per_year), 2.85885980666130812e-04 * solar_mass) uranus = Body(Vec3(1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01), Vec3(2.96460137564761618e-03 * days_per_year, 2.37847173959480950e-03 * days_per_year, -2.96589568540237556e-05 * days_per_year), 4.36624404335156298e-05 * solar_mass) neptune = Body(Vec3(1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01), Vec3(2.68067772490389322e-03 * days_per_year, 1.62824170038242295e-03 * days_per_year, -9.51592254519715870e-05 * days_per_year), 5.15138902046611451e-05 * solar_mass) sun = Body(Vec3(0.0, 0.0, 0.0), Vec3(0.0, 0.0, 0.0), solar_mass) bodies = [sun, jupiter, saturn, uranus, neptune] init_sun!(bodies) for i = 1:N advance(bodies, 0.01) end end end # module nbody(n) = NBody.perf_nbody(n) using BenchmarkTools julia_results = Dict{String, Vector{BenchmarkTools.Trial}}() for (name, n) in ( nbody => 50000000, mandel => 16000, fannkuch => 12, ) julia_results[string(name)] = map(1:N) do i @benchmark$(name)($n) end end # for small function benchmark take double julia_results["sin"] = map(1:(N * 10)) do x @benchmark sin($(Ref(1.0))[])
end
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <emmintrin.h>
#include <time.h>

long numDigits(long n)
{
long len = 0;
while(n)
{
n=n/10;
len++;
}
return len;
}

inline long vec_nle(__m128d *v, double f)
{
return (v[0][0] <= f ||
v[0][1] <= f ||
v[1][0] <= f ||
v[1][1] <= f ||
v[2][0] <= f ||
v[2][1] <= f ||
v[3][0] <= f ||
v[3][1] <= f) ? 0 : -1;
}

inline void clrPixels_nle(__m128d *v, double f, unsigned long * pix8)
{
if(!(v[0][0] <= f)) *pix8 &= 0x7f;
if(!(v[0][1] <= f)) *pix8 &= 0xbf;
if(!(v[1][0] <= f)) *pix8 &= 0xdf;
if(!(v[1][1] <= f)) *pix8 &= 0xef;
if(!(v[2][0] <= f)) *pix8 &= 0xf7;
if(!(v[2][1] <= f)) *pix8 &= 0xfb;
if(!(v[3][0] <= f)) *pix8 &= 0xfd;
if(!(v[3][1] <= f)) *pix8 &= 0xfe;
}

inline void calcSum(__m128d *r, __m128d *i, __m128d *sum, __m128d *init_r, __m128d init_i)
{
for(long pair=0; pair<4; pair++)
{
__m128d r2 = r[pair] * r[pair];
__m128d i2 = i[pair] * i[pair];
__m128d ri = r[pair] * i[pair];

sum[pair] = r2 + i2;

r[pair]=r2 - i2 + init_r[pair];
i[pair]=ri + ri + init_i;
}
}

inline unsigned long mand8(__m128d *init_r, __m128d init_i)
{
__m128d r[4], i[4], sum[4];
for(long pair=0; pair<4; pair++)
{
r[pair]=init_r[pair];
i[pair]=init_i;
}

unsigned long pix8 = 0xff;

for (long j = 0; j < 6; j++)
{
for(long k=0; k<8; k++)
calcSum(r, i, sum, init_r, init_i);

if (vec_nle(sum, 4.0))
{
pix8 = 0x00;
break;
}
}
if (pix8)
{
calcSum(r, i, sum, init_r, init_i);
calcSum(r, i, sum, init_r, init_i);
clrPixels_nle(sum, 4.0, &pix8);
}

return pix8;
}

unsigned long mand64(__m128d *init_r, __m128d init_i)
{
unsigned long pix64 = 0;

for(long byte=0; byte<8; byte++)
{
unsigned long pix8 = mand8(init_r, init_i);

pix64 = (pix64 >> 8) | (pix8 << 56);
init_r += 4;
}

return pix64;
}

void benchmark(int n)
{

// get width/height from arguments
long wid_ht = n;
wid_ht = (wid_ht+7) & ~7;

// allocate memory for header and pixels
long dataLength = headerLength + wid_ht*(wid_ht>>3);
unsigned char * const buffer = malloc(pad + dataLength);

// sprintf((char *)header, "P4\n%ld %ld\n", wid_ht, wid_ht);

// calculate initial values, store in r0, i0

__m128d r0[wid_ht/2];
double i0[wid_ht];

for(long xy=0; xy<wid_ht; xy+=2)
{
r0[xy>>1] = 2.0 / wid_ht * (__m128d){xy,  xy+1} - 1.5;
i0[xy]    = 2.0 / wid_ht *  xy    - 1.0;
i0[xy+1]  = 2.0 / wid_ht * (xy+1) - 1.0;
}

// generate the bitmap

long use8 = wid_ht%64;
if (use8)
{
// process 8 pixels (one byte) at a time
#pragma omp parallel for schedule(guided)
for(long y=0; y<wid_ht; y++)
{
__m128d init_i = (__m128d){i0[y], i0[y]};
long rowstart = y*wid_ht/8;
for(long x=0; x<wid_ht; x+=8)
{
pixels[rowstart + x/8] = mand8(r0+x/2, init_i);
}
}
}
else
{
// process 64 pixels (8 bytes) at a time
#pragma omp parallel for schedule(guided)
for(long y=0; y<wid_ht; y++)
{
__m128d init_i = (__m128d){i0[y], i0[y]};
long rowstart = y*wid_ht/64;
for(long x=0; x<wid_ht; x+=64)
{
((unsigned long *)pixels)[rowstart + x/64] = mand64(r0+x/2, init_i);
}
}
}

// write the data

// long ret = ret = write(STDOUT_FILENO, header, dataLength);

free(buffer);

}
extern void benchmark(int);  
mandel.c
Julia
// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// Contributed by Jeremy Zerfas
// Based on the Ada program by Jonathan Parker and Georg Bauhaus which in turn
// was based on code by Dave Fladebo, Eckehard Berns, Heiner Marxen, Hongwei Xi,
// and The Anh Tran and also the Java program by Oleg Mazurov.

// This value controls how many blocks the workload is broken up into (as long
// as the value is less than or equal to the factorial of the argument to this
// program) in order to allow the blocks to be processed in parallel if
// possible. PREFERRED_NUMBER_OF_BLOCKS_TO_USE should be some number which
// divides evenly into all factorials larger than it. It should also be around
// 2-8 times the amount of threads you want to use in order to create enough
#define PREFERRED_NUMBER_OF_BLOCKS_TO_USE 12

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>

// intptr_t should be the native integer type on most sane systems.
typedef intptr_t intnative_t;

void benchmark(int n){

// Create and initialize factorial_Lookup_Table.
intnative_t factorial_Lookup_Table[n+1];
factorial_Lookup_Table[0]=1;
for(intnative_t i=0; ++i<=n;)
factorial_Lookup_Table[i]=i*factorial_Lookup_Table[i-1];

// Determine the block_Size to use. If n! is less than
// PREFERRED_NUMBER_OF_BLOCKS_TO_USE then just use a single block to prevent
// block_Size from being set to 0. This also causes smaller values of n to
// be computed serially which is faster and uses less resources for small
// values of n.
const intnative_t block_Size=factorial_Lookup_Table[n]/
(factorial_Lookup_Table[n]<PREFERRED_NUMBER_OF_BLOCKS_TO_USE ?
1 : PREFERRED_NUMBER_OF_BLOCKS_TO_USE);

intnative_t maximum_Flip_Count=0, checksum=0;

// Iterate over each block.
#pragma omp parallel for \
reduction(max:maximum_Flip_Count) reduction(+:checksum)
for(intnative_t initial_Permutation_Index_For_Block=0;
initial_Permutation_Index_For_Block<factorial_Lookup_Table[n];
initial_Permutation_Index_For_Block+=block_Size){

intnative_t count[n];
int8_t temp_Permutation[n], current_Permutation[n];

// Initialize count and current_Permutation.
count[0]=0;
for(intnative_t i=0; i<n; ++i)
current_Permutation[i]=i;
for(intnative_t i=n-1,
permutation_Index=initial_Permutation_Index_For_Block; i>0; --i){
const intnative_t d=permutation_Index/factorial_Lookup_Table[i];
permutation_Index=permutation_Index%factorial_Lookup_Table[i];
count[i]=d;

for(intnative_t j=0; j<n; ++j)
temp_Permutation[j]=current_Permutation[j];
for(intnative_t j=0; j<=i; ++j)
current_Permutation[j]= j+d<=i ?
temp_Permutation[j+d] : temp_Permutation[j+d-i-1];
}

// Iterate over each permutation in the block.
const intnative_t last_Permutation_Index_In_Block=
initial_Permutation_Index_For_Block+block_Size-1;
for(intnative_t permutation_Index=initial_Permutation_Index_For_Block; ;
++permutation_Index){

// If the first value in the current_Permutation is not 1 (0) then
// we will need to do at least one flip for the current_Permutation.
if(current_Permutation[0]>0){

// Make a copy of current_Permutation[] to work on. Note that we
// don't need to copy the first value since that will be stored
// in a separate variable since it gets used a lot.
for(intnative_t i=0; ++i<n;)
temp_Permutation[i]=current_Permutation[i];

intnative_t flip_Count=1;

// Flip temp_Permutation until the element at the first_Value
// index is 1 (0).
for(intnative_t first_Value=current_Permutation[0];
temp_Permutation[first_Value]>0; ++flip_Count){

// Record the new_First_Value and restore the old
// first_Value at its new flipped position.
const int8_t new_First_Value=temp_Permutation[first_Value];
temp_Permutation[first_Value]=first_Value;

// If first_Value is greater than 3 (2) then we are flipping
// a series of four or more values so we will also need to
// flip additional elements in the middle of the
// temp_Permutation.
if(first_Value>2){
intnative_t low_Index=1, high_Index=first_Value-1;
// Note that this loop is written so that it will run at
// most 16 times so that compilers will be more willing
// to unroll it. Consequently this won't work right when
// n is greater than 35. This would probably be the
// least of your concerns since 21! won't fit into 64
// bit integers and even if it did you probably wouldn't
// want to run this program with a value that large
// since it would take thousands of years to do on a
// modern desktop computer. ;-)
do{
const int8_t temp=temp_Permutation[high_Index];
temp_Permutation[high_Index]=
temp_Permutation[low_Index];
temp_Permutation[low_Index]=temp;
}while(low_Index+++3<=high_Index-- && low_Index<16);
}

// Update first_Value to new_First_Value that we recorded
// earlier.
first_Value=new_First_Value;
}

// Update the checksum.
if(permutation_Index%2==0)
checksum+=flip_Count;
else
checksum-=flip_Count;

// Update maximum_Flip_Count if necessary.
if(flip_Count>maximum_Flip_Count)
maximum_Flip_Count=flip_Count;
}

// Break out of the loop when we get to the
// last_Permutation_Index_In_Block.
if(permutation_Index>=last_Permutation_Index_In_Block)
break;

// Generate the next permutation.
int8_t first_Value=current_Permutation[1];
current_Permutation[1]=current_Permutation[0];
current_Permutation[0]=first_Value;
for(intnative_t i=1; ++count[i]>i;){
count[i++]=0;
const int8_t new_First_Value=current_Permutation[0]=
current_Permutation[1];

for(intnative_t j=0; ++j<i;)
current_Permutation[j]=current_Permutation[j+1];

current_Permutation[i]=first_Value;
first_Value=new_First_Value;
}
}
}
}

extern void benchmark(int);  
fannkuch.c
Julia
/* The Computer Language Benchmarks Game
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/

contributed by Mark C. Lewis
converted from java to c++,added sse support, by Branimir Maksimovic
converted from c++ to c, by Alexey Medvedchikov
*/

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <immintrin.h>
#include <time.h>

#define PI 3.141592653589793
#define SOLAR_MASS ( 4 * PI * PI )
#define DAYS_PER_YEAR 365.24

struct body {
double x[3], fill, v[3], mass;
};

static struct body solar_bodies[] = {
/* sun */
{
.x = { 0., 0., 0. },
.v = { 0., 0., 0. },
.mass = SOLAR_MASS
},
/* jupiter */
{
.x = { 4.84143144246472090e+00,
-1.16032004402742839e+00,
-1.03622044471123109e-01 },
.v = { 1.66007664274403694e-03 * DAYS_PER_YEAR,
7.69901118419740425e-03 * DAYS_PER_YEAR,
-6.90460016972063023e-05 * DAYS_PER_YEAR },
.mass = 9.54791938424326609e-04 * SOLAR_MASS
},
/* saturn */
{
.x = { 8.34336671824457987e+00,
4.12479856412430479e+00,
-4.03523417114321381e-01 },
.v = { -2.76742510726862411e-03 * DAYS_PER_YEAR,
4.99852801234917238e-03 * DAYS_PER_YEAR,
2.30417297573763929e-05 * DAYS_PER_YEAR },
.mass = 2.85885980666130812e-04 * SOLAR_MASS
},
/* uranus */
{
.x = { 1.28943695621391310e+01,
-1.51111514016986312e+01,
-2.23307578892655734e-01 },
.v = { 2.96460137564761618e-03 * DAYS_PER_YEAR,
2.37847173959480950e-03 * DAYS_PER_YEAR,
-2.96589568540237556e-05 * DAYS_PER_YEAR },
.mass = 4.36624404335156298e-05 * SOLAR_MASS
},
/* neptune */
{
.x = { 1.53796971148509165e+01,
-2.59193146099879641e+01,
1.79258772950371181e-01 },
.v = { 2.68067772490389322e-03 * DAYS_PER_YEAR,
1.62824170038242295e-03 * DAYS_PER_YEAR,
-9.51592254519715870e-05 * DAYS_PER_YEAR },
.mass = 5.15138902046611451e-05 * SOLAR_MASS
}
};

static const int BODIES_SIZE = sizeof(solar_bodies) / sizeof(solar_bodies[0]);

void offset_momentum(struct body *bodies, unsigned int nbodies)
{
unsigned int i, k;
for (i = 0; i < nbodies; ++i)
for (k = 0; k < 3; ++k)
bodies[0].v[k] -= bodies[i].v[k] * bodies[i].mass
/ SOLAR_MASS;
}

void bodies_advance(struct body *bodies, unsigned int nbodies, double dt)
{
unsigned int N = (nbodies - 1) * nbodies / 2;
static struct {
double dx[3], fill;
} r[1000];
static __attribute__((aligned(16))) double mag[1000];
unsigned int i, j, k, m;
__m128d dx[3], dsquared, distance, dmag;

for(k = 0, i = 0; i < nbodies - 1; ++i)
for(j = i + 1; j < nbodies; ++j, ++k)
for ( m = 0; m < 3; ++m)
r[k].dx[m] = bodies[i].x[m] - bodies[j].x[m];

for (i = 0; i < N; i += 2) {
for (m = 0; m < 3; ++m) {
}

dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
distance = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dsquared)));

for (j = 0; j < 2; ++j)
distance = distance * _mm_set1_pd(1.5)
- ((_mm_set1_pd(0.5) * dsquared) * distance)
* (distance * distance);

dmag = _mm_set1_pd(dt) / (dsquared) * distance;
_mm_store_pd(&mag[i], dmag);
}

for (i = 0, k = 0; i < nbodies - 1; ++i)
for ( j = i + 1; j < nbodies; ++j, ++k)
for ( m = 0; m < 3; ++m) {
bodies[i].v[m] -= r[k].dx[m] * bodies[j].mass
* mag[k];
bodies[j].v[m] += r[k].dx[m] * bodies[i].mass
* mag[k];
}

for (i = 0; i < nbodies; ++i)
for ( m = 0; m < 3; ++m)
bodies[i].x[m] += dt * bodies[i].v[m];
}

double bodies_energy(struct body *bodies, unsigned int nbodies) {
double dx[3], distance, e = 0.0;
unsigned int i, j, k;

for (i=0; i < nbodies; ++i) {
e += bodies[i].mass * ( bodies[i].v[0] * bodies[i].v[0]
+ bodies[i].v[1] * bodies[i].v[1]
+ bodies[i].v[2] * bodies[i].v[2] ) / 2.;

for (j=i+1; j < nbodies; ++j) {
for (k = 0; k < 3; ++k)
dx[k] = bodies[i].x[k] - bodies[j].x[k];

distance = sqrt(dx[0] * dx[0] + dx[1] * dx[1]
+ dx[2] * dx[2]);
e -= (bodies[i].mass * bodies[j].mass) / distance;
}
}
return e;
}

void benchmark(int n)
{
int i;
offset_momentum(solar_bodies, BODIES_SIZE);
for (i = 0; i < n; ++i){
}
}

extern void benchmark(int n);                                              
nbody.c
Julia
run(gcc -c -fpic -Wall -O3 -fomit-frame-pointer -march=native -mno-fma -fno-finite-math-only -fopenmp mandel.c -o mandel.o)
run(gcc -c -fpic -Wall -O3 -fomit-frame-pointer -march=native -mfpmath=sse -msse3 nbody.c -o nbody.o -lm)
run(gcc -c -fpic -Wall -O3 -fomit-frame-pointer -march=native -fopenmp fannkuch.c -o fannkuch.o)
for name in ("fannkuch", "nbody", "mandel")
run(gcc -shared -mno-fma -fno-finite-math-only -mfpmath=sse -msse3 -fopenmp -fomit-frame-pointer -march=native -O3 -o $name.so$name.o)
cname = Symbol("c_$name") @eval$cname(n) = ccall((:benchmark, $("/$name.so")), Cvoid, (Cint,), n)
end
c_results = Dict{String, Vector{BenchmarkTools.Trial}}()
for (name, n) in (
c_nbody => 50000000,
c_mandel => 16000,
c_fannkuch => 12,
)
c_results[replace(string(name), "c_" => "")] = map(1:N) do i
@benchmark $(name)($n)
end
end
using Serialization
open("results/result.jls", "w") do io
serialize(io, Dict(
"julia" => julia_results,
"c" => c_results
))
end
result.jls