Simon Danisch / Nov 27 2018
Remix of Julia Shared by Simon Danisch
Julia Dedicated
Julia Dedicated
using BenchmarkTools, InteractiveUtils versioninfo()
global N = 100
0.8s
# The Computer Language Benchmarks Game # https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ # based on Oleg Mazurov's Java Implementation and Jeremy Zerfas' C implementation # transliterated by Hamza Yusuf Çakır global const preferred_num_blocks = 12 struct Fannkuch n::Int64 blocksz::Int64 maxflips::Vector{Int32} chksums::Vector{Int32} function Fannkuch(n, nthreads) nfact = factorial(n) blocksz = nfact ÷ (nfact < preferred_num_blocks ? 1 : preferred_num_blocks) maxflips = zeros(Int32,nthreads) chksums = zeros(Int32, nthreads) new(n, blocksz, maxflips, chksums) end end struct Perm p::Vector{Int8} pp::Vector{Int8} count::Vector{Int32} function Perm(n) p = zeros(Int8, n) pp = zeros(Int8, n) count = zeros(Int32, n) new(p, pp, count) end end Base. function first_permutation(perm::Perm, idx) p = perm.p pp = perm.pp for i = 2:length(p) p[i] = i - 1 end for i = length(p):-1:2 ifact = factorial(i-1) d = idx ÷ ifact perm.count[i] = d idx = idx % ifact for j = 1:i pp[j] = p[j] end for j = 1:i p[j] = j+d <= i ? pp[j+d] : pp[j+d-i] end end end Base. function next_permutation(perm::Perm) p = perm.p count = perm.count first = p[2] p[2] = p[1] p[1] = first i = 2 while (count[i] + 1) >= i count[i] = 0 i += 1 next = p[1] = p[2] for j = 1:i-1 p[j] = p[j+1] end p[i] = first first = next end count[i] += 1 end Base. function count_flips(perm::Perm) p = perm.p pp = perm.pp flips = 1 first = p[1] + 1 if p[first] != 0 copyto!(pp, 2, p, 2, length(p) - 1) while true flips += 1 new_first = pp[first] pp[first] = first - 1 if first > 3 lo = 2; hi = first-1 # see the note in Jeremy Zerfas' C implementation for k = 1:15 t = pp[lo] pp[lo] = pp[hi] pp[hi] = t !((lo + 3) <= hi) && break lo += 1 hi -= 1 end end first = new_first + 1 pp[first] == 0 && break end end return flips end Base. function run_task(f::Fannkuch, perm::Perm, idxmin, idxmax) maxflips = 1 chksum = 0 let i = idxmin while true if perm.p[1] != 0 flips = count_flips(perm) maxflips = max(maxflips, flips) chksum += iseven(i) ? flips : -flips end if i == idxmax break end i += 1 next_permutation(perm) end end id = Threads.threadid() f.maxflips[id] = max(f.maxflips[id], maxflips) f.chksums[id] += chksum end function runf(f::Fannkuch) factn = factorial(f.n) Threads. for idxmin=0:f.blocksz:factn-1 perm = Perm(f.n) first_permutation(perm, idxmin) idxmax = idxmin + f.blocksz - 1 run_task(f, perm, idxmin, idxmax) end end function fannkuch(n = 12) f = Fannkuch(n,Threads.nthreads()) runf(f) # reduce results chk = sum(f.chksums) res = maximum(f.maxflips) (chk, res) end
0.4s
#= The Computer Language Benchmarks Game https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn: https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html modified for Julia 1.0 by Simon Danisch =# const zerov8 = ntuple(x-> 0f0, 8) # Calculate mandelbrot set for one Vec8 into one byte Base. function mand8(cr, ci) Zr = zerov8 Zi = zerov8 Tr = zerov8 Ti = zerov8 t = zerov8 for i in 0:49 Zi = 2f0 .* Zr .* Zi .+ ci Zr = Tr .- Ti .+ cr Tr = Zr .* Zr Ti = Zi .* Zi t = Tr .+ Ti all(x-> x > 4f0, t) && break end byte = UInt8(0) t[1] <= 4f0 && (byte |= 0x80) t[2] <= 4f0 && (byte |= 0x40) t[3] <= 4f0 && (byte |= 0x20) t[4] <= 4f0 && (byte |= 0x10) t[5] <= 4f0 && (byte |= 0x08) t[6] <= 4f0 && (byte |= 0x04) t[7] <= 4f0 && (byte |= 0x02) t[8] <= 4f0 && (byte |= 0x01) return byte end function mandel_inner(rows, ci, y, N, xvals) for x in 1:8:N begin cr = ntuple(i-> xvals[x + (i - 1)], 8) rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci) end end end function mandel(n = 16000) inv_ = 2.0 / n N = n xvals = zeros(Float32, n) yvals = zeros(Float32, n) Threads. for i in 0:(N-1) xvals[i + 1] = i * inv_ - 1.5 yvals[i + 1] = i * inv_ - 1.0 end rows = zeros(UInt8, n*N÷8) Threads. for y in 1:N ci = yvals[y] mandel_inner(rows, ci, y, N, xvals) end end
0.4s
# The Computer Language Benchmarks Game # https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ # contributed by Jarret Revels and Alex Arslan # based on the Java version module NBody using Printf using LinearAlgebra # Constants const solar_mass = 4 * pi * pi const days_per_year = 365.24 # Use a 4-tuple here to get better SIMD instructions. # This is a space-time tradeoff, but this benchmark is well within L1 cache limits. struct Vec3 x::NTuple{4, Float64} end Vec3(x, y, z) = Vec3((x,y,z,0.0)) Base.:/(v::Vec3, n::Number) = Vec3(1/n .* v.x) Base.:*(v::Vec3, n::Number) = Vec3(n .* v.x) Base.:-(v1::Vec3, v2::Vec3) = Vec3(v1.x .- v2.x) Base.:+(v1::Vec3, v2::Vec3) = Vec3(v1.x .+ v2.x) # Todo, prettify squarednorm(v1::Vec3) = v1.x[1]^2 + v1.x[2]^2 + v1.x[3]^2 Base.muladd(x::Vec3, y::Number, z::Vec3) = Vec3(muladd.(x.x, y, z.x)) # A heavenly body in the system mutable struct Body pos::Vec3 v::Vec3 mass::Float64 end function offset_momentum!(b::Body, p::Vec3) b.v -= p / solar_mass end function init_sun!(bodies::Vector{Body}) p = Vec3(0.0, 0.0, 0.0) for b in bodies p += b.v * b.mass end offset_momentum!(bodies[1], p) end function advance(bodies::Vector{Body}, dt::Number) for i = 1:length(bodies) bi = bodies[i] for j = i+1:length(bodies) bj = bodies[j] delta = bi.pos - bj.pos dsq = squarednorm(delta) distance = sqrt(dsq) mag = dt / (dsq * distance) bi.v = muladd(delta, -(bj.mass * mag), bi.v) bj.v = muladd(delta, (bi.mass * mag), bj.v) end end for b in bodies b.pos = muladd(b.v, dt, b.pos) end end function energy(bodies::Vector{Body}) e = 0.0 for i = 1:length(bodies) bi = bodies[i] e += 0.5 * bi.mass * squarednorm(bi.v) for j = i+1:length(bodies) bj = bodies[j] delta = bi.pos - bj.pos distance = sqrt(squarednorm(delta)) dinv = 1.0 / distance e = muladd((bi.mass * bj.mass), -dinv, e) end end return e end function perf_nbody(N::Int=1000) jupiter = Body( Vec3(4.84143144246472090e+00, # pos[1] = x -1.16032004402742839e+00, # pos[2] = y -1.03622044471123109e-01), # pos[3] = z Vec3(1.66007664274403694e-03 * days_per_year, # v[1] = vx 7.69901118419740425e-03 * days_per_year, # v[2] = vy -6.90460016972063023e-05 * days_per_year), # v[3] = vz 9.54791938424326609e-04 * solar_mass) # mass saturn = Body(Vec3(8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01), Vec3(-2.76742510726862411e-03 * days_per_year, 4.99852801234917238e-03 * days_per_year, 2.30417297573763929e-05 * days_per_year), 2.85885980666130812e-04 * solar_mass) uranus = Body(Vec3(1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01), Vec3(2.96460137564761618e-03 * days_per_year, 2.37847173959480950e-03 * days_per_year, -2.96589568540237556e-05 * days_per_year), 4.36624404335156298e-05 * solar_mass) neptune = Body(Vec3(1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01), Vec3(2.68067772490389322e-03 * days_per_year, 1.62824170038242295e-03 * days_per_year, -9.51592254519715870e-05 * days_per_year), 5.15138902046611451e-05 * solar_mass) sun = Body(Vec3(0.0, 0.0, 0.0), Vec3(0.0, 0.0, 0.0), solar_mass) bodies = [sun, jupiter, saturn, uranus, neptune] init_sun!(bodies) for i = 1:N advance(bodies, 0.01) end end end # module nbody(n) = NBody.perf_nbody(n)
using BenchmarkTools julia_results = Dict{String, Vector{BenchmarkTools.Trial}}() for (name, n) in ( nbody => 50000000, mandel => 16000, fannkuch => 12, ) julia_results[string(name)] = map(1:N) do i $(name)($n) end end
# for small function benchmark take double julia_results["sin"] = map(1:(N * 2)) do x sin($(Ref(1.0))[]) end
#include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <emmintrin.h> #include <time.h> long numDigits(long n) { long len = 0; while(n) { n=n/10; len++; } return len; } inline long vec_nle(__m128d *v, double f) { return (v[0][0] <= f || v[0][1] <= f || v[1][0] <= f || v[1][1] <= f || v[2][0] <= f || v[2][1] <= f || v[3][0] <= f || v[3][1] <= f) ? 0 : -1; } inline void clrPixels_nle(__m128d *v, double f, unsigned long * pix8) { if(!(v[0][0] <= f)) *pix8 &= 0x7f; if(!(v[0][1] <= f)) *pix8 &= 0xbf; if(!(v[1][0] <= f)) *pix8 &= 0xdf; if(!(v[1][1] <= f)) *pix8 &= 0xef; if(!(v[2][0] <= f)) *pix8 &= 0xf7; if(!(v[2][1] <= f)) *pix8 &= 0xfb; if(!(v[3][0] <= f)) *pix8 &= 0xfd; if(!(v[3][1] <= f)) *pix8 &= 0xfe; } inline void calcSum(__m128d *r, __m128d *i, __m128d *sum, __m128d *init_r, __m128d init_i) { for(long pair=0; pair<4; pair++) { __m128d r2 = r[pair] * r[pair]; __m128d i2 = i[pair] * i[pair]; __m128d ri = r[pair] * i[pair]; sum[pair] = r2 + i2; r[pair]=r2 - i2 + init_r[pair]; i[pair]=ri + ri + init_i; } } inline unsigned long mand8(__m128d *init_r, __m128d init_i) { __m128d r[4], i[4], sum[4]; for(long pair=0; pair<4; pair++) { r[pair]=init_r[pair]; i[pair]=init_i; } unsigned long pix8 = 0xff; for (long j = 0; j < 6; j++) { for(long k=0; k<8; k++) calcSum(r, i, sum, init_r, init_i); if (vec_nle(sum, 4.0)) { pix8 = 0x00; break; } } if (pix8) { calcSum(r, i, sum, init_r, init_i); calcSum(r, i, sum, init_r, init_i); clrPixels_nle(sum, 4.0, &pix8); } return pix8; } unsigned long mand64(__m128d *init_r, __m128d init_i) { unsigned long pix64 = 0; for(long byte=0; byte<8; byte++) { unsigned long pix8 = mand8(init_r, init_i); pix64 = (pix64 >> 8) | (pix8 << 56); init_r += 4; } return pix64; } void benchmark(int n) { // get width/height from arguments long wid_ht = n; wid_ht = (wid_ht+7) & ~7; // allocate memory for header and pixels long headerLength = numDigits(wid_ht)*2+5; long pad = ((headerLength + 7) & ~7) - headerLength; // pad aligns pixels on 8 long dataLength = headerLength + wid_ht*(wid_ht>>3); unsigned char * const buffer = malloc(pad + dataLength); unsigned char * const header = buffer + pad; unsigned char * const pixels = header + headerLength; // generate the bitmap header // sprintf((char *)header, "P4\n%ld %ld\n", wid_ht, wid_ht); // calculate initial values, store in r0, i0 __m128d r0[wid_ht/2]; double i0[wid_ht]; for(long xy=0; xy<wid_ht; xy+=2) { r0[xy>>1] = 2.0 / wid_ht * (__m128d){xy, xy+1} - 1.5; i0[xy] = 2.0 / wid_ht * xy - 1.0; i0[xy+1] = 2.0 / wid_ht * (xy+1) - 1.0; } // generate the bitmap long use8 = wid_ht%64; if (use8) { // process 8 pixels (one byte) at a time #pragma omp parallel for schedule(guided) for(long y=0; y<wid_ht; y++) { __m128d init_i = (__m128d){i0[y], i0[y]}; long rowstart = y*wid_ht/8; for(long x=0; x<wid_ht; x+=8) { pixels[rowstart + x/8] = mand8(r0+x/2, init_i); } } } else { // process 64 pixels (8 bytes) at a time #pragma omp parallel for schedule(guided) for(long y=0; y<wid_ht; y++) { __m128d init_i = (__m128d){i0[y], i0[y]}; long rowstart = y*wid_ht/64; for(long x=0; x<wid_ht; x+=64) { ((unsigned long *)pixels)[rowstart + x/64] = mand64(r0+x/2, init_i); } } } // write the data // long ret = ret = write(STDOUT_FILENO, header, dataLength); free(buffer); } extern void benchmark(int);
mandel.c
Julia
// The Computer Language Benchmarks Game // https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ // // Contributed by Jeremy Zerfas // Based on the Ada program by Jonathan Parker and Georg Bauhaus which in turn // was based on code by Dave Fladebo, Eckehard Berns, Heiner Marxen, Hongwei Xi, // and The Anh Tran and also the Java program by Oleg Mazurov. // This value controls how many blocks the workload is broken up into (as long // as the value is less than or equal to the factorial of the argument to this // program) in order to allow the blocks to be processed in parallel if // possible. PREFERRED_NUMBER_OF_BLOCKS_TO_USE should be some number which // divides evenly into all factorials larger than it. It should also be around // 2-8 times the amount of threads you want to use in order to create enough // blocks to more evenly distribute the workload amongst the threads. #define PREFERRED_NUMBER_OF_BLOCKS_TO_USE 12 #include <stdint.h> #include <stdlib.h> #include <stdio.h> #include <time.h> // intptr_t should be the native integer type on most sane systems. typedef intptr_t intnative_t; void benchmark(int n){ // Create and initialize factorial_Lookup_Table. intnative_t factorial_Lookup_Table[n+1]; factorial_Lookup_Table[0]=1; for(intnative_t i=0; ++i<=n;) factorial_Lookup_Table[i]=i*factorial_Lookup_Table[i-1]; // Determine the block_Size to use. If n! is less than // PREFERRED_NUMBER_OF_BLOCKS_TO_USE then just use a single block to prevent // block_Size from being set to 0. This also causes smaller values of n to // be computed serially which is faster and uses less resources for small // values of n. const intnative_t block_Size=factorial_Lookup_Table[n]/ (factorial_Lookup_Table[n]<PREFERRED_NUMBER_OF_BLOCKS_TO_USE ? 1 : PREFERRED_NUMBER_OF_BLOCKS_TO_USE); intnative_t maximum_Flip_Count=0, checksum=0; // Iterate over each block. #pragma omp parallel for \ reduction(max:maximum_Flip_Count) reduction(+:checksum) for(intnative_t initial_Permutation_Index_For_Block=0; initial_Permutation_Index_For_Block<factorial_Lookup_Table[n]; initial_Permutation_Index_For_Block+=block_Size){ intnative_t count[n]; int8_t temp_Permutation[n], current_Permutation[n]; // Initialize count and current_Permutation. count[0]=0; for(intnative_t i=0; i<n; ++i) current_Permutation[i]=i; for(intnative_t i=n-1, permutation_Index=initial_Permutation_Index_For_Block; i>0; --i){ const intnative_t d=permutation_Index/factorial_Lookup_Table[i]; permutation_Index=permutation_Index%factorial_Lookup_Table[i]; count[i]=d; for(intnative_t j=0; j<n; ++j) temp_Permutation[j]=current_Permutation[j]; for(intnative_t j=0; j<=i; ++j) current_Permutation[j]= j+d<=i ? temp_Permutation[j+d] : temp_Permutation[j+d-i-1]; } // Iterate over each permutation in the block. const intnative_t last_Permutation_Index_In_Block= initial_Permutation_Index_For_Block+block_Size-1; for(intnative_t permutation_Index=initial_Permutation_Index_For_Block; ; ++permutation_Index){ // If the first value in the current_Permutation is not 1 (0) then // we will need to do at least one flip for the current_Permutation. if(current_Permutation[0]>0){ // Make a copy of current_Permutation[] to work on. Note that we // don't need to copy the first value since that will be stored // in a separate variable since it gets used a lot. for(intnative_t i=0; ++i<n;) temp_Permutation[i]=current_Permutation[i]; intnative_t flip_Count=1; // Flip temp_Permutation until the element at the first_Value // index is 1 (0). for(intnative_t first_Value=current_Permutation[0]; temp_Permutation[first_Value]>0; ++flip_Count){ // Record the new_First_Value and restore the old // first_Value at its new flipped position. const int8_t new_First_Value=temp_Permutation[first_Value]; temp_Permutation[first_Value]=first_Value; // If first_Value is greater than 3 (2) then we are flipping // a series of four or more values so we will also need to // flip additional elements in the middle of the // temp_Permutation. if(first_Value>2){ intnative_t low_Index=1, high_Index=first_Value-1; // Note that this loop is written so that it will run at // most 16 times so that compilers will be more willing // to unroll it. Consequently this won't work right when // n is greater than 35. This would probably be the // least of your concerns since 21! won't fit into 64 // bit integers and even if it did you probably wouldn't // want to run this program with a value that large // since it would take thousands of years to do on a // modern desktop computer. ;-) do{ const int8_t temp=temp_Permutation[high_Index]; temp_Permutation[high_Index]= temp_Permutation[low_Index]; temp_Permutation[low_Index]=temp; }while(low_Index+++3<=high_Index-- && low_Index<16); } // Update first_Value to new_First_Value that we recorded // earlier. first_Value=new_First_Value; } // Update the checksum. if(permutation_Index%2==0) checksum+=flip_Count; else checksum-=flip_Count; // Update maximum_Flip_Count if necessary. if(flip_Count>maximum_Flip_Count) maximum_Flip_Count=flip_Count; } // Break out of the loop when we get to the // last_Permutation_Index_In_Block. if(permutation_Index>=last_Permutation_Index_In_Block) break; // Generate the next permutation. int8_t first_Value=current_Permutation[1]; current_Permutation[1]=current_Permutation[0]; current_Permutation[0]=first_Value; for(intnative_t i=1; ++count[i]>i;){ count[i++]=0; const int8_t new_First_Value=current_Permutation[0]= current_Permutation[1]; for(intnative_t j=0; ++j<i;) current_Permutation[j]=current_Permutation[j+1]; current_Permutation[i]=first_Value; first_Value=new_First_Value; } } } } extern void benchmark(int);
fannkuch.c
Julia
/* The Computer Language Benchmarks Game https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ contributed by Mark C. Lewis modified slightly by Chad Whipkey converted from java to c++,added sse support, by Branimir Maksimovic converted from c++ to c, by Alexey Medvedchikov */ #include <stdio.h> #include <math.h> #include <stdlib.h> #include <immintrin.h> #include <time.h> #define PI 3.141592653589793 #define SOLAR_MASS ( 4 * PI * PI ) #define DAYS_PER_YEAR 365.24 struct body { double x[3], fill, v[3], mass; }; static struct body solar_bodies[] = { /* sun */ { .x = { 0., 0., 0. }, .v = { 0., 0., 0. }, .mass = SOLAR_MASS }, /* jupiter */ { .x = { 4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01 }, .v = { 1.66007664274403694e-03 * DAYS_PER_YEAR, 7.69901118419740425e-03 * DAYS_PER_YEAR, -6.90460016972063023e-05 * DAYS_PER_YEAR }, .mass = 9.54791938424326609e-04 * SOLAR_MASS }, /* saturn */ { .x = { 8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01 }, .v = { -2.76742510726862411e-03 * DAYS_PER_YEAR, 4.99852801234917238e-03 * DAYS_PER_YEAR, 2.30417297573763929e-05 * DAYS_PER_YEAR }, .mass = 2.85885980666130812e-04 * SOLAR_MASS }, /* uranus */ { .x = { 1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01 }, .v = { 2.96460137564761618e-03 * DAYS_PER_YEAR, 2.37847173959480950e-03 * DAYS_PER_YEAR, -2.96589568540237556e-05 * DAYS_PER_YEAR }, .mass = 4.36624404335156298e-05 * SOLAR_MASS }, /* neptune */ { .x = { 1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01 }, .v = { 2.68067772490389322e-03 * DAYS_PER_YEAR, 1.62824170038242295e-03 * DAYS_PER_YEAR, -9.51592254519715870e-05 * DAYS_PER_YEAR }, .mass = 5.15138902046611451e-05 * SOLAR_MASS } }; static const int BODIES_SIZE = sizeof(solar_bodies) / sizeof(solar_bodies[0]); void offset_momentum(struct body *bodies, unsigned int nbodies) { unsigned int i, k; for (i = 0; i < nbodies; ++i) for (k = 0; k < 3; ++k) bodies[0].v[k] -= bodies[i].v[k] * bodies[i].mass / SOLAR_MASS; } void bodies_advance(struct body *bodies, unsigned int nbodies, double dt) { unsigned int N = (nbodies - 1) * nbodies / 2; static struct { double dx[3], fill; } r[1000]; static __attribute__((aligned(16))) double mag[1000]; unsigned int i, j, k, m; __m128d dx[3], dsquared, distance, dmag; for(k = 0, i = 0; i < nbodies - 1; ++i) for(j = i + 1; j < nbodies; ++j, ++k) for ( m = 0; m < 3; ++m) r[k].dx[m] = bodies[i].x[m] - bodies[j].x[m]; for (i = 0; i < N; i += 2) { for (m = 0; m < 3; ++m) { dx[m] = _mm_loadl_pd(dx[m], &r[i].dx[m]); dx[m] = _mm_loadh_pd(dx[m], &r[i+1].dx[m]); } dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; distance = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dsquared))); for (j = 0; j < 2; ++j) distance = distance * _mm_set1_pd(1.5) - ((_mm_set1_pd(0.5) * dsquared) * distance) * (distance * distance); dmag = _mm_set1_pd(dt) / (dsquared) * distance; _mm_store_pd(&mag[i], dmag); } for (i = 0, k = 0; i < nbodies - 1; ++i) for ( j = i + 1; j < nbodies; ++j, ++k) for ( m = 0; m < 3; ++m) { bodies[i].v[m] -= r[k].dx[m] * bodies[j].mass * mag[k]; bodies[j].v[m] += r[k].dx[m] * bodies[i].mass * mag[k]; } for (i = 0; i < nbodies; ++i) for ( m = 0; m < 3; ++m) bodies[i].x[m] += dt * bodies[i].v[m]; } double bodies_energy(struct body *bodies, unsigned int nbodies) { double dx[3], distance, e = 0.0; unsigned int i, j, k; for (i=0; i < nbodies; ++i) { e += bodies[i].mass * ( bodies[i].v[0] * bodies[i].v[0] + bodies[i].v[1] * bodies[i].v[1] + bodies[i].v[2] * bodies[i].v[2] ) / 2.; for (j=i+1; j < nbodies; ++j) { for (k = 0; k < 3; ++k) dx[k] = bodies[i].x[k] - bodies[j].x[k]; distance = sqrt(dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]); e -= (bodies[i].mass * bodies[j].mass) / distance; } } return e; } void benchmark(int n) { int i; offset_momentum(solar_bodies, BODIES_SIZE); for (i = 0; i < n; ++i){ bodies_advance(solar_bodies, BODIES_SIZE, 0.01); } } extern void benchmark(int n);
nbody.c
Julia
run(`gcc -c -fpic -Wall -O3 -fomit-frame-pointer -march=native -mno-fma -fno-finite-math-only -fopenmp mandel.c -o mandel.o`) run(`gcc -c -fpic -Wall -O3 -fomit-frame-pointer -march=native -mfpmath=sse -msse3 nbody.c -o nbody.o -lm`) run(`gcc -c -fpic -Wall -O3 -fomit-frame-pointer -march=native -fopenmp fannkuch.c -o fannkuch.o`) for name in ("fannkuch", "nbody", "mandel") run(`gcc -shared -mno-fma -fno-finite-math-only -mfpmath=sse -msse3 -fopenmp -fomit-frame-pointer -march=native -O3 -o $name.so $name.o`) cname = Symbol("c_$name") $cname(n) = ccall((:benchmark, $("/$name.so")), Cvoid, (Cint,), n) end
c_results = Dict{String, Vector{BenchmarkTools.Trial}}() for (name, n) in ( c_nbody => 50000000, c_mandel => 16000, c_fannkuch => 12, ) c_results[replace(string(name), "c_" => "")] = map(1:N) do i $(name)($n) end end