sum v1 by id1

This benchmark performs a groupby on the column id1 and then computes a sum over v1. You can add new solutions by adding a new cell and a new runtime for that cell (click to the left of the cell -> Change Runtime -> Add New Runtime -> select the base image, e.g. julia-1.0). Then just add the right packages and branches to set up your benchmark!

First we include the data generation code from the main benchmark article:

G1_1e7_1e2.csv
# R / data.table reference solution: sum of v1 grouped by id1.
# NOTE(review): the file name below is unquoted, presumably a notebook file
# reference substituted at run time — confirm before running outside the notebook.
data_name = G1_1e7_1e2.csv
# Load the benchmark data with fread.
X = fread(data_name)
# Untimed run of the query: per-id1 sum of v1, with keyby=id1.
ans<-X[, .(v1=sum(v1)), keyby=id1]
# Same query again inside system.time; only the "elapsed" entry is kept in t.
# The result's dimensions are printed as part of the timed expression.
t = system.time(print(dim(ans<-X[, .(v1=sum(v1)), keyby=id1])))[["elapsed"]]
using BenchmarkTools, Serialization
"""
    benchmark(name, inner, x)

Benchmark `by(inner, x, :id1)` with `@benchmark`, using `GC.gc()` as the
teardown expression, and serialize the resulting trial to `/results/<name>.jls`.

# Arguments
- `name`: base name (without extension) of the result file under `/results`.
- `inner`: function passed to `by` as the per-group reduction.
- `x`: table that is grouped by its `:id1` column.
"""
function benchmark(name, inner, x)
  trial = @benchmark by($inner, $x, :id1) teardown = GC.gc()
  open("/results/$name.jls", "w") do io
    serialize(io, trial)
  end
end

Normal DataFrames

using DataFrames, CSV
using BenchmarkTools
# Stock DataFrames solution: load the data, define the per-group reduction,
# run the query once, then benchmark it.
# NOTE(review): the file name below is unquoted, presumably a notebook file
# reference substituted at run time — confirm before running outside the notebook.
data_name = G1_1e7_1e2.csv
x = CSV.read(data_name, categorical=false);
# Per-group reduction: the sum of the group's v1 column wrapped in a
# one-column DataFrame.
inner(df) = return DataFrame(v1 = sum(df.v1))
# Run the query once outside the benchmark (presumably a warm-up / sanity
# check — confirm); the trailing `;` suppresses the output.
by(inner, x, :id1);
# Benchmark the same call; the result is serialized to /results/DataFrames.jls.
benchmark("DataFrames", inner, x)
DataFrames.jls


DataFrames#nl/grouping2

pkg"add DataFrames#nl/grouping2"
using DataFrames, CSV
using Statistics # mean function
using BenchmarkTools
# DataFrames#nl/grouping2 solution: same query as the stock DataFrames cell,
# but the per-group reduction returns a NamedTuple instead of a DataFrame.
# NOTE(review): the file name below is unquoted, presumably a notebook file
# reference substituted at run time — confirm before running outside the notebook.
data_name = G1_1e7_1e2.csv
x = CSV.read(data_name, categorical=false);
# Per-group reduction: one-field NamedTuple holding the sum of the group's v1.
inner(df) = return (v1 = sum(df.v1),)
# Run the query once outside the benchmark (presumably a warm-up / sanity
# check — confirm); the trailing `;` suppresses the output.
by(inner, x, :id1);
# Benchmark the same call; the result is serialized to
# /results/DataFrames-grouping2.jls.
benchmark("DataFrames-grouping2", inner, x)
DataFrames-grouping2.jls

DataFrames#nl/hash2

pkg"add DataFrames#nl/hash2 CSV#master"
using DataFrames, CSV
using BenchmarkTools
# DataFrames#nl/hash2 solution: same query as the stock DataFrames cell.
# NOTE(review): the file name below is unquoted, presumably a notebook file
# reference substituted at run time — confirm before running outside the notebook.
data_name = G1_1e7_1e2.csv
# Unlike the other Julia cells, this one reads with `categorical=0.5` rather
# than `categorical=false`.
x = CSV.read(data_name, categorical=0.5)
# Per-group reduction: the sum of the group's v1 column wrapped in a
# one-column DataFrame.
inner(df) = DataFrame(v1 = sum(df.v1))
# Run the query once outside the benchmark (presumably a warm-up / sanity
# check — confirm); the trailing `;` suppresses the output.
by(inner, x, :id1);
# Benchmark the same call; the result is serialized to
# /results/DataFrames-hash2.jls.
benchmark("DataFrames-hash2", inner, x)
DataFrames-hash2.jls
using Plots, BenchmarkTools, DataFrames, StatPlots; plotly()
# Result files written by the benchmark cells.
# NOTE(review): the entries are unquoted, presumably notebook file references
# substituted at run time — confirm before running outside the notebook.
benchmark_files = [DataFrames-grouping2.jls, DataFrames-hash2.jls, DataFrames.jls]

# Deserialize one stored benchmark result per file.
benchmark_data = map(benchmark_files) do file 
  open(io->deserialize(io), file)
end
# Display labels, listed in the same order as `benchmark_files`.
bnames = ["grouping2", "hash2", "DataFrames"]
# x: for each benchmark, its label repeated once per recorded time sample.
x = map(bnames, benchmark_data) do f, data
  fill(splitext(basename(f))[1], length(data.times))
end
# y: for each benchmark, its recorded times divided by 10^9.
y = map(benchmark_data) do data
  data.times ./ 10^9 # to seconds
end
# Add R manually - too lazy to figure out proper serialization and benchmark code.
# The hard-coded data.table timing is repeated 50 times with a small synthetic
# jitter so that it can be drawn as a box next to the Julia samples.

push!(x, fill("data.table", 50))
# `rand(50)` draws one offset per sample. A scalar `rand()` would add the same
# offset to all 50 values, giving a zero-width box at a randomly shifted value.
push!(y, fill(0.517999999999915, 50) .+ rand(50) .* 0.1)
# Box plot over all samples: concatenated labels on the x axis, concatenated
# timings on the y axis.
boxplot(vcat(x...), vcat(y...), label = "")
# Overlay bars for the Julia benchmarks only, taken from each benchmark's
# minimum estimate: its `gctime` and `memory` fields, each divided by 10^9.
bar!(bnames, getfield.(minimum.(benchmark_data), :gctime) ./ 10^9, label = "gc time (in s)")
bar!(bnames, getfield.(minimum.(benchmark_data), :memory) ./ 10^9, label = "memory (in gb)")