Harmen Stoppels / Feb 11 2020
Remix of Julia by
Nextjournal
CUDA interpolation
mkpath("/my_env")
cd("/my_env")
Pkg.activate(".")
Pkg.add("BenchmarkTools")
Pkg.add("CuArrays")
Pkg.add("CUDAnative")
Pkg.add("StaticArrays")
Pkg.add("Interpolations")
30.7s
Julia
using BenchmarkTools, CuArrays, CUDAnative, StaticArrays, Interpolations
84.8s
Julia
function lanczos(x, a)
pix = π * x
# Singularity around x = 0
if pix > 1.0f-5
pixa = pix / a
return CUDAnative.sin(pix) * CUDAnative.sin(pixa) / (pix * pixa)
end
return 1.0f0
end
# TODO: mirroring can still go out of bounds!
maybe_mirror(x, w) = x < 1 ? 1 - x : x <= w ? x : 2w - x
function lanczos_kernel(interpolated::CuDeviceArray{T}, data::CuDeviceArray, coords::CuDeviceArray, a::Val{A}, rx::AbstractRange, ry::AbstractRange, rz::AbstractRange) where {T,A}
begin
# Coordinate index
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
i > length(coords) && return
xf, yf, zf = coords[i]
# Coordinates in pixels (floating point still)
center_x = (xf - first(rx)) / step(rx) + 1
center_y = (yf - first(ry)) / step(ry) + 1
center_z = (zf - first(rz)) / step(rz) + 1
# Window pixel values (integers)
window = ntuple(i -> i - A, 2A)
px_x = floor(Int, center_x) .+ window
px_y = floor(Int, center_y) .+ window
px_z = floor(Int, center_z) .+ window
# Window Lanczos values
l_x = lanczos.(CUDAnative.abs.(center_x .- px_x), A)
l_y = lanczos.(CUDAnative.abs.(center_y .- px_y), A)
l_z = lanczos.(CUDAnative.abs.(center_z .- px_z), A)
# Window pixel values with mirroring applied
px_x′ = maybe_mirror.(px_x, size(data, 1))
px_y′ = maybe_mirror.(px_y, size(data, 2))
px_z′ = maybe_mirror.(px_z, size(data, 3))
# Compute integral
dir_x = zeros(T, 2A, 2A)
dir_y = zeros(T, 2A)
dir_z = zero(T)
I = LinearIndices(CartesianIndices(data))
for ix = Base.OneTo(2A), iy = Base.OneTo(2A), iz = Base.OneTo(2A)
dir_x[iz, iy] += l_x[ix] * ldg(data, I[px_z′[iz], px_y′[iy], px_x′[ix]])
end
for iy = Base.OneTo(2A - 1), iz = Base.OneTo(2A)
dir_y[iz] += l_y[iy] * dir_x[iz, iy]
end
for iz = Base.OneTo(2A)
dir_z += l_z[iz] * dir_y[iz]
end
interpolated[i] = dir_z
end
return nothing
end
function lanczos!(interpolated::CuArray{T}, data::CuArray{T,3}, coords::CuArray, a::Val{A}, rx = axes(data, 1), ry = axes(data, 2), rz = axes(data, 3)) where {T,A}
length(interpolated) == length(coords)
function configurator(kernel)
config = CUDAnative.launch_configuration(kernel.fun)
threads = min(length(interpolated), config.threads)
blocks = cld(length(interpolated), threads)
return (threads=threads, blocks=blocks)
end
name="lanczos!" config=configurator lanczos_kernel(interpolated, data, coords, a, rx, ry, rz)
return interpolated
end
2.1s
Julia
lanczos! (generic function with 4 methods)
GPU benchmark (Lanczos-3)
data = cu(fill(2.0f0, 512, 512, 512))
coords = cu([(rand(Float32), rand(Float32), rand(Float32)) for i = 1 : 40*512*512])
interpolated = cu(ones(Float32, length(coords)));
r = range(0.0f0, 1.0f0, length=512)
CuArrays. lanczos!($interpolated, $data, $coords, $(Val{3}()), $r, $r, $r)
15.1s
Julia
BenchmarkTools.Trial:
memory estimate: 4.89 KiB
allocs estimate: 102
--------------
minimum time: 563.535 ms (0.00% GC)
median time: 564.352 ms (0.00% GC)
mean time: 564.293 ms (0.00% GC)
maximum time: 565.528 ms (0.00% GC)
--------------
samples: 9
evals/sample: 1
CPU benchmark (BSpline quadratic)
function spline!(interpolated, data, coords)
itp = interpolate(data, BSpline(Quadratic(Reflect(OnCell()))))
r = range(0.0f0, 1.0f0, length=512)
sitp = scale(itp, r, r, r)
etp = extrapolate(sitp, Reflect())
Threads. for p = 1:length(coords)
interpolated[p] = etp(coords[p]...)
end
interpolated
end
data = fill(2.0f0, 512, 512, 512)
coords = [(rand(Float32), rand(Float32), rand(Float32)) for i = 1 : 40*512*512]
interpolated = ones(Float32, length(coords))
r = range(0.0f0, 1.0f0, length=512)
spline!($interpolated, $data, $coords)
25.7s
Julia
BenchmarkTools.Trial:
memory estimate: 518.08 MiB
allocs estimate: 90
--------------
minimum time: 3.292 s (0.00% GC)
median time: 3.307 s (0.07% GC)
mean time: 3.307 s (0.07% GC)
maximum time: 3.322 s (0.15% GC)
--------------
samples: 2
evals/sample: 1
Shift+Enter to run
Julia