CUDA interpolation

mkpath("/my_env")
cd("/my_env")
Pkg.activate(".")
Pkg.add("BenchmarkTools")
Pkg.add("CuArrays")
Pkg.add("CUDAnative")
Pkg.add("StaticArrays")
Pkg.add("Interpolations")
30.7s
using BenchmarkTools, CuArrays, CUDAnative, StaticArrays, Interpolations
84.8s
function lanczos(x, a)
    pix = π * x
  
    # Singularity around x = 0
    if pix > 1.0f-5
        pixa = pix / a
        return CUDAnative.sin(pix) * CUDAnative.sin(pixa) / (pix * pixa)
    end
  
    return 1.0f0
end
# TODO: mirroring can still go out of bounds!
maybe_mirror(x, w) = x < 1 ? 1 - x : x <= w ? x : 2w - x
  
function lanczos_kernel(interpolated::CuDeviceArray{T}, data::CuDeviceArray, coords::CuDeviceArray, a::Val{A}, rx::AbstractRange, ry::AbstractRange, rz::AbstractRange) where {T,A}
  @inbounds begin
	  # Coordinate index
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    i > length(coords) && return
    xf, yf, zf = coords[i]
    # Coordinates in pixels (floating point still)
    center_x = (xf - first(rx)) / step(rx) + 1
    center_y = (yf - first(ry)) / step(ry) + 1
    center_z = (zf - first(rz)) / step(rz) + 1
    # Window pixel values (integers)
		window = ntuple(i -> i - A, 2A)
    px_x = floor(Int, center_x) .+ window
    px_y = floor(Int, center_y) .+ window
    px_z = floor(Int, center_z) .+ window
   
    # Window Lanczos values
    l_x = lanczos.(CUDAnative.abs.(center_x .- px_x), A)
    l_y = lanczos.(CUDAnative.abs.(center_y .- px_y), A)
    l_z = lanczos.(CUDAnative.abs.(center_z .- px_z), A)
    # Window pixel values with mirroring applied
    px_x′ = maybe_mirror.(px_x, size(data, 1))
    px_y′ = maybe_mirror.(px_y, size(data, 2))
    px_z′ = maybe_mirror.(px_z, size(data, 3))
    # Compute integral
    dir_x = @MMatrix zeros(T, 2A, 2A)
    dir_y = @MVector zeros(T, 2A)
    dir_z = zero(T)
    
    I = LinearIndices(CartesianIndices(data))
    
    for ix = Base.OneTo(2A), iy = Base.OneTo(2A), iz = Base.OneTo(2A)
      dir_x[iz, iy] += l_x[ix] * ldg(data, I[px_z′[iz], px_y′[iy], px_x′[ix]])
    end
    for iy = Base.OneTo(2A - 1), iz = Base.OneTo(2A)
      dir_y[iz] += l_y[iy] * dir_x[iz, iy]
    end
    for iz = Base.OneTo(2A)
      dir_z += l_z[iz] * dir_y[iz]
    end
    
    interpolated[i] = dir_z
  end
  return nothing
end
function lanczos!(interpolated::CuArray{T}, data::CuArray{T,3}, coords::CuArray, a::Val{A}, rx = axes(data, 1), ry = axes(data, 2), rz = axes(data, 3)) where {T,A}
  
  @assert length(interpolated) == length(coords)
  
  function configurator(kernel)
    config = CUDAnative.launch_configuration(kernel.fun)
    threads = min(length(interpolated), config.threads)
    blocks = cld(length(interpolated), threads)
    return (threads=threads, blocks=blocks)
  end
  @cuda name="lanczos!" config=configurator lanczos_kernel(interpolated, data, coords, a, rx, ry, rz)
  
  return interpolated
end
2.1s
lanczos! (generic function with 4 methods)

GPU benchmark (Lanczos-3)

data = cu(fill(2.0f0, 512, 512, 512))
coords = cu([(rand(Float32), rand(Float32), rand(Float32)) for i = 1 : 40*512*512])
interpolated = cu(ones(Float32, length(coords)));
r = range(0.0f0, 1.0f0, length=512)
@benchmark CuArrays.@sync lanczos!($interpolated, $data, $coords, $(Val{3}()), $r, $r, $r)
15.1s
BenchmarkTools.Trial: memory estimate: 4.89 KiB allocs estimate: 102 -------------- minimum time: 563.535 ms (0.00% GC) median time: 564.352 ms (0.00% GC) mean time: 564.293 ms (0.00% GC) maximum time: 565.528 ms (0.00% GC) -------------- samples: 9 evals/sample: 1

CPU benchmark (BSpline quadratic)

function spline!(interpolated, data, coords)
  itp = interpolate(data, BSpline(Quadratic(Reflect(OnCell()))))
  r = range(0.0f0, 1.0f0, length=512)
  sitp = scale(itp, r, r, r)
  etp = extrapolate(sitp, Reflect())
  
  Threads.@threads for p = 1:length(coords)
    @inbounds interpolated[p] = etp(coords[p]...)
  end
  
  interpolated
end
data = fill(2.0f0, 512, 512, 512)
coords = [(rand(Float32), rand(Float32), rand(Float32)) for i = 1 : 40*512*512]
interpolated = ones(Float32, length(coords))
r = range(0.0f0, 1.0f0, length=512)
@benchmark spline!($interpolated, $data, $coords)
25.7s
BenchmarkTools.Trial: memory estimate: 518.08 MiB allocs estimate: 90 -------------- minimum time: 3.292 s (0.00% GC) median time: 3.307 s (0.07% GC) mean time: 3.307 s (0.07% GC) maximum time: 3.322 s (0.15% GC) -------------- samples: 2 evals/sample: 1
Shift+Enter to run
Runtimes (1)