Julia Flux Environment

Showcase

Adapted from the Flux model zoo Simple Auto-Encoder.

# Encode MNIST images as compressed vectors that can later be decoded back into
# images.
using Flux, Flux.Data.MNIST
using Flux: @epochs, mse, throttle
using Images: channelview
using Base.Iterators: partition
using Parameters: @with_kw
using CUDA
if has_cuda()
    @info "CUDA is on"
    CUDA.allowscalar(false)
end
@with_kw mutable struct Args
    lr::Float64 = 1e-3		# Learning rate
    epochs::Int = 10		# Number of epochs
    N::Int = 32			# Size of the encoding
    batchsize::Int = 1000	# Batch size for training
    sample_len::Int = 20 	# Number of random digits in the sample image
    throttle::Int = 5		# Throttle timeout
end
function get_processed_data(args)
    # Load the images
    imgs = MNIST.images()
    # Strip the Gray colorant type so we are left with plain numeric arrays
    imgs = channelview.(imgs)
    # Partition into batches of size `args.batchsize`, flattening each image to a column
    train_data = [float(hcat(vec.(batch)...)) for batch in partition(imgs, args.batchsize)]
    # Move the batches to the GPU (a no-op when no GPU is available)
    train_data = gpu.(train_data)
    return train_data
end
function train(; kws...)
    args = Args(; kws...)	
    train_data = get_processed_data(args)
    @info("Constructing model......")
    # You can try making the encoder/decoder networks larger.
    # The output of the encoder is a coding of the given input: here the input
    # dimension is 28^2 = 784 and the encoder's output dimension is only 32, so
    # the coding is a compressed representation, and the `encoder` performs
    # lossy compression.
    encoder = Dense(28^2, args.N, leakyrelu) |> gpu
    decoder = Dense(args.N, 28^2, leakyrelu) |> gpu 
    # Defining main model as a Chain of encoder and decoder models
    m = Chain(encoder, decoder)
    @info("Training model.....")
    loss(x) = mse(m(x), x)
    ## Training
    evalcb = throttle(() -> @show(loss(train_data[1])), args.throttle)
    opt = ADAM(args.lr)
	
    @epochs args.epochs Flux.train!(loss, params(m), zip(train_data), opt, cb = evalcb)
	
    return m, args
end
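
Since Args is a @with_kw struct and train forwards its keyword arguments, every hyperparameter can be overridden per call. A minimal optional sketch (hypothetical values, not part of the original run):

# Sketch: a shorter training run with a smaller coding; any field of Args
# can be overridden by keyword like this.
m_small, args_small = train(epochs = 2, N = 16)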
using Images
# Reshape a flat 784-vector back into a 28×28 grayscale image
img(x::Vector) = Gray.(reshape(clamp.(x, 0, 1), 28, 28))
function sample(m, args)
    imgs = MNIST.images()
    # Strip the Gray colorant type so we are left with plain numeric arrays
    imgs = channelview.(imgs)
    # `args.sample_len` random digits
    before = [imgs[i] for i in rand(1:length(imgs), args.sample_len)]
    # Before and after images
    after = img.(map(x -> cpu(m)(float(vec(x))), before))
    # Stack them all together
    hcat(vcat.(before, after)...)
end
cd(@__DIR__)
m, args = train()
# Sample output
@info("Saving image sample as sample_ae.png")
save("/results/sample_ae.png", sample(m, args))

Setup

Install Flux

]up
]add Flux NNlib FFTW Adapt Images ImageFiltering GPUArrays CUDA
]build
]precompile

Preload some data for Flux.

using Flux
# download data, to make sure they persist in the file system
for Mod in (Flux.Data.FashionMNIST, Flux.Data.MNIST)
  Mod.images(:train)
  Mod.labels(:train)
end;
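
A quick sketch to confirm the datasets are now cached and load from disk (each train split holds 60,000 images):

# Sketch: these calls should return instantly now, without re-downloading.
length(Flux.Data.MNIST.images(:train)), length(Flux.Data.FashionMNIST.images(:train))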

Import everything so that any artifacts get installed (looking at you, CUDA).

using Flux, NNlib, FFTW, DataFrames, StatsBase, CSV, BSON, Unitful, Adapt, Parameters, GR, Plots, StatsPlots, WGLMakie, Images, ImageCore, ImageShow, ImageFiltering, Colors, ProgressMeter, BenchmarkTools, GPUArrays, CUDA

Build a new System Image

The precompilation code below lives in a Code Listing (precompile.jl), mounted as a file into the runtime. NB: trying to include ArrayFire in the sysimage makes it broken and unrunnable.

# Left out of the list: CUDAnative, CUDAdrv, CuArrays; CUDA itself is
# questionable too (GPU packages tend to break the sysimage, cf. ArrayFire).
pc_pkgs = "Flux, NNlib, FFTW, DataFrames, StatsBase, CSV, BSON, Unitful, Adapt, Parameters, GR, Plots, StatsPlots, WGLMakie, Images, ImageCore, ImageShow, ImageFiltering, Colors, ProgressMeter, BenchmarkTools, GPUArrays"
# Load every package and run its test suite, so as many code paths as possible
# get exercised and therefore compiled into the sysimage.
for pkg in split(pc_pkgs, ",")
  pkg = String(strip(pkg))
  ps = Base.find_package(pkg)
  if !isnothing(ps)
    psym = Symbol(pkg)
    eval(:(using $psym))
    try
      include(abspath(joinpath(dirname(ps), "../test/runtests.jl")))
    catch; end
  end
end
# Exercise the Plots pipeline once as well
Plots.plot([1,2,3])

Target Broadwell CPUs, as those are the oldest we could possibly get allocated on GCE. Building the image takes a while, roughly half an hour on this runtime.

using PackageCompiler
pc_pkgs = "Flux, NNlib, FFTW, DataFrames, StatsBase, CSV, BSON, Unitful, Adapt, Parameters, GR, Plots, StatsPlots, WGLMakie, Images, ImageCore, ImageShow, ImageFiltering, Colors, ProgressMeter, BenchmarkTools, GPUArrays"
# CUDA is again left out (see the note in precompile.jl)
create_sysimage([Symbol(String(strip(pkg))) for pkg in split(pc_pkgs, ",")],
  replace_default=true,                           # overwrite the default sysimage
  precompile_execution_file="/root/precompile.jl",
  cpu_target="broadwell")
Sanity-check the new image and the disk footprint from a shell:

julia -v
julia -e 'print("test\n")'
du -hsx /

Test

"$VERSION"
"1.5.1"

With everything baked into the sysimage, this should be super-fast (well under a second):

using Flux, BSON, ImageFiltering, Unitful, Adapt, BenchmarkTools, Colors, FileIO, ImageShow, Plots, GR, ProgressMeter, GPUArrays, CUDA, NNlib
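
As an extra sketch, one can confirm the session runs on the replaced default sysimage and that re-loading a baked-in package is essentially free:

# Sketch: show the path of the running sysimage, then time a redundant load.
unsafe_string(Base.JLOptions().image_file)
@time using Flux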