Julia Flux Environment

Showcase

Adapted from the Flux model zoo Simple Auto-Encoder.

# Encode MNIST images as compressed vectors that can later be decoded back into
# images.
using Flux, Flux.Data.MNIST
using Flux: @epochs, onehotbatch, mse, throttle
using Base.Iterators: partition
using Parameters: @with_kw
using CUDA
if has_cuda()
    @info "CUDA is on"
    CUDA.allowscalar(false)
end
@with_kw mutable struct Args
    lr::Float64 = 1e-3		# Learning rate
    epochs::Int = 10		# Number of epochs
    N::Int = 32			# Size of the encoding
    batchsize::Int = 1000	# Batch size for training
    sample_len::Int = 20 	# Number of random digits in the sample image
    throttle::Int = 5		# Throttle timeout
end
function get_processed_data(args)
    # Loading Images
    imgs = MNIST.images()
    # Strip the Gray colorant type so each image becomes a plain numeric array
    imgs = channelview.(imgs)
    # Partition into batches of `args.batchsize`
    train_data = [float(hcat(vec.(imgs)...)) for imgs in partition(imgs, args.batchsize)]
    
    train_data = gpu.(train_data)
    return train_data
end
function train(; kws...)
    args = Args(; kws...)	
    train_data = get_processed_data(args)
    @info("Constructing model......")
    # You can try to make the encoder/decoder networks larger.
    # The output of the encoder is a coding (latent representation) of the input:
    # the input dimension is 28^2 and the encoder output dimension is 32,
    # so the coding is a compressed representation and the autoencoder
    # performs lossy compression of the images.
    encoder = Dense(28^2, args.N, leakyrelu) |> gpu
    decoder = Dense(args.N, 28^2, leakyrelu) |> gpu 
    # Defining main model as a Chain of encoder and decoder models
    m = Chain(encoder, decoder)
    @info("Training model.....")
    loss(x) = mse(m(x), x)
    ## Training
    evalcb = throttle(() -> @show(loss(train_data[1])), args.throttle)
    opt = ADAM(args.lr)
	
    @epochs args.epochs Flux.train!(loss, Flux.params(m), zip(train_data), opt, cb = evalcb)
	
    return m, args
end
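
Because the hyperparameters live in an `@with_kw` struct and `train` forwards its keyword arguments straight into `Args`, any of them can be overridden at the call site. A minimal usage sketch (the values are arbitrary, chosen only to illustrate the passthrough):

# Smaller encoding and a shorter run; every `Args` field can be overridden this way.
m_small, args_small = train(N = 16, epochs = 2, batchsize = 500)
# Each training batch is a 28^2 × batchsize matrix of flattened images.
@assert size(get_processed_data(args_small)[1]) == (28^2, args_small.batchsize)

The run further down simply uses the defaults.
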
using Images
img(x::Vector) = Gray.(reshape(clamp.(x, 0, 1), 28, 28))
function sample(m, args)
    imgs = MNIST.images()
    # Strip the Gray colorant type so each image becomes a plain numeric array
    imgs = channelview.(imgs)
    # `args.sample_len` random digits
    before = [imgs[i] for i in rand(1:length(imgs), args.sample_len)]
    # Before and after images
    after = img.(map(x -> cpu(m)(float(vec(x))), before))
    # Stack them all together
    hcat(vcat.(before, after)...)
end
cd(@__DIR__)
m, args = train()
# Sample output
@info("Saving image sample as sample_ae.png")
save("/results/sample_ae.png", sample(m, args))

Setup

Install Flux

]up
]add Flux NNlib FFTW Adapt Images ImageFiltering GPUArrays CUDA
]build
]precompile
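
The ]-prefixed lines above are Pkg REPL-mode commands. If the same steps need to be scripted rather than typed into a REPL cell, a roughly equivalent sketch via the Pkg API:

using Pkg
Pkg.update()                                        # ]up
Pkg.add(["Flux", "NNlib", "FFTW", "Adapt", "Images",
         "ImageFiltering", "GPUArrays", "CUDA"])    # ]add ...
Pkg.build()                                         # ]build
Pkg.precompile()                                    # ]precompile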

Preload some data for Flux.

using Flux
# download data, to make sure they persist in the file system
for Mod in (Flux.Data.FashionMNIST, Flux.Data.MNIST)
  Mod.images(:train)
  Mod.labels(:train)
end;
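
As a quick sanity check that the downloads actually landed on disk (60,000 is the standard training-set size for both datasets):

@assert length(Flux.Data.MNIST.images(:train)) == 60_000
@assert length(Flux.Data.FashionMNIST.images(:train)) == 60_000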

Import everything so that any artifacts get installed (looking at you, CUDA).

using Flux, NNlib, FFTW, DataFrames, StatsBase, CSV, BSON, Unitful, Adapt, Parameters, GR, Plots, StatsPlots, WGLMakie, Images, ImageCore, ImageShow, ImageFiltering, Colors, ProgressMeter, BenchmarkTools, GPUArrays, CUDA

Build a new System Image

Precompilation code in a Code Listing (precompile.jl), mounted as a file into the runtime at /root/precompile.jl. NB: a lot of packages were cut out of the sysimage precompilation due to unpredictable segfaults.

# Cut from the sysimage precompilation because of segfaults:
#   CUDAnative, CUDAdrv, CuArrays
#   DataFrames, CSV, BSON, Unitful, Parameters, Colors, ProgressMeter, BenchmarkTools, Images, ImageCore, ImageShow, ImageFiltering
#   CUDA?
pc_pkgs = "Flux, NNlib, FFTW, GPUArrays, Adapt, GR, Plots, StatsBase, StatsPlots, WGLMakie"
for pkg in split(pc_pkgs, ",")
  pkg = String(strip(pkg))
  ps = Base.find_package(pkg)
  if !isnothing(ps)
    psym = Symbol(pkg)
    eval(:(using $psym))
    try
      include(abspath(joinpath(dirname(ps), "../test/runtests.jl")))
    catch; end
  end
end
Plots.plot([1,2,3])

Target Broadwell CPUs, as those are the oldest we could possibly get allocated on GCE.

using PackageCompiler
pc_pkgs = "Flux, NNlib, FFTW, GPUArrays, Adapt, GR, Plots, StatsBase, StatsPlots, WGLMakie"
# Excluded (see the NB above):
#   DataFrames, CSV, BSON, Unitful, Parameters, Colors, ProgressMeter, BenchmarkTools, GPUArrays, Adapt, Images, ImageCore, ImageShow, ImageFiltering
#   CUDA?
create_sysimage([Symbol(String(strip(pkg))) for pkg in split(pc_pkgs, ",")],
  replace_default=true,
  precompile_execution_file="/root/precompile.jl",
  cpu_target="broadwell")

Sanity-check the result from a Bash cell: Julia version, a trivial run against the new image, and overall disk usage of the runtime.

julia -v
julia -g 2 -e 'print("Hello, whirled.\n")'
du -hsx /

And from a Julia cell in the same runtime:

"$VERSION"
"1.5.2"

Test

"$VERSION"
"1.5.2"

With everything baked into the system image, this should be super-fast (about 0.2 s in the test runtime):

using Flux, BSON, ImageFiltering, Unitful, Adapt, BenchmarkTools, Colors, FileIO, ImageShow, Plots, GR, ProgressMeter, GPUArrays, CUDA, NNlib