Hugh Murrell / Aug 15 2019

Chapter 7, Softmax formulation

Deeper learning with Flux

Load some datasets

Apples.dat
Bananas.dat
Grapes.dat
using CSV, DataFrames, Flux, Plots
apples  = DataFrame(CSV.File("Apples.dat",  delim='\t', allowmissing=:none, normalizenames=true))
bananas = DataFrame(CSV.File("Bananas.dat", delim='\t', allowmissing=:none, normalizenames=true))
grapes  = DataFrame(CSV.File("Grapes.dat",  delim='\t', allowmissing=:none, normalizenames=true))

# Extract the two colour features and construct the corresponding labels
x_apples  = [ [apples[i, :red],  apples[i, :blue]]  for i in 1:size(apples, 1) ]
x_bananas = [ [bananas[i, :red], bananas[i, :blue]] for i in 1:size(bananas, 1) ]
x_grapes  = [ [grapes[i, :red],  grapes[i, :blue]]  for i in 1:size(grapes, 1) ]

xs = vcat(x_apples, x_bananas, x_grapes)

# one-hot labels: class 1 = apple, 2 = banana, 3 = grape
ys = vcat(fill(Flux.onehot(1, 1:3), size(x_apples)),
          fill(Flux.onehot(2, 1:3), size(x_bananas)),
          fill(Flux.onehot(3, 1:3), size(x_grapes)));
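The labels are one-hot encoded: Flux.onehot(k, 1:3) returns a vector that is true only at index k. For example:

# one-hot encoding of class 2 out of 3: true only in position 2
Flux.onehot(2, 1:3)    # -> (false, true, false)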

Now we construct multiple layers and chain them together:

layer1 = Dense(2, 4, σ)
layer2 = Dense(4, 3, σ)
Dense(4, 3, NNlib.σ)
layer2(layer1(xs[1]))
Tracked 3-element Array{Float32,1}:
 0.4166586f0
 0.4007773f0
 0.55417085f0
m = Chain(layer1, layer2)
m(xs[1])
Tracked 3-element Array{Float32,1}:
 0.4166586f0
 0.4007773f0
 0.55417085f0
xs[1] |> layer1 |> layer2
Tracked 3-element Array{Float32,1}:
 0.4166586f0
 0.4007773f0
 0.55417085f0
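Each Dense(in, out, σ) layer is just an affine map followed by an elementwise activation, and Chain is function composition. A minimal sketch of what the chain computes, assuming the Tracker-based Flux used in this chapter (where a Dense layer exposes its parameters as the fields W and b):

# reproduce the chain by hand: σ.(W*x .+ b), applied twice
h = σ.(layer1.W * xs[1] .+ layer1.b)    # hidden layer, 2 -> 4
o = σ.(layer2.W * h .+ layer2.b)        # output layer, 4 -> 3
Flux.data(o) ≈ Flux.data(m(xs[1]))      # true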

Create a model with a loss function and train it

# model = Chain(Dense(2, 3, σ)) # Update this!
model = Chain(layer1, layer2)
L(x,y) = Flux.mse(model(x), y)
# opt = SGD(params(model))
opt = Descent(0.1)
Flux.train!(L, params(model), zip(xs, ys), opt)
data = zip(xs, ys)
# time two epochs: the first call includes compilation overhead,
# the second reflects the true per-epoch cost
@time Flux.train!(L, params(model), data, opt)
@time Flux.train!(L, params(model), data, opt)

Improve efficiency by batching

length(data)
2935
first(data)
([0.708703, 0.341998], Bool[true, false, false])

Recall our matrix-vector multiplication from the previous lecture:

W = [10 1;
     20 2;
     30 3]
x = [3;
     2]
W*x
3-element Array{Int64,1}:
 32
 64
 96
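The same weight matrix can act on many inputs at once: stacking the input vectors as columns turns many matrix-vector products into a single matrix-matrix product, which is exactly what batching exploits. A quick sketch with a second (made-up) input column:

# two inputs stacked as columns; one multiplication evaluates both
X = [3 1;
     2 5]
W*X    # each column is W times the corresponding input: [32 15; 64 30; 96 45]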
Flux.batch(xs)
2×2935 Array{Float64,2}:
 0.708703  0.648376  0.647237  0.647963  …  0.721761  0.722839  0.722266
 0.341998  0.284163  0.282579  0.283689     0.416422  0.417423  0.417273
model(Flux.batch(xs))
Tracked 3×2935 Array{Float32,2}:
 0.0503457  0.04908    0.049046   …  0.051946   0.0519681  0.0519646
 0.0630965  0.0660939  0.0661592     0.061789   0.0617402  0.061762
 0.930956   0.92976    0.929731      0.931778   0.931797   0.93179
databatch = (Flux.batch(xs), Flux.batch(ys))
# (databatch,) is a one-element collection, so each train! call below
# performs a single update using the whole dataset at once
@time Flux.train!(L, params(model), (databatch,), opt)
@time Flux.train!(L, params(model), (databatch,), opt)
Flux.train!(L, params(model), Iterators.repeated(databatch, 10000), opt)
L(databatch[1], databatch[2])
0.17932141f0 (tracked)
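Beyond the loss value, it is worth checking how often the model actually picks the right class. A hedged sketch, using a hypothetical accuracy helper that is not part of the original notebook (Flux.onecold is the inverse of onehot, returning the index of the largest entry in each column):

using Statistics: mean

# fraction of samples whose highest-scoring output matches the label
accuracy(x, y) = mean(Flux.onecold(model(x)) .== Flux.onecold(y))
accuracy(databatch[1], databatch[2])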

Visualization

using Plots
function plot_decision_boundaries(model, x_apples, x_bananas, x_grapes)
    plot(fmt=:png)

    contour!(0:0.01:1, 0:0.01:1, (x,y)->model([x,y]).data[1], levels=[0.5, 0.501], color = cgrad([:blue, :blue]), colorbar=:none)
    contour!(0:0.01:1, 0:0.01:1, (x,y)->model([x,y]).data[2], levels=[0.5,0.501], color = cgrad([:green, :green]), colorbar=:none)
    contour!(0:0.01:1, 0:0.01:1, (x,y)->model([x,y]).data[3], levels=[0.5,0.501], color = cgrad([:red, :red]), colorbar=:none)

    scatter!(first.(x_apples), last.(x_apples), m=:cross, label="apples", color = :blue)
    scatter!(first.(x_bananas), last.(x_bananas), m=:circle, label="bananas", color = :green)
    scatter!(first.(x_grapes), last.(x_grapes), m=:square, label="grapes", color = :red)
end
plot_decision_boundaries(model, x_apples, x_bananas, x_grapes)

Further improvements

scatter([0],[0], label="correct answer", 
    xlabel="model output: [1-x, x/2]", 
    ylabel="loss against [1, 0]", 
    legend=:topleft, 
    title="Loss function behavior", fmt=:png)
plot!(x->Flux.mse([1-x, x/2], [1,0]), -1.5, 1.5, label="mse")
plot!(x->Flux.crossentropy([1-x, x/2], [1,0]), 0, 1, label="crossentropy")
# note: the raw model outputs do not sum to one, so they are not probabilities
sum(model(xs[1]))
1.0520396f0 (tracked)
Flux.mse([0.01,0.98,0.01], [1.0,0,0])
0.6468666666666666
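crossentropy, by contrast, reduces to -log(ŷ[1]) against the one-hot target [1,0,0], so the same confidently wrong prediction is punished far more heavily:

Flux.crossentropy([0.01,0.98,0.01], [1.0,0,0])   # -log(0.01) ≈ 4.6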
softmax([1.0,-3,0])
3-element Array{Float64,1}:
 0.7213991842739687
 0.013212886953789414
 0.26538792877224193
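softmax exponentiates each entry and normalizes, so the result is positive and sums to one, making it interpretable as class probabilities. A quick check against the definition:

# softmax(z)[i] = exp(z[i]) / sum(exp.(z))
z = [1.0, -3, 0]
p = exp.(z) ./ sum(exp.(z))   # same values as softmax(z)
sum(p)                        # 1.0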

Using SoftMax and CrossEntropy

Use softmax as a final normalization and change the loss function to crossentropy:

model = Chain(Dense(2, 4, σ), Dense(4, 3, identity), softmax)
L(x,y) = Flux.crossentropy(model(x), y)
# opt = SGD(params(model))
opt = Descent(0.1)
Descent(0.1)
Flux.train!(L, params(model), Iterators.repeated(databatch,5000), opt)
plot_decision_boundaries(model, x_apples, x_bananas, x_grapes)
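With the hypothetical accuracy helper sketched earlier, the retrained model can be evaluated on the same batch:

L(databatch[1], databatch[2])           # crossentropy loss on the full batch
accuracy(databatch[1], databatch[2])    # fraction of correctly classified samples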