Using a Transformer in Julia

Matthew Leung
4 min read · May 26, 2022


I’m trying to follow Kasper Groes Albin Ludvigsen’s post “How to make a Transformer for time series forecasting with PyTorch” to use a Transformer for time series prediction. However, I will use Julia instead of PyTorch. The full code can be found in my repo on GitHub.

The model can be built easily using two Julia packages: Flux and Transformers.jl.
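The snippets below assume the packages are loaded and a few hyperparameters (input_size, dim_val, n_heads, enc_seq_len, dec_seq_len, output_sequence_length) are defined. Here is a minimal setup sketch; the values are illustrative assumptions of mine, not necessarily the ones used for the results below:

using Flux
using Transformers
using Transformers.Basic # Transformer, TransformerDecoder, PositionEmbedding
using Statistics: mean
using TimeSeries # readtimearray, moving
using TensorBoardLogger, Logging # TBLogger, with_logger
using BSON: @save

enable_gpu(true) # Transformers.jl helper: makes todevice move data to the GPU

# hyperparameters (illustrative values)
input_size = 1 # univariate time series
dim_val = 512 # model (embedding) dimension
n_heads = 8 # number of attention heads
enc_seq_len = 10 # encoder input window length
dec_seq_len = 5 # decoder input window length
output_sequence_length = 5 # number of steps to forecast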

Model building

Here is the code for the transformer encoder and decoder:

begin
    # define 2 layers of transformer encoder
    encode_t1 = Transformer(dim_val, n_heads, 64, 2048; future=false, pdrop=0.2) |> gpu
    encode_t2 = Transformer(dim_val, n_heads, 64, 2048; future=false, pdrop=0.2) |> gpu
    # define 2 layers of transformer decoder
    decode_t1 = TransformerDecoder(dim_val, n_heads, 64, 2048, pdrop=0.2) |> gpu
    decode_t2 = TransformerDecoder(dim_val, n_heads, 64, 2048, pdrop=0.2) |> gpu
    # project the raw series into the model dimension
    encoder_input_layer = Dense(input_size, dim_val) |> gpu
    decoder_input_layer = Dense(input_size, dim_val) |> gpu
    positional_encoding_layer = PositionEmbedding(dim_val) |> gpu
    p = 0.2
    dropout_pos_enc = Dropout(p) |> gpu
    # map the flattened decoder output to the forecast
    linear = Dense(output_sequence_length*dim_val, output_sequence_length) |> gpu

    function encoder_forward(x)
        x = encoder_input_layer(x)
        e = positional_encoding_layer(x)
        t1 = x .+ e
        t1 = dropout_pos_enc(t1)
        t1 = encode_t1(t1)
        t1 = encode_t2(t1)
        return t1
    end

    function decoder_forward(tgt, t1)
        decoder_output = decoder_input_layer(tgt)
        t2 = decode_t1(decoder_output, t1)
        t2 = decode_t2(t2, t1) # second decoder layer, again attending to the encoder output
        t2 = Flux.flatten(t2)
        p = linear(t2)
        return p
    end
end

The Transformers package has a built-in position embedding layer, so I don’t need to code it myself. Besides, I don’t need to construct the attention mask myself, because the transformer layer already has the future keyword parameter: when future is false, the k-th token can’t attend to the j-th tokens where j > k.
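To illustrate what future=false does, here is a small library-free sketch of causal masking (my own illustration, not Transformers.jl’s actual implementation): scores above the diagonal are set to -Inf before the softmax, so each query position gets zero attention weight on later positions.

using NNlib: softmax # NNlib ships with Flux

# a tiny 4×4 attention score matrix (rows = queries, columns = keys)
scores = randn(Float32, 4, 4)

# causal mask: query k must not see key j when j > k
masked = [j > k ? -Inf32 : scores[k, j] for k in 1:4, j in 1:4]

# softmax over the keys; the -Inf32 entries get exactly zero weight
attn = softmax(masked; dims=2)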

Data Preparation

Each training sample is a sub-sequence of the time series. Here is the function that extracts every sub-sequence of length seq_len by sliding a window over x one step at a time.

function generate_seq(x, seq_len)
    result = Matrix{Float64}[]
    # slide a window of length seq_len over x, one step at a time
    for i in 1:length(x)-seq_len+1
        ele = reshape(x[i:i+seq_len-1], (seq_len, 1))
        push!(result, ele)
    end
    return result
end
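As a quick check of what the function returns (a toy example of mine): a length-10 series with seq_len = 4 yields seven overlapping windows, each a 4×1 matrix.

windows = generate_seq(collect(1.0:10.0), 4)
length(windows) # 7 overlapping windows
size(windows[1]) # (4, 1)
vec(windows[1]) # [1.0, 2.0, 3.0, 4.0], the first window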

Then, I follow Kasper’s post to break each sub-sequence into 3 parts: the encoder sequence (src), the decoder sequence (trg), and the target sequence (trg_y).

function get_src_trg(sequence, enc_seq_len, dec_seq_len, target_seq_len)
    nseq = size(sequence)[2]
    @assert nseq == enc_seq_len + target_seq_len
    # encoder input: the first enc_seq_len steps
    src = sequence[:, 1:enc_seq_len, :]
    # decoder input: the target shifted left by one step
    trg = sequence[:, enc_seq_len:nseq-1, :]
    @assert size(trg)[2] == target_seq_len
    # target: the last target_seq_len steps
    trg_y = sequence[:, nseq-target_seq_len+1:nseq, :]
    @assert size(trg_y)[2] == target_seq_len
    if size(trg_y)[1] == 1
        return src, trg, dropdims(trg_y; dims=1)
    else
        return src, trg, trg_y
    end
end
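A concrete toy example (my own numbers): with enc_seq_len = 4 and target_seq_len = 2, a 6-step window splits so that trg overlaps the last encoder step and trg_y holds the final two steps.

seq = reshape(Float32.(1:6), (1, 6, 1)) # (features, time, batch)
src, trg, trg_y = get_src_trg(seq, 4, 2, 2)
vec(src) # Float32[1.0, 2.0, 3.0, 4.0], the encoder input
vec(trg) # Float32[4.0, 5.0], the decoder input (target shifted left by one)
vec(trg_y) # Float32[5.0, 6.0], the prediction target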

Model Training

Here is the training loop code. Half of the data is used for training, and the other half is held out for testing the predictions.

function loss(src, trg, trg_y)
    # run the full encoder-decoder forward pass and compare with the target
    enc = encoder_forward(src)
    dec = decoder_forward(trg, enc)
    err = Flux.Losses.mse(dec, trg_y)
    return err
end

lg = TBLogger("tensorboard_logs/run", min_level=Logging.Info)
data = generate_seq(values(ta_mv[:"10 YR"]), enc_seq_len+output_sequence_length)
data = reduce(hcat, data)
data = convert(Array{Float32,2}, data)
data_sz = size(data)
# first half of the windows for training, second half for testing
thd = floor(Int, data_sz[2]/2)
testdata = data[:, thd+1:end]
data = data[:, 1:thd]
ps = params(encoder_input_layer, positional_encoding_layer, encode_t1, encode_t2,
            decoder_input_layer, decode_t1, decode_t2, linear)
all_layers = [encoder_input_layer, positional_encoding_layer, encode_t1, encode_t2,
              decoder_input_layer, decode_t1, decode_t2, linear]
opt = ADAM(1e-4)
train_loader = Flux.Data.DataLoader(data, batchsize=32)
l = 100 # running loss
for i = 1:1000
    for x in train_loader
        sz = size(x)
        sub_sequence = reshape(x, (1, sz[1], sz[2]))
        src, trg, trg_y = get_src_trg(sub_sequence, enc_seq_len, dec_seq_len, output_sequence_length)
        src, trg, trg_y = todevice(src, trg, trg_y) # move to gpu
        grad = gradient(() -> loss(src, trg, trg_y), ps)
        Flux.update!(opt, ps, grad)
        global l = collect(loss(src, trg, trg_y))[1]
        if l < 1e-3 # stop early once the loss is small enough
            break
        end
    end
    if i % 10 == 0
        # checkpoint every 10 epochs and log the loss to TensorBoard
        for (j, layer) in enumerate(all_layers)
            @save "checkpoint/model-"*string(j)*".bson" layer
        end
        with_logger(lg) do
            @info "train" loss=l log_step_increment=1
        end
    end
    if l < 1e-3
        break
    end
end
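The checkpoints can be restored later with BSON. A minimal sketch, assuming the file layout written by the loop above (the comprehension reloads the eight layers in the order they were saved):

using BSON

restored = [BSON.load("checkpoint/model-$(j).bson")[:layer] for j in 1:8]
encoder_input_layer, positional_encoding_layer, encode_t1, encode_t2,
    decoder_input_layer, decode_t1, decode_t2, linear = restored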

Prediction Testing

Here is the function for model prediction on a sub-sequence ix. The encoder input (ix[:,1:enc_seq_len,:]) and the decoder input, which is the target shifted left by one step (ix[:,enc_seq_len:sz[1]-1,:]), are fed into encoder_forward and decoder_forward to predict the last value (dec[end,:]) of the sequence.

function prediction(test_data)
    seq = Array{Float32}[]
    test_loader = Flux.Data.DataLoader(test_data, batchsize=32)
    for x in test_loader
        sz = size(x)
        sub_sequence = reshape(x, (1, sz[1], sz[2]))
        ix = sub_sequence[:, 1:enc_seq_len+output_sequence_length, :]
        ix = todevice(ix)
        # encoder sees the first enc_seq_len steps
        enc = encoder_forward(ix[:, 1:enc_seq_len, :])
        # decoder input is the target shifted left by one step
        trg = ix[:, enc_seq_len:sz[1]-1, :]
        dec = decoder_forward(trg, enc)
        # keep only the last predicted value of each sequence in the batch
        seq = vcat(seq, collect(dec[end, :]))
    end
    return seq
end
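To check the fit, the predictions can be plotted against the true final value of each test window. A sketch assuming the Plots package:

using Plots

pred = prediction(testdata)
actual = testdata[end, :] # the true last value of each test sub-sequence

plot(actual, label="actual", color=:red)
plot!(pred, label="predicted", color=:blue)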

Test 1 with a sine curve

First, I tried to verify the model on simple sine curve data, which it should predict well.

sincurve1 = Vector{Float64}()
for i in 1:500
    push!(sincurve1, sin(i))
end
data = generate_seq(sincurve1, enc_seq_len+output_sequence_length)

The predicted and actual values fit each other well. The result is good.

Test 2 with interest rate data

I would like to test the model on real data. The data I use is the daily 10-year treasury real yield since 2003, downloaded from the US Treasury. I apply a 15-day moving average to smooth the data.

ta = readtimearray("rate.csv", format="mm/dd/yy", delim=',')
ta_mv = moving(mean, ta, 15)
data = generate_seq(values(ta_mv[:"10 YR"]), enc_seq_len+output_sequence_length)

The predicted values (blue) and actual values (red) fit each other well. The result is good.

Test 3 with stock price data

Next, I try to predict stock prices, which are more volatile than interest rates. I use the Julia package MarketData to get the daily closing stock price. Again, I smooth the data with a 15-day moving average.
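The cl series used below is, I assume, MarketData’s bundled sample TimeArray of daily closing prices; a minimal sketch of loading it:

using MarketData # provides the sample TimeArray cl with daily closing prices

cl[1:3] # peek at the first three rows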

data = generate_seq(values(moving(mean, cl, 15)), enc_seq_len+output_sequence_length)

The predicted values (blue) and actual values (red) do not fit each other well. The result is not good.
