Merge pull request #79 from BerkeleyLab/checkpointing
New app features & refactoring: checkpoint/restart and user-specified training time range
rouson authored Sep 8, 2023
2 parents 6049804 + 57e04c8 commit 2bd236f
Showing 8 changed files with 292 additions and 233 deletions.
367 changes: 194 additions & 173 deletions app/train-cloud-microphysics.f90

Large diffs are not rendered by default.
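Since the diff for app/train-cloud-microphysics.f90 is not rendered, the following is only a hedged sketch of the checkpoint/restart pattern the PR title describes, assembled from the library pieces changed below. Every file and option name in it is hypothetical rather than taken from the app, and the module names are assumed from the source paths in this commit.

```fortran
! Hedged sketch of checkpoint/restart, not the actual app code.
! Assumptions: sourcery_m exports file_t/string_t; the generic constructors
! inference_engine_t(file_t) and trainable_engine_t(inference_engine_t) exist,
! per the interfaces added in this commit; "model-checkpoint.json" is made up.
program checkpoint_restart_sketch
  use sourcery_m, only : file_t, string_t
  use inference_engine_m_, only : inference_engine_t
  use trainable_engine_m, only : trainable_engine_t
  implicit none
  character(len=*), parameter :: network_file = "model-checkpoint.json"
  type(trainable_engine_t) trainable_engine
  type(inference_engine_t) inference_engine
  type(file_t) json_file
  logical checkpoint_exists

  inquire(file=network_file, exist=checkpoint_exists)
  if (checkpoint_exists) then
    ! Restart: rebuild the trainable network from the last checkpoint
    trainable_engine = trainable_engine_t(inference_engine_t(file_t(string_t(network_file))))
  else
    error stop "sketch only: construct a fresh network here"
  end if

  ! ... train over a user-specified time range, then checkpoint the result:
  inference_engine = trainable_engine%to_inference_engine()
  json_file = inference_engine%to_json()
  call json_file%write_lines(string_t(network_file))
end program
```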

77 changes: 38 additions & 39 deletions example/train-and-write.f90
@@ -18,75 +18,74 @@ program train_and_write
'Usage: ./build/run-fpm.sh run --example train-and-write -- --output-file "<file-name>"'
end if

and_gate_with_skewed_training_data: &
block
logical, allocatable :: test_passes(:)
type(mini_batch_t), allocatable :: mini_batches(:)
type(tensor_t), allocatable, dimension(:,:) :: training_inputs, training_outputs
type(tensor_t), allocatable, dimension(:) :: tmp, tmp2, test_inputs, expected_test_outputs, actual_outputs
type(trainable_engine_t) trainable_engine
type(inference_engine_t) inference_engine
type(file_t) json_file
type(mini_batch_t), allocatable :: mini_batches(:)
type(tensor_t), allocatable :: training_inputs(:,:), tmp(:), inputs(:)
type(tensor_t), allocatable :: training_outputs(:,:), tmp2(:), expected_outputs(:)
real(rkind) t_start, t_end
real(rkind), parameter :: tolerance = 1.E-02_rkind
real(rkind), allocatable :: harvest(:,:,:)
integer, parameter :: num_inputs=2, mini_batch_size = 1, num_iterations=8000000
integer, parameter :: num_inputs=2, mini_batch_size = 1, num_iterations=20000
integer batch, iter, i
type(file_t) json_file

allocate(harvest(num_inputs, mini_batch_size, num_iterations))
call random_number(harvest)
harvest = 2.*(harvest - 0.5) ! skew toward more input values being true

! The following temporary copies are required by gfortran bug 100650 and possibly 49324
! See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100650 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49324
tmp = [([(tensor_t(merge(true, false, harvest(:,batch,iter) < 0.5E0)), batch=1, mini_batch_size)], iter=1, num_iterations)]
training_inputs = reshape(tmp, [mini_batch_size, num_iterations])

tmp2 = [([(xor(training_inputs(batch, iter)), batch = 1, mini_batch_size)], iter = 1, num_iterations )]
tmp2 = [([(and(training_inputs(batch, iter)), batch = 1, mini_batch_size)], iter = 1, num_iterations )]
training_outputs = reshape(tmp2, [mini_batch_size, num_iterations])

mini_batches = [(mini_batch_t(input_output_pair_t(training_inputs(:,iter), training_outputs(:,iter))), iter=1, num_iterations)]
trainable_engine = one_random_hidden_layer()
trainable_engine = two_zeroed_hidden_layers()

call trainable_engine%train(mini_batches,adam=.true.)

call cpu_time(t_start)
call trainable_engine%train(mini_batches)
call cpu_time(t_end)
test_inputs = [tensor_t([true,true]), tensor_t([false,true]), tensor_t([true,false]), tensor_t([false,false])]
expected_test_outputs = [(and(test_inputs(i)), i=1, size(test_inputs))]
actual_outputs = trainable_engine%infer(test_inputs)

print *,"Training time: ",t_end - t_start
print *," Input 1 Input 2 Expected output Actual output"
do i = 1, size(test_inputs)
print *, test_inputs(i)%values(), actual_outputs(i)%values(), expected_test_outputs(i)%values()
end do

inputs = [tensor_t([true,true]), tensor_t([true,false]), tensor_t([false,true]), tensor_t([false,false])]
print *, "sample inputs: ",("[",inputs(i)%values(),"]", i=1, size(inputs))
expected_outputs = xor(inputs)
print *, "expected outputs: ",(expected_outputs(i)%values(), i=1, size(expected_outputs))
associate(outputs => trainable_engine%infer(inputs))
print *, "actual outputs: ",(outputs(i)%values(), i=1, size(outputs))
end associate
inference_engine = trainable_engine%to_inference_engine()
json_file = inference_engine%to_json()
call json_file%write_lines(file_name)

inference_engine = trainable_engine%to_inference_engine()
json_file = inference_engine%to_json()
call json_file%write_lines(file_name)
end block
end block and_gate_with_skewed_training_data

contains

elemental function xor(inputs) result(expected_outputs)
type(tensor_t), intent(in) :: inputs
type(tensor_t) expected_outputs
associate(sum_inputs => sum(inputs%values()))
expected_outputs = tensor_t([merge(true, false, sum_inputs > 0.99 .and. sum_inputs < 1.01)])
end associate
elemental function and(inputs_object) result(expected_outputs_object)
type(tensor_t), intent(in) :: inputs_object
type(tensor_t) expected_outputs_object
expected_outputs_object = tensor_t([merge(true, false, sum(inputs_object%values()) > 1.99_rkind)])
end function

function one_random_hidden_layer() result(trainable_engine)
function two_zeroed_hidden_layers() result(trainable_engine)
type(trainable_engine_t) trainable_engine
integer, parameter :: inputs = 2, outputs = 1, hidden = 2 ! number of neurons in input, output, and hidden layers
integer, parameter :: n(*) = [inputs, hidden, outputs] ! neurons per layer
integer, parameter :: n_max = maxval(n), layers=size(n) ! max layer width, number of layers
real(rkind) w(n_max, n_max, layers-1), b(n_max, n_max)
integer, parameter :: inputs = 2, outputs = 1, hidden = 3 ! number of neurons in input, output, and hidden layers
integer, parameter :: neurons(*) = [inputs, hidden, hidden, outputs] ! neurons per layer
integer, parameter :: max_neurons = maxval(neurons), layers=size(neurons) ! max layer width, number of layers
real(rkind) w(max_neurons, max_neurons, layers-1), b(max_neurons, max_neurons)

call random_number(b)
call random_number(w)
w = 0.
b = 0.

trainable_engine = trainable_engine_t( &
nodes = n, weights = w, biases = b, differentiable_activation_strategy = sigmoid_t(), &
metadata = [string_t("1 hide|2 wide"), string_t("D. Rouson"), string_t("2023-06-30"), string_t("sigmoid"), string_t("false")]&
)
nodes = neurons, weights = w, biases = b, differentiable_activation_strategy = sigmoid_t(), metadata = &
[string_t("2-hide|3-wide"), string_t("Damian Rouson"), string_t("2023-06-30"), string_t("sigmoid"), string_t("false")] &
)
end function

end program
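The padded-array layout used by two_zeroed_hidden_layers above (weights and biases sized by the widest layer) is the same convention the Adam updates in trainable_engine_s.f90 index into: for layer l, only the w(1:n(l),1:n(l-1),l) and b(1:n(l),l) blocks carry meaningful values. Below is a minimal self-contained sketch of that convention using the [2,3,3,1] layout from this example; note the library pads the bias array further, to max_neurons columns.

```fortran
! Sketch of the padded weight/bias layout; illustrative only.
program padded_layout_sketch
  implicit none
  integer, parameter :: n(*) = [2, 3, 3, 1] ! neurons per layer, as above
  integer, parameter :: n_max = maxval(n)   ! widest layer sets the padding
  real w(n_max, n_max, size(n)-1)           ! (to-neuron, from-neuron, layer)
  real b(n_max, size(n)-1)                  ! (neuron, layer)
  integer l

  w = 0.; b = 0. ! zero everything; padding entries simply stay zero
  do l = 2, size(n) ! each non-input layer uses only an n(l) x n(l-1) block
    print '(a,i0,a,i0,a,i0,a)', "weight slab ", l-1, " active block: ", n(l), " x ", n(l-1), "; the rest is padding"
  end do
end program
```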
2 changes: 1 addition & 1 deletion fpm.toml
@@ -6,5 +6,5 @@ maintainer = "rouson@lbl.gov"

[dependencies]
assert = {git = "https://github.com/sourceryinstitute/assert", tag = "1.5.0"}
sourcery = {git = "https://github.com/sourceryinstitute/sourcery", tag = "3.9.0"}
sourcery = {git = "https://github.com/sourceryinstitute/sourcery", tag = "3.9.1"}
netcdf-interfaces = {git = "https://github.com/rouson/netcdf-interfaces.git", branch = "implicit-interfaces"}
15 changes: 15 additions & 0 deletions src/inference_engine/inference_engine_m_.f90
@@ -13,6 +13,7 @@ module inference_engine_m_
private
public :: inference_engine_t
public :: difference_t
public :: exchange_t

character(len=*), parameter :: key(*) = [character(len=len("usingSkipConnections")) :: &
"modelName", "modelAuthor", "compilationDate", "activationFunction", "usingSkipConnections"]
@@ -35,6 +36,14 @@ module inference_engine_m_
procedure, private :: subtract
generic :: operator(-) => subtract
procedure :: activation_function_name
procedure :: to_exchange
end type

type exchange_t
type(string_t) metadata_(size(key))
real(rkind), allocatable :: weights_(:,:,:), biases_(:,:)
integer, allocatable :: nodes_(:)
class(activation_strategy_t), allocatable :: activation_strategy_ ! Strategy Pattern facilitates elemental activation
end type

type difference_t
@@ -65,6 +74,12 @@ impure elemental module function construct_from_json(file_) result(inference_eng

interface

pure module function to_exchange(self) result(exchange)
implicit none
class(inference_engine_t), intent(in) :: self
type(exchange_t) exchange
end function

impure elemental module function to_json(self) result(json_file)
implicit none
class(inference_engine_t), intent(in) :: self
8 changes: 8 additions & 0 deletions src/inference_engine/inference_engine_s.f90
@@ -19,6 +19,14 @@

contains

module procedure to_exchange
exchange%metadata_ = self%metadata_
exchange%weights_ = self%weights_
exchange%biases_ = self%biases_
exchange%nodes_ = self%nodes_
exchange%activation_strategy_ = self%activation_strategy_
end procedure

module procedure infer

real(rkind), allocatable :: z(:,:), a(:,:)
6 changes: 6 additions & 0 deletions src/inference_engine/trainable_engine_m.f90
@@ -45,6 +45,12 @@ pure module function construct_from_padded_arrays(nodes, weights, biases, differ
type(trainable_engine_t) trainable_engine
end function

pure module function construct_from_inference_engine(inference_engine) result(trainable_engine)
implicit none
type(inference_engine_t), intent(in) :: inference_engine
type(trainable_engine_t) trainable_engine
end function

end interface

interface
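The new construct_from_inference_engine, together with the exchange_t type and to_exchange binding added to inference_engine_m_ above, closes the loop needed for restarting training from a stored network. Here is a hedged sketch of the round trip, assuming the new function is bound to the trainable_engine_t structure-constructor generic like its siblings; the file name is made up.

```fortran
! Sketch of restart-from-JSON; module names assumed from this commit's paths.
program restart_sketch
  use sourcery_m, only : file_t, string_t
  use inference_engine_m_, only : inference_engine_t
  use trainable_engine_m, only : trainable_engine_t
  implicit none
  type(inference_engine_t) inference_engine
  type(trainable_engine_t) trainable_engine

  ! Read a previously written network ("checkpoint.json" is hypothetical):
  inference_engine = inference_engine_t(file_t(string_t("checkpoint.json")))
  ! Convert back to a trainable engine; internally this goes through
  ! to_exchange()/exchange_t and error-stops unless the stored activation
  ! strategy is differentiable (see the select type guard in
  ! trainable_engine_s.f90 below).
  trainable_engine = trainable_engine_t(inference_engine)
end program
```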
43 changes: 29 additions & 14 deletions src/inference_engine/trainable_engine_s.f90
@@ -20,6 +20,23 @@
n_layers = size(self%n,1)
end procedure

module procedure construct_from_inference_engine

associate(exchange => inference_engine%to_exchange())
trainable_engine%metadata_ = exchange%metadata_
trainable_engine%w = exchange%weights_
trainable_engine%b = exchange%biases_
trainable_engine%n = exchange%nodes_
select type(activation => exchange%activation_strategy_)
class is(differentiable_activation_strategy_t)
trainable_engine%differentiable_activation_strategy_ = activation
class default
error stop "trainable_engine_s(from_inference_engine): activation strategy must be a differentiable_activation_strategy_t"
end select
end associate

end procedure

module procedure assert_consistent

associate( &
@@ -160,28 +177,26 @@

block
! Adam parameters
real, parameter :: beta1 = .9
real, parameter :: beta2 = .999
real, parameter :: obeta1 = 1.d0 - beta1
real, parameter :: obeta2 = 1.d0 - beta2
real, parameter :: epsilon = 1.d-8
real, parameter :: alpha = 1.5d0 ! Learning parameter
real, parameter :: beta(*) = [.9_rkind, .999_rkind]
real, parameter :: obeta(*) = [1._rkind - beta(1), 1._rkind - beta(2)]
real, parameter :: epsilon = real(1.D-08,rkind)
real, parameter :: alpha = 1.5_rkind ! Learning parameter

adjust_weights_and_biases: &
do l = 1,output_layer
dcdw(1:n(l),1:n(l-1),l) = dcdw(1:n(l),1:n(l-1),l)/(mini_batch_size)
vdw(1:n(l),1:n(l-1),l) = beta1*vdw(1:n(l),1:n(l-1),l) + obeta1*dcdw(1:n(l),1:n(l-1),l)
sdw (1:n(l),1:n(l-1),l) = beta2*sdw(1:n(l),1:n(l-1),l) + obeta2*(dcdw(1:n(l),1:n(l-1),l)**2)
vdwc(1:n(l),1:n(l-1),l) = vdw(1:n(l),1:n(l-1),l)/(1.D0 - beta1**num_mini_batches)
sdwc(1:n(l),1:n(l-1),l) = sdw(1:n(l),1:n(l-1),l)/(1.D0 - beta2**num_mini_batches)
vdw(1:n(l),1:n(l-1),l) = beta(1)*vdw(1:n(l),1:n(l-1),l) + obeta(1)*dcdw(1:n(l),1:n(l-1),l)
sdw (1:n(l),1:n(l-1),l) = beta(2)*sdw(1:n(l),1:n(l-1),l) + obeta(2)*(dcdw(1:n(l),1:n(l-1),l)**2)
vdwc(1:n(l),1:n(l-1),l) = vdw(1:n(l),1:n(l-1),l)/(1._rkind - beta(1)**num_mini_batches)
sdwc(1:n(l),1:n(l-1),l) = sdw(1:n(l),1:n(l-1),l)/(1._rkind - beta(2)**num_mini_batches)
w(1:n(l),1:n(l-1),l) = w(1:n(l),1:n(l-1),l) &
- alpha*vdwc(1:n(l),1:n(l-1),l)/(sqrt(sdwc(1:n(l),1:n(l-1),l))+epsilon) ! Adjust weights

dcdb(1:n(l),l) = dcdb(1:n(l),l)/mini_batch_size
vdb(1:n(l),l) = beta1*vdb(1:n(l),l) + obeta1*dcdb(1:n(l),l)
sdb(1:n(l),l) = beta2*sdb(1:n(l),l) + obeta2*(dcdb(1:n(l),l)**2)
vdbc(1:n(l),l) = vdb(1:n(l),l)/(1.D0 - beta1**num_mini_batches)
sdbc(1:n(l),l) = sdb(1:n(l),l)/(1.D0 - beta2**num_mini_batches)
vdb(1:n(l),l) = beta(1)*vdb(1:n(l),l) + obeta(1)*dcdb(1:n(l),l)
sdb(1:n(l),l) = beta(2)*sdb(1:n(l),l) + obeta(2)*(dcdb(1:n(l),l)**2)
vdbc(1:n(l),l) = vdb(1:n(l),l)/(1._rkind - beta(1)**num_mini_batches)
sdbc(1:n(l),l) = sdb(1:n(l),l)/(1._rkind - beta(2)**num_mini_batches)
b(1:n(l),l) = b(1:n(l),l) - alpha*vdbc(1:n(l),l)/(sqrt(sdbc(1:n(l),l))+epsilon) ! Adjust biases
end do adjust_weights_and_biases
end block
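For reference, the refactored block above is the standard Adam update. With g the mini-batch-averaged gradient (dcdw or dcdb), t = num_mini_batches, beta = (.9, .999), epsilon = 1e-8, and learning rate alpha = 1.5 as declared in the parameters:

$$
\begin{aligned}
v &\leftarrow \beta_1 v + (1-\beta_1)\,g, \qquad & s &\leftarrow \beta_2 s + (1-\beta_2)\,g^2,\\
\hat v &= \frac{v}{1-\beta_1^{\,t}}, \qquad & \hat s &= \frac{s}{1-\beta_2^{\,t}},\\
\theta &\leftarrow \theta - \alpha\,\frac{\hat v}{\sqrt{\hat s}+\epsilon},
\end{aligned}
$$

applied separately to the weights (v = vdw, s = sdw, theta = w) and to the biases (v = vdb, s = sdb, theta = b).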
7 changes: 1 addition & 6 deletions test/inference_engine_test_m.f90
@@ -103,18 +103,13 @@ function distinct_parameters() result(inference_engine)
function multi_hidden_layer_net_to_from_json() result(test_passes)
logical, allocatable :: test_passes
type(inference_engine_t) inference_engine, from_json
type(file_t) json_file !, round_trip
type(file_t) json_file
type(difference_t) difference
real, parameter :: tolerance = 1.0E-06

inference_engine = distinct_parameters()
json_file = inference_engine%to_json()
from_json = inference_engine_t(json_file)

!call json_file%write_lines()
!round_trip = from_json%to_json()
!call round_trip%write_lines()

difference = inference_engine - from_json
test_passes = difference%norm() < tolerance
end function
