Skip to content

Commit

Permalink
Fix example & readme
Browse files Browse the repository at this point in the history
  • Loading branch information
abarajithan11 committed Nov 22, 2024
1 parent e811855 commit 75d0edb
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 113 deletions.
2 changes: 1 addition & 1 deletion deepsocflow/c/xilinx_example.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ int main()
// Memory_st *p_mem = (Memory_st *)mmap(NULL, sizeof(Memory_st), PROT_READ | PROT_WRITE, MAP_SHARED, dh, MEM_BASEADDR);
// void *p_config = mmap(NULL, 4*16+N_BUNDLES*32, PROT_READ | PROT_WRITE, MAP_SHARED, dh, CONFIG_BASEADDR);

xil_printf("Welcome to DeepSoCFlow!\n Store wbx at: %p; y:%p; buffers {0:%p,1:%p};\n", &p_mem->w, &mp->y, &p_mem->out_buffers[0], &p_mem->out_buffers[1]);
xil_printf("Welcome to DeepSoCFlow!\n Store wbx at: %p; y:%p; buffers {0:%p,1:%p};\n", &p_mem->w, &p_mem->y, &p_mem->out_buffers[0], &p_mem->out_buffers[1]);

model_setup(p_mem, p_config);
model_run_timed(p_mem, p_config, 20); // run model and measure time
Expand Down
15 changes: 8 additions & 7 deletions deepsocflow/py/dataflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,28 +313,29 @@ def predict_bundle_performance(hw, r):
utilization = operations / (hw.ROWS * hw.COLS * clocks)


return clocks, mem_bits, utilization
return clocks, mem_bits, utilization, operations


def predict_model_performance(hw):

d_out = {
'clocks_total': 0,
'mem_bytes_total': 0,
'operations': [],
'utilization_all': [],
'clocks_all': [],
'mem_bytes_all': [],
}
for b in BUNDLES:
clocks, mem_bits, utilization = predict_bundle_performance(hw=hw, r=b.r)
d_out['clocks_total'] += clocks
d_out['mem_bytes_total'] += mem_bits/8

clocks, mem_bits, utilization, operations = predict_bundle_performance(hw=hw, r=b.r)
d_out['operations'] += [operations]
d_out['utilization_all'] += [utilization]
d_out['clocks_all'] += [clocks]
d_out['mem_bytes_all'] += [mem_bits/8]

print(f'---{b.ib}: util:{100*utilization:.2f} mem_mb:{mem_bits/1024**2:.2f} {b.r.XN=} {b.r.XH=} {b.r.XW=} {b.r.CI=} {b.r.CO=} {b.r.KH=} {b.r.KW=}')

d_out['g_ops'] = sum(d_out['operations'])/1e9
d_out['clocks_total'] = sum(d_out['clocks_all'])
d_out['mem_bytes_total'] = sum(d_out['mem_bytes_all'])

d_out['seconds_per_batch'] = d_out['clocks_total'] / (hw.FREQ * 1e6)
d_out['frames_per_sec'] = hw.ROWS / d_out['seconds_per_batch']
Expand Down
Binary file added docs/overview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
238 changes: 133 additions & 105 deletions run/example.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,30 @@
import os
import pytest
import itertools
import sys
sys.path.append("../../")
from deepsocflow import Bundle, Hardware, QModel, QInput
from tensorflow import keras
from keras.layers import Input, Flatten
from qkeras import Model
from qkeras.utils import load_qmodel

import numpy as np
from keras.layers import Input
from keras.models import Model, save_model
from keras.datasets import mnist
from keras.optimizers import Adam
from keras.utils import to_categorical
from qkeras.utils import load_qmodel
import numpy as np
import pprint
# import tensorflow as tf
#tf.keras.utils.set_random_seed(0)

'''
0. Specify Hardware
'''
hw = Hardware ( # Alternatively: hw = Hardware.from_json('hardware.json')
processing_elements = (8, 24) , # (rows, columns) of multiply-add units
frequency_mhz = 250 , #
bits_input = 8 , # bit width of input pixels and activations
bits_weights = 8 , # bit width of weights
bits_sum = 24 , # bit width of accumulator
bits_bias = 16 , # bit width of bias
max_batch_size = 64 , #
max_channels_in = 2048 , #
max_kernel_size = 13 , #
max_image_size = 512 , #
ram_weights_depth = 20 , #
ram_edges_depth = 288 , #
axi_width = 64 , #
target_cpu_int_bits = 32 , #
valid_prob = 1 , # probability in which AXI-Stream s_valid signal should be toggled in simulation
ready_prob = 1 , # probability in which AXI-Stream m_ready signal should be toggled in simulation
data_dir = 'vectors', # directory to store generated test vectors
)
hw.export() # Generates: config_hw.svh, config_hw.tcl, config_tb.svh, hardware.json
hw.export_vivado_tcl(board='zcu104')
from deepsocflow import *

(SIM, SIM_PATH) = ('xsim', "F:/Xilinx/Vivado/2022.2/bin/") if os.name=='nt' else ('verilator', '')

'''
Dataset
'''

NB_EPOCH = 2
BATCH_SIZE = 64
VERBOSE = 1
VALIDATION_SPLIT = 0.1
NB_CLASSES = 10

Expand All @@ -57,107 +38,154 @@

y_train = to_categorical(y_train, NB_CLASSES)
y_test = to_categorical(y_test, NB_CLASSES)
input_shape = x_train.shape[1:]


'''
1. Build Model
Define Model
'''
XN = 1
input_shape = (XN,18,18,3) # (XN, XH, XW, CI)

QINT_BITS = 0
qq = f'quantized_bits({hw.K_BITS},{QINT_BITS},False,True,1)'
qr = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0)'
ql = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0.125)'
kq = bq = qq

sys_bits = SYS_BITS(x=4, k=4, b=16)

@keras.saving.register_keras_serializable()
class UserModel(QModel):

def __init__(self, x_bits, x_int_bits):
super().__init__(x_bits, x_int_bits)
self.b1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qr}, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({x_bits},0,False,False,1)'})
# self.b2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qq}, add = {'act_str':f'quantized_bits({x_bits},0,False,True,1)'})
# self.b3 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':qq}, add = {'act_str':f'quantized_bits({x_bits},0,False,True,1)'})
# self.b4 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}, add = {'act_str':f'quantized_bits({x_bits},0,False,True,1)'})
self.b5 = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qr},)
# self.b6 = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}) #, flatten= True)
self.flat = Flatten()
self.b7 = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}, softmax= True)

def call(self, input_tensor, training=False):
x = self.quantize_input(input_tensor) # implicit, from QModel
class UserModel(XModel):
def __init__(self, sys_bits, x_int_bits, *args, **kwargs):
super().__init__(sys_bits, x_int_bits, *args, **kwargs)

self.b1 = XBundle(
core=XConvBN(
k_int_bits=0, b_int_bits=0, filters=8, kernel_size=7, strides=(2,1),
act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
pool=XPool(
type='avg', pool_size=(3,4), strides=(2,3), padding='same',
act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),)
)

self.b2 = XBundle(
core=XConvBN(
k_int_bits=0, b_int_bits=0, filters=8, kernel_size=1,
act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None)),
add_act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0.125)
)

self.b3 = XBundle(
core=XConvBN(
k_int_bits=0, b_int_bits=0, filters=8, kernel_size=7,
act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
add_act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)
)

self.b4 = XBundle(
core=XConvBN(
k_int_bits=0, b_int_bits=0, filters=8, kernel_size=5,
act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
add_act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)
)

self.b5 = XBundle(
core=XConvBN(
k_int_bits=0, b_int_bits=0, filters=24, kernel_size=3,
act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
)

self.b6 = XBundle(
core=XConvBN(
k_int_bits=0, b_int_bits=0, filters=10, kernel_size=1,
act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
flatten=True
)

self.b7 = XBundle(
core=XDense(
k_int_bits=0, b_int_bits=0, units=NB_CLASSES, use_bias=False,
act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
softmax=True
)

def call (self, x):
x = self.input_quant_layer(x)

x = x_skip1 = self.b1(x)
# x = x_skip2 = self.b2(x, x_skip1)
# x = self.b3(x, x_skip2)
# x = self.b4(x, x_skip1)
x = x_skip2 = self.b2(x, x_skip1)
x = self.b3(x, x_skip2)
x = self.b4(x, x_skip1)
x = self.b5(x)
# x = self.b6(x)
x = self.flat(x)
x = self.b6(x)
x = self.b7(x)
return x

# def __init__(self, x_bits, x_int_bits):
# super().__init__(x_bits, x_int_bits)
x = x_in = Input(input_shape, name="input")
user_model = UserModel(sys_bits=sys_bits, x_int_bits=0)
x = user_model(x_in)

# self.b1 = Bundle( core= {'type':'conv' , 'filters':32 , 'kernel_size':(3,3), 'strides':(2,2), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qr})
# self.flat = Flatten()
# self.b4 = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}, softmax= True)

# def call(self, input_tensor, training=False):
# x = self.quantize_input(input_tensor)
model = Model(inputs=[x_in], outputs=[x])

# x = self.b1(x)
# x = self.flat(x)
# x = self.b4(x)
# return x

x_in = Input(x_train.shape[1:], name="input")
user_model = UserModel(x_bits=hw.X_BITS, x_int_bits=0)
x_out = user_model(x_in)
model = Model(inputs=[x_in], outputs=[x_out])
'''
Train Model
'''

# Compile for multi-class classification: labels are one-hot encoded with
# to_categorical earlier in this script, hence categorical cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.0001), metrics=["accuracy"])
model.summary()
# NOTE(review): in Keras, `epochs` is the index of the final epoch and
# `initial_epoch` is the index training starts from, so initial_epoch=1
# together with NB_EPOCH = 2 trains for a single epoch. Confirm this is
# intended (e.g. to keep the example fast) rather than an off-by-one.
history = model.fit(
x_train,
y_train,
batch_size=BATCH_SIZE,
epochs=NB_EPOCH,
initial_epoch=1,
verbose=True,
# fraction of the training data passed to Keras as validation_split (VALIDATION_SPLIT = 0.1)
validation_split=VALIDATION_SPLIT)



'''
2. TRAIN (using qkeras)
Save & Reload
'''
history = model.fit(
x_train,
y_train,
batch_size=BATCH_SIZE,
epochs=NB_EPOCH,
initial_epoch=1,
verbose=VERBOSE,
validation_split=VALIDATION_SPLIT)

keras.models.save_model(model, "mnist.h5")

save_model(model, "mnist.h5")
loaded_model = load_qmodel("mnist.h5")

score = loaded_model.evaluate(x_test, y_test, verbose=0)
print(f"Test loss:{score[0]}, Test accuracy:{score[1]}")

# print(loaded_model.layers[1].conv1.get_raw())




'''
Specify Hardware
'''
# Hardware description consumed later in this script by export_inference,
# verify_inference and predict_model_performance.
# Entries whose trailing comment is an empty `#` are not documented in this file.
hw = Hardware ( # Alternatively: hw = Hardware.from_json('hardware.json')
processing_elements = (8, 24) , # (rows, columns) of multiply-add units
frequency_mhz = 250 , # frequency in MHz (per the parameter name)
bits_input = 4 , # bit width of input pixels and activations; same value as SYS_BITS x=4 used to build the model (presumably must match, confirm)
bits_weights = 4 , # bit width of weights; same value as SYS_BITS k=4 (presumably must match, confirm)
bits_sum = 20 , # bit width of accumulator
bits_bias = 16 , # bit width of bias; same value as SYS_BITS b=16 (presumably must match, confirm)
max_batch_size = 64 , #
max_channels_in = 512 , #
max_kernel_size = 9 , # NOTE(review): largest kernel_size in the model defined in this script is 7
max_image_size = 512 , #
max_n_bundles = 64 ,
ram_weights_depth = 512 , #
ram_edges_depth = 3584 , #
axi_width = 128 , #
config_baseaddr = "B0000000", # NOTE(review): looks like a hex address string without a 0x prefix; confirm the format Hardware expects
target_cpu_int_bits = 32 , #
valid_prob = 1 , # probability with which the AXI-Stream s_valid signal should be toggled in simulation
ready_prob = 1 , # probability with which the AXI-Stream m_ready signal should be toggled in simulation
data_dir = 'vectors', # directory to store generated test vectors
)

# '''
# 3. EXPORT FOR INFERENCE
# '''
# SIM, SIM_PATH = 'xsim', "F:/Xilinx/Vivado/2022.1/bin/" # For Xilinx Vivado
# # SIM, SIM_PATH = 'verilator', "" # For Verilator
hw.export_json()
hw = Hardware.from_json('hardware.json')
hw.export() # Generates: config_hw.svh, config_hw.tcl
hw.export_vivado_tcl(board='zcu104')

# model.export_inference(x=model.random_input, hw=hw) # Runs forward pass in float & int, compares them. Generates: config_fw.h (C firmware), weights.bin, expected.bin
# model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH) # Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation

# '''
# 4. IMPLEMENTATION
'''
VERIFY & EXPORT
'''
export_inference(loaded_model, hw, batch_size=1)
verify_inference(loaded_model, hw, SIM=SIM, SIM_PATH=SIM_PATH)

# a. FPGA: Open vivado, source vivado_flow.tcl
# b. ASIC: Set PDK paths, run syn.tcl & pnr.tcl
# c. Compile C firmware with generated header (config_fw.h) and run on device
# '''
# Predict the model's performance on the hardware described by `hw` and
# pretty-print the result with a 4-space indent.
d_perf = predict_model_performance(hw)
pp = pprint.PrettyPrinter(indent=4)
# Plain string literal: the message has no placeholders, so no f-prefix.
print("Predicted Performance")
pp.pprint(d_perf)

0 comments on commit 75d0edb

Please sign in to comment.