diff --git a/deepsocflow/c/xilinx_example.c b/deepsocflow/c/xilinx_example.c
index 1913565..9a08fe9 100644
--- a/deepsocflow/c/xilinx_example.c
+++ b/deepsocflow/c/xilinx_example.c
@@ -13,7 +13,7 @@ int main()
 
   // Memory_st *p_mem = (Memory_st *)mmap(NULL, sizeof(Memory_st), PROT_READ | PROT_WRITE, MAP_SHARED, dh, MEM_BASEADDR);
   // void *p_config = mmap(NULL, 4*16+N_BUNDLES*32, PROT_READ | PROT_WRITE, MAP_SHARED, dh, CONFIG_BASEADDR);
-  xil_printf("Welcome to DeepSoCFlow!\n Store wbx at: %p; y:%p; buffers {0:%p,1:%p};\n", &p_mem->w, &mp->y, &p_mem->out_buffers[0], &p_mem->out_buffers[1]);
+  xil_printf("Welcome to DeepSoCFlow!\n Store wbx at: %p; y:%p; buffers {0:%p,1:%p};\n", &p_mem->w, &p_mem->y, &p_mem->out_buffers[0], &p_mem->out_buffers[1]);
 
   model_setup(p_mem, p_config);
   model_run_timed(p_mem, p_config, 20); // run model and measure time
diff --git a/deepsocflow/py/dataflow.py b/deepsocflow/py/dataflow.py
index 61c5799..ab3034d 100644
--- a/deepsocflow/py/dataflow.py
+++ b/deepsocflow/py/dataflow.py
@@ -313,28 +313,29 @@ def predict_bundle_performance(hw, r):
 
     utilization = operations / (hw.ROWS * hw.COLS * clocks)
 
-    return clocks, mem_bits, utilization
+    return clocks, mem_bits, utilization, operations
 
 
 def predict_model_performance(hw):
 
     d_out = {
-        'clocks_total': 0,
-        'mem_bytes_total': 0,
+        'operations': [],
         'utilization_all': [],
         'clocks_all': [],
         'mem_bytes_all': [],
     }
 
     for b in BUNDLES:
-        clocks, mem_bits, utilization = predict_bundle_performance(hw=hw, r=b.r)
-        d_out['clocks_total'] += clocks
-        d_out['mem_bytes_total'] += mem_bits/8
-
+        clocks, mem_bits, utilization, operations = predict_bundle_performance(hw=hw, r=b.r)
+        d_out['operations'] += [operations]
         d_out['utilization_all'] += [utilization]
         d_out['clocks_all'] += [clocks]
         d_out['mem_bytes_all'] += [mem_bits/8]
         print(f'---{b.ib}: util:{100*utilization:.2f} mem_mb:{mem_bits/1024**2:.2f} {b.r.XN=} {b.r.XH=} {b.r.XW=} {b.r.CI=} {b.r.CO=} {b.r.KH=} {b.r.KW=}')
 
+    d_out['g_ops'] = sum(d_out['operations'])/1e9
+    d_out['clocks_total'] = sum(d_out['clocks_all'])
+    d_out['mem_bytes_total'] = sum(d_out['mem_bytes_all'])
     d_out['seconds_per_batch'] = d_out['clocks_total'] / (hw.FREQ * 1e6)
     d_out['frames_per_sec'] = hw.ROWS / d_out['seconds_per_batch']
 
diff --git a/docs/overview.png b/docs/overview.png
new file mode 100644
index 0000000..cb008df
Binary files /dev/null and b/docs/overview.png differ
diff --git a/run/example.py b/run/example.py
index 17c3307..0c7fe3b 100644
--- a/run/example.py
+++ b/run/example.py
@@ -1,41 +1,23 @@
+import os
+import pytest
+import itertools
 import sys
 sys.path.append("../../")
-from deepsocflow import Bundle, Hardware, QModel, QInput
 
 from tensorflow import keras
-from keras.layers import Input, Flatten
-from qkeras import Model
-from qkeras.utils import load_qmodel
-
-import numpy as np
+from keras.layers import Input
+from keras.models import Model, save_model
 from keras.datasets import mnist
 from keras.optimizers import Adam
 from keras.utils import to_categorical
+from qkeras.utils import load_qmodel
+import numpy as np
+import pprint
+# import tensorflow as tf
+#tf.keras.utils.set_random_seed(0)
-'''
-0. Specify Hardware
-'''
-hw = Hardware (                          # Alternatively: hw = Hardware.from_json('hardware.json')
-        processing_elements = (8, 24)  , # (rows, columns) of multiply-add units
-        frequency_mhz       = 250      , #
-        bits_input          = 8        , # bit width of input pixels and activations
-        bits_weights        = 8        , # bit width of weights
-        bits_sum            = 24       , # bit width of accumulator
-        bits_bias           = 16       , # bit width of bias
-        max_batch_size      = 64       , #
-        max_channels_in     = 2048     , #
-        max_kernel_size     = 13       , #
-        max_image_size      = 512      , #
-        ram_weights_depth   = 20       , #
-        ram_edges_depth     = 288      , #
-        axi_width           = 64       , #
-        target_cpu_int_bits = 32       , #
-        valid_prob          = 1        , # probability in which AXI-Stream s_valid signal should be toggled in simulation
-        ready_prob          = 1        , # probability in which AXI-Stream m_ready signal should be toggled in simulation
-        data_dir            = 'vectors', # directory to store generated test vectors
-    )
-hw.export()                              # Generates: config_hw.svh, config_hw.tcl, config_tb.svh, hardware.json
-hw.export_vivado_tcl(board='zcu104')
+from deepsocflow import *
 
+(SIM, SIM_PATH) = ('xsim', "F:/Xilinx/Vivado/2022.2/bin/") if os.name=='nt' else ('verilator', '')
 
 '''
 Dataset
@@ -43,7 +25,6 @@
 
 NB_EPOCH = 2
 BATCH_SIZE = 64
-VERBOSE = 1
 VALIDATION_SPLIT = 0.1
 NB_CLASSES = 10
 
@@ -57,107 +38,154 @@
 y_train = to_categorical(y_train, NB_CLASSES)
 y_test = to_categorical(y_test, NB_CLASSES)
 
+input_shape = x_train.shape[1:]
+
 '''
-1. Build Model
+Define Model
 '''
-XN = 1
-input_shape = (XN,18,18,3) # (XN, XH, XW, CI)
-
-QINT_BITS = 0
-qq = f'quantized_bits({hw.K_BITS},{QINT_BITS},False,True,1)'
-qr = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0)'
-ql = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0.125)'
-kq = bq = qq
-
+sys_bits = SYS_BITS(x=4, k=4, b=16)
 
 @keras.saving.register_keras_serializable()
-class UserModel(QModel):
-
-    def __init__(self, x_bits, x_int_bits):
-        super().__init__(x_bits, x_int_bits)
-        self.b1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qr}, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({x_bits},0,False,False,1)'})
-        # self.b2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qq}, add = {'act_str':f'quantized_bits({x_bits},0,False,True,1)'})
-        # self.b3 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':qq}, add = {'act_str':f'quantized_bits({x_bits},0,False,True,1)'})
-        # self.b4 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}, add = {'act_str':f'quantized_bits({x_bits},0,False,True,1)'})
-        self.b5 = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qr},)
-        # self.b6 = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}) #, flatten= True)
-        self.flat = Flatten()
-        self.b7 = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}, softmax= True)
-
-    def call(self, input_tensor, training=False):
-        x = self.quantize_input(input_tensor) # implicit, from QModel
+class UserModel(XModel):
+    def __init__(self, sys_bits, x_int_bits, *args, **kwargs):
+        super().__init__(sys_bits, x_int_bits, *args, **kwargs)
+
+        self.b1 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=8, kernel_size=7, strides=(2,1),
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
+            pool=XPool(
+                type='avg', pool_size=(3,4), strides=(2,3), padding='same',
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),)
+            )
+
+        self.b2 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=8, kernel_size=1,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None)),
+            add_act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0.125)
+            )
+
+        self.b3 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=8, kernel_size=7,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
+            add_act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)
+            )
+
+        self.b4 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=8, kernel_size=5,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
+            add_act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)
+            )
+
+        self.b5 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=24, kernel_size=3,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
+            )
+
+        self.b6 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=10, kernel_size=1,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
+            flatten=True
+            )
+
+        self.b7 = XBundle(
+            core=XDense(
+                k_int_bits=0, b_int_bits=0, units=NB_CLASSES, use_bias=False,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
+            softmax=True
+            )
+
+    def call (self, x):
+        x = self.input_quant_layer(x)
 
         x = x_skip1 = self.b1(x)
-        # x = x_skip2 = self.b2(x, x_skip1)
-        # x = self.b3(x, x_skip2)
-        # x = self.b4(x, x_skip1)
+        x = x_skip2 = self.b2(x, x_skip1)
+        x = self.b3(x, x_skip2)
+        x = self.b4(x, x_skip1)
         x = self.b5(x)
-        # x = self.b6(x)
-        x = self.flat(x)
+        x = self.b6(x)
         x = self.b7(x)
         return x
 
-    # def __init__(self, x_bits, x_int_bits):
-    #     super().__init__(x_bits, x_int_bits)
+x = x_in = Input(input_shape, name="input")
+user_model = UserModel(sys_bits=sys_bits, x_int_bits=0)
+x = user_model(x_in)
 
-    #     self.b1 = Bundle( core= {'type':'conv' , 'filters':32 , 'kernel_size':(3,3), 'strides':(2,2), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qr})
-    #     self.flat = Flatten()
-    #     self.b4 = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}, softmax= True)
-
-    # def call(self, input_tensor, training=False):
-    #     x = self.quantize_input(input_tensor)
+model = Model(inputs=[x_in], outputs=[x])
 
-    #     x = self.b1(x)
-    #     x = self.flat(x)
-    #     x = self.b4(x)
-    #     return x
-x_in = Input(x_train.shape[1:], name="input")
-user_model = UserModel(x_bits=hw.X_BITS, x_int_bits=0)
-x_out = user_model(x_in)
-model = Model(inputs=[x_in], outputs=[x_out])
 
+'''
+Train Model
+'''
 model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.0001), metrics=["accuracy"])
-model.summary()
 
+history = model.fit(
+    x_train,
+    y_train,
+    batch_size=BATCH_SIZE,
+    epochs=NB_EPOCH,
+    initial_epoch=1,
+    verbose=True,
+    validation_split=VALIDATION_SPLIT)
+
 '''
-2. TRAIN (using qkeras)
+Save & Reload
 '''
-history = model.fit(
-        x_train,
-        y_train,
-        batch_size=BATCH_SIZE,
-        epochs=NB_EPOCH,
-        initial_epoch=1,
-        verbose=VERBOSE,
-        validation_split=VALIDATION_SPLIT)
-
-keras.models.save_model(model, "mnist.h5")
+
+save_model(model, "mnist.h5")
 loaded_model = load_qmodel("mnist.h5")
+
 score = loaded_model.evaluate(x_test, y_test, verbose=0)
 print(f"Test loss:{score[0]}, Test accuracy:{score[1]}")
 
-# print(loaded_model.layers[1].conv1.get_raw())
-
-
+'''
+Specify Hardware
+'''
+hw = Hardware (                          # Alternatively: hw = Hardware.from_json('hardware.json')
+        processing_elements = (8, 24)  , # (rows, columns) of multiply-add units
+        frequency_mhz       = 250      , #
+        bits_input          = 4        , # bit width of input pixels and activations
+        bits_weights        = 4        , # bit width of weights
+        bits_sum            = 20       , # bit width of accumulator
+        bits_bias           = 16       , # bit width of bias
+        max_batch_size      = 64       , #
+        max_channels_in     = 512      , #
+        max_kernel_size     = 9        , #
+        max_image_size      = 512      , #
+        max_n_bundles       = 64       ,
+        ram_weights_depth   = 512      , #
+        ram_edges_depth     = 3584     , #
+        axi_width           = 128      , #
+        config_baseaddr     = "B0000000",
+        target_cpu_int_bits = 32       , #
+        valid_prob          = 1        , # probability in which AXI-Stream s_valid signal should be toggled in simulation
+        ready_prob          = 1        , # probability in which AXI-Stream m_ready signal should be toggled in simulation
+        data_dir            = 'vectors', # directory to store generated test vectors
+    )
 
-# '''
-# 3. EXPORT FOR INFERENCE
-# '''
-# SIM, SIM_PATH = 'xsim', "F:/Xilinx/Vivado/2022.1/bin/" # For Xilinx Vivado
-# # SIM, SIM_PATH = 'verilator', "" # For Verilator
+hw.export_json()
+hw = Hardware.from_json('hardware.json')
+hw.export() # Generates: config_hw.svh, config_hw.tcl
+hw.export_vivado_tcl(board='zcu104')
 
-# model.export_inference(x=model.random_input, hw=hw) # Runs forward pass in float & int, compares them. Generates: config_fw.h (C firmware), weights.bin, expected.bin
-# model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH) # Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation
-'''
-# 4. IMPLEMENTATION
+'''
+VERIFY & EXPORT
+'''
+export_inference(loaded_model, hw, batch_size=1)
+verify_inference(loaded_model, hw, SIM=SIM, SIM_PATH=SIM_PATH)
 
-# a. FPGA: Open vivado, source vivado_flow.tcl
-# b. ASIC: Set PDK paths, run syn.tcl & pnr.tcl
-# c. Compile C firmware with generated header (config_fw.h) and run on device
-# '''
\ No newline at end of file
+d_perf = predict_model_performance(hw)
+pp = pprint.PrettyPrinter(indent=4)
+print(f"Predicted Performance")
+pp.pprint(d_perf)
\ No newline at end of file