diff --git a/deepsocflow/c/xilinx_example.c b/deepsocflow/c/xilinx_example.c
index 1913565..9a08fe9 100644
--- a/deepsocflow/c/xilinx_example.c
+++ b/deepsocflow/c/xilinx_example.c
@@ -13,7 +13,7 @@ int main()
 
   // Memory_st *p_mem = (Memory_st *)mmap(NULL, sizeof(Memory_st), PROT_READ | PROT_WRITE, MAP_SHARED, dh, MEM_BASEADDR);
   // void *p_config = mmap(NULL, 4*16+N_BUNDLES*32, PROT_READ | PROT_WRITE, MAP_SHARED, dh, CONFIG_BASEADDR);
-  xil_printf("Welcome to DeepSoCFlow!\n Store wbx at: %p; y:%p; buffers {0:%p,1:%p};\n", &p_mem->w, &mp->y, &p_mem->out_buffers[0], &p_mem->out_buffers[1]);
+  xil_printf("Welcome to DeepSoCFlow!\n Store wbx at: %p; y:%p; buffers {0:%p,1:%p};\n", &p_mem->w, &p_mem->y, &p_mem->out_buffers[0], &p_mem->out_buffers[1]);
 
   model_setup(p_mem, p_config);
   model_run_timed(p_mem, p_config, 20); // run model and measure time
diff --git a/deepsocflow/py/dataflow.py b/deepsocflow/py/dataflow.py
index 61c5799..ab3034d 100644
--- a/deepsocflow/py/dataflow.py
+++ b/deepsocflow/py/dataflow.py
@@ -313,28 +313,29 @@ def predict_bundle_performance(hw, r):
 
     utilization = operations / (hw.ROWS * hw.COLS * clocks)
 
-    return clocks, mem_bits, utilization
+    return clocks, mem_bits, utilization, operations
 
 
 def predict_model_performance(hw):
 
     d_out = {
-        'clocks_total': 0,
-        'mem_bytes_total': 0,
+        'operations': [],
         'utilization_all': [],
         'clocks_all': [],
         'mem_bytes_all': [],
     }
 
     for b in BUNDLES:
-        clocks, mem_bits, utilization = predict_bundle_performance(hw=hw, r=b.r)
-        d_out['clocks_total'] += clocks
-        d_out['mem_bytes_total'] += mem_bits/8
-
+        clocks, mem_bits, utilization, operations = predict_bundle_performance(hw=hw, r=b.r)
+        d_out['operations'] += [operations]
         d_out['utilization_all'] += [utilization]
         d_out['clocks_all'] += [clocks]
         d_out['mem_bytes_all'] += [mem_bits/8]
         print(f'---{b.ib}: util:{100*utilization:.2f} mem_mb:{mem_bits/1024**2:.2f} {b.r.XN=} {b.r.XH=} {b.r.XW=} {b.r.CI=} {b.r.CO=} {b.r.KH=} {b.r.KW=}')
 
+    d_out['g_ops'] = sum(d_out['operations'])/1e9
+    d_out['clocks_total'] = sum(d_out['clocks_all'])
+    d_out['mem_bytes_total'] = sum(d_out['mem_bytes_all'])
     d_out['seconds_per_batch'] = d_out['clocks_total'] / (hw.FREQ * 1e6)
     d_out['frames_per_sec'] = hw.ROWS / d_out['seconds_per_batch']
 
diff --git a/docs/overview.png b/docs/overview.png
new file mode 100644
index 0000000..cb008df
Binary files /dev/null and b/docs/overview.png differ
diff --git a/run/example.py b/run/example.py
index 17c3307..0c7fe3b 100644
--- a/run/example.py
+++ b/run/example.py
@@ -1,41 +1,23 @@
+import os
+import pytest
+import itertools
 import sys
 sys.path.append("../../")
-from deepsocflow import Bundle, Hardware, QModel, QInput
 
 from tensorflow import keras
-from keras.layers import Input, Flatten
-from qkeras import Model
-from qkeras.utils import load_qmodel
-
-import numpy as np
+from keras.layers import Input
+from keras.models import Model, save_model
 from keras.datasets import mnist
 from keras.optimizers import Adam
 from keras.utils import to_categorical
+from qkeras.utils import load_qmodel
+import numpy as np
+import pprint
+# import tensorflow as tf
+#tf.keras.utils.set_random_seed(0)
-'''
-0. Specify Hardware
-'''
-hw = Hardware (                          # Alternatively: hw = Hardware.from_json('hardware.json')
-        processing_elements = (8, 24)  , # (rows, columns) of multiply-add units
-        frequency_mhz       = 250      , #
-        bits_input          = 8        , # bit width of input pixels and activations
-        bits_weights        = 8        , # bit width of weights
-        bits_sum            = 24       , # bit width of accumulator
-        bits_bias           = 16       , # bit width of bias
-        max_batch_size      = 64       , #
-        max_channels_in     = 2048     , #
-        max_kernel_size     = 13       , #
-        max_image_size      = 512      , #
-        ram_weights_depth   = 20       , #
-        ram_edges_depth     = 288      , #
-        axi_width           = 64       , #
-        target_cpu_int_bits = 32       , #
-        valid_prob          = 1        , # probability in which AXI-Stream s_valid signal should be toggled in simulation
-        ready_prob          = 1        , # probability in which AXI-Stream m_ready signal should be toggled in simulation
-        data_dir            = 'vectors', # directory to store generated test vectors
-    )
-hw.export()                              # Generates: config_hw.svh, config_hw.tcl, config_tb.svh, hardware.json
-hw.export_vivado_tcl(board='zcu104')
+from deepsocflow import *
 
+(SIM, SIM_PATH) = ('xsim', "F:/Xilinx/Vivado/2022.2/bin/") if os.name=='nt' else ('verilator', '')
 
 '''
 Dataset
@@ -43,7 +25,6 @@
 
 NB_EPOCH = 2
 BATCH_SIZE = 64
-VERBOSE = 1
 VALIDATION_SPLIT = 0.1
 NB_CLASSES = 10
 
@@ -57,107 +38,154 @@
 y_train = to_categorical(y_train, NB_CLASSES)
 y_test = to_categorical(y_test, NB_CLASSES)
 
+input_shape = x_train.shape[1:]
+
 '''
-1. Build Model
+Define Model
 '''
-XN = 1
-input_shape = (XN,18,18,3) # (XN, XH, XW, CI)
-
-QINT_BITS = 0
-qq = f'quantized_bits({hw.K_BITS},{QINT_BITS},False,True,1)'
-qr = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0)'
-ql = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0.125)'
-kq = bq = qq
-
+sys_bits = SYS_BITS(x=4, k=4, b=16)
 
 @keras.saving.register_keras_serializable()
-class UserModel(QModel):
-
-    def __init__(self, x_bits, x_int_bits):
-        super().__init__(x_bits, x_int_bits)
-        self.b1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qr}, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({x_bits},0,False,False,1)'})
-        # self.b2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qq}, add = {'act_str':f'quantized_bits({x_bits},0,False,True,1)'})
-        # self.b3 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':qq}, add = {'act_str':f'quantized_bits({x_bits},0,False,True,1)'})
-        # self.b4 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}, add = {'act_str':f'quantized_bits({x_bits},0,False,True,1)'})
-        self.b5 = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qr},)
-        # self.b6 = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}) #, flatten= True)
-        self.flat = Flatten()
-        self.b7 = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}, softmax= True)
-
-    def call(self, input_tensor, training=False):
-        x = self.quantize_input(input_tensor) # implicit, from QModel
+class UserModel(XModel):
+    def __init__(self, sys_bits, x_int_bits, *args, **kwargs):
+        super().__init__(sys_bits, x_int_bits, *args, **kwargs)
+
+        self.b1 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=8, kernel_size=7, strides=(2,1),
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
+            pool=XPool(
+                type='avg', pool_size=(3,4), strides=(2,3), padding='same',
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),)
+            )
+
+        self.b2 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=8, kernel_size=1,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None)),
+            add_act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0.125)
+            )
+
+        self.b3 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=8, kernel_size=7,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
+            add_act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)
+            )
+
+        self.b4 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=8, kernel_size=5,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
+            add_act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)
+            )
+
+        self.b5 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=24, kernel_size=3,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
+            )
+
+        self.b6 = XBundle(
+            core=XConvBN(
+                k_int_bits=0, b_int_bits=0, filters=10, kernel_size=1,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
+            flatten=True
+            )
+
+        self.b7 = XBundle(
+            core=XDense(
+                k_int_bits=0, b_int_bits=0, units=NB_CLASSES, use_bias=False,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
+            softmax=True
+            )
+
+    def call (self, x):
+        x = self.input_quant_layer(x)
 
         x = x_skip1 = self.b1(x)
-        # x = x_skip2 = self.b2(x, x_skip1)
-        # x = self.b3(x, x_skip2)
-        # x = self.b4(x, x_skip1)
+        x = x_skip2 = self.b2(x, x_skip1)
+        x = self.b3(x, x_skip2)
+        x = self.b4(x, x_skip1)
         x = self.b5(x)
-        # x = self.b6(x)
-        x = self.flat(x)
+        x = self.b6(x)
         x = self.b7(x)
         return x
 
-    # def __init__(self, x_bits, x_int_bits):
-    #     super().__init__(x_bits, x_int_bits)
+x = x_in = Input(input_shape, name="input")
+user_model = UserModel(sys_bits=sys_bits, x_int_bits=0)
+x = user_model(x_in)
 
-    #     self.b1 = Bundle( core= {'type':'conv' , 'filters':32 , 'kernel_size':(3,3), 'strides':(2,2), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':qr})
-    #     self.flat = Flatten()
-    #     self.b4 = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':ql}, softmax= True)
-
-    # def call(self, input_tensor, training=False):
-    #     x = self.quantize_input(input_tensor)
+model = Model(inputs=[x_in], outputs=[x])
 
-    #     x = self.b1(x)
-    #     x = self.flat(x)
-    #     x = self.b4(x)
-    #     return x
-x_in = Input(x_train.shape[1:], name="input")
-user_model = UserModel(x_bits=hw.X_BITS, x_int_bits=0)
-x_out = user_model(x_in)
-model = Model(inputs=[x_in], outputs=[x_out])
 
+'''
+Train Model
+'''
 model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.0001), metrics=["accuracy"])
-model.summary()
 
+history = model.fit(
+    x_train,
+    y_train,
+    batch_size=BATCH_SIZE,
+    epochs=NB_EPOCH,
+    initial_epoch=1,
+    verbose=True,
+    validation_split=VALIDATION_SPLIT)
+
 '''
-2. TRAIN (using qkeras)
+Save & Reload
 '''
-history = model.fit(
-        x_train,
-        y_train,
-        batch_size=BATCH_SIZE,
-        epochs=NB_EPOCH,
-        initial_epoch=1,
-        verbose=VERBOSE,
-        validation_split=VALIDATION_SPLIT)
-
-keras.models.save_model(model, "mnist.h5")
+
+save_model(model, "mnist.h5")
 loaded_model = load_qmodel("mnist.h5")
+
 score = loaded_model.evaluate(x_test, y_test, verbose=0)
 print(f"Test loss:{score[0]}, Test accuracy:{score[1]}")
 
-# print(loaded_model.layers[1].conv1.get_raw())
-
-
+'''
+Specify Hardware
+'''
+hw = Hardware (                          # Alternatively: hw = Hardware.from_json('hardware.json')
+        processing_elements = (8, 24)  , # (rows, columns) of multiply-add units
+        frequency_mhz       = 250      , #
+        bits_input          = 4        , # bit width of input pixels and activations
+        bits_weights        = 4        , # bit width of weights
+        bits_sum            = 20       , # bit width of accumulator
+        bits_bias           = 16       , # bit width of bias
+        max_batch_size      = 64       , #
+        max_channels_in     = 512      , #
+        max_kernel_size     = 9        , #
+        max_image_size      = 512      , #
+        max_n_bundles       = 64       ,
+        ram_weights_depth   = 512      , #
+        ram_edges_depth     = 3584     , #
+        axi_width           = 128      , #
+        config_baseaddr     = "B0000000",
+        target_cpu_int_bits = 32       , #
+        valid_prob          = 1        , # probability in which AXI-Stream s_valid signal should be toggled in simulation
+        ready_prob          = 1        , # probability in which AXI-Stream m_ready signal should be toggled in simulation
+        data_dir            = 'vectors', # directory to store generated test vectors
+    )
 
-# '''
-# 3. EXPORT FOR INFERENCE
-# '''
-# SIM, SIM_PATH = 'xsim', "F:/Xilinx/Vivado/2022.1/bin/" # For Xilinx Vivado
-# # SIM, SIM_PATH = 'verilator', "" # For Verilator
+hw.export_json()
+hw = Hardware.from_json('hardware.json')
+hw.export() # Generates: config_hw.svh, config_hw.tcl
+hw.export_vivado_tcl(board='zcu104')
 
-# model.export_inference(x=model.random_input, hw=hw) # Runs forward pass in float & int, compares them. Generates: config_fw.h (C firmware), weights.bin, expected.bin
-# model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH) # Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation
-'''
-# 4. IMPLEMENTATION
+'''
+VERIFY & EXPORT
+'''
+export_inference(loaded_model, hw, batch_size=1)
+verify_inference(loaded_model, hw, SIM=SIM, SIM_PATH=SIM_PATH)
 
-# a. FPGA: Open vivado, source vivado_flow.tcl
-# b. ASIC: Set PDK paths, run syn.tcl & pnr.tcl
-# c. Compile C firmware with generated header (config_fw.h) and run on device
-# '''
\ No newline at end of file
+d_perf = predict_model_performance(hw)
+pp = pprint.PrettyPrinter(indent=4)
+print(f"Predicted Performance")
+pp.pprint(d_perf)
\ No newline at end of file