-
Notifications
You must be signed in to change notification settings - Fork 50
Basic Examples With Poorly Drawn Diagrams
JulianKemmerer edited this page May 30, 2022
·
22 revisions
- Floating Point Adder
- Binary Adder Tree
- Low latency, high resource usage matrix multiply
- High latency, low resource usage matrix multiply
Here is the PipelineC code that describes a pipelined floating point adder:
float main(float x, float y)
{
return x + y;
}
Here is the PipelineC code that describes a pipelined binary tree summation of 8 values by instantiating 7 floating point adders:
float main(float x0, float x1, float x2, float x3,
float x4, float x5, float x6, float x7)
{
// First layer of 4 sums in parallel
float sum0;
float sum1;
float sum2;
float sum3;
sum0 = x0 + x1;
sum1 = x2 + x3;
sum2 = x4 + x5;
sum3 = x6 + x7;
// Next layer of two sums in parallel
float sum4;
float sum5;
sum4 = sum0 + sum1;
sum5 = sum2 + sum3;
// Final layer of a single sum
float sum6;
sum6 = sum4 + sum5;
return sum6;
}
This code describes a pipelined floating point matrix multiplier by instantiating::
- N^3 floating point multipliers
- N^2 binary tree summations of N elements ('float_array_sumN') (of which each is is 2^(log2(N)+1)-1 floating point adders )
// Lowest latency, most resource usage
// Resource usage grows O(N^3)
#include "uintN_t.h"
#define N 2
#define float_array_sumN float_array_sum2 // Built in function
#define iter_t uint1_t
typedef struct an_array_t
{
float a[N][N];
} an_array_t;
an_array_t main(float mat1[N][N], float mat2[N][N])
{
an_array_t res;
iter_t i;
iter_t j;
iter_t k;
for (i = 0; i < N; i = i + 1)
{
for (j = 0; j < N; j = j + 1)
{
float res_k[N];
for (k = 0; k < N; k = k + 1)
{
res_k[k] = mat1[i][k] * mat2[k][j];
}
res.a[i][j] = float_array_sumN(res_k);
}
}
return res;
}
The diagram I tried to hand draw for this was embarrassing. Use your imagination for now.
This code describes two floating point units, a multiplier and an adder. The multiply+add computation takes M total clock cycles. Every M clock cycles the i,j,k iterators are incremented to unroll the loop. Latency = N^3 * M clock cycles.
// High latency, low resource usage matrix multiply
#include "uintN_t.h"
// Matrix dimension
#define N 2
#define N_MINUS_1 1 // To not need an extra bit in iter_t
#define iter_t uint1_t
// Type holding result matrix
typedef struct result_matrix_t
{
float mat[N][N];
uint1_t done;
} result_matrix_t;
// Indicates if volatile stateful variables are valid
volatile uint1_t volatiles_valid;
// Volatile stateful variables
// i,j,k iterators
volatile iter_t i;
volatile iter_t j;
volatile iter_t k;
// Result matrix
volatile float mat[N][N];
// An unrolled version of the i,j,k matrix multiply nested loops
/*
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
{
for (k = 0; k < N; k++)
res[i][j] += mat1[i][k] * mat2[k][j];
}
}
*/
// Signal the start of the operation with 'start'
// return value '.done' signals the completion of the operation
// Both input matrices are assumed to have data from start->done
result_matrix_t main(
uint1_t start,
float mat1[N][N], float mat2[N][N])
{
// Reset everything if this is the start
if(start)
{
// Validate the volatiles as being reset to 0
volatiles_valid = 1;
// Clear the matrix and iterators
i = 0;
j = 0;
k = 0;
for (i = 0; i < N; i = i + 1)
{
for (j = 0; j < N; j = j + 1)
{
mat[i][j] = 0;
}
}
}
// Default return accumulated result and not done yet
result_matrix_t rv;
rv.mat = mat;
rv.done = 0;
// Accumulate into volatile state matrix
// (only if the volatile stateful variables have valid data)
if(volatiles_valid)
{
// This is like unrolling to a
// single iteration of the inner most loop
// Do the math for this iteration
// Two separate floating point operatations
// Multiply
float product;
product = mat1[i][k] * mat2[k][j];
// Accumulate
mat[i][j] = mat[i][j] + product;
// Done?
if(
(i == N_MINUS_1) &
(j == N_MINUS_1) &
(k == N_MINUS_1)
)
{
// Multiply is done now
rv.mat = mat;
rv.done = 1;
// Invalidate state, wait for start again
volatiles_valid = 0;
}
// Increment iterators unless going back to start
// Increment k every time
// k
if(k == N_MINUS_1)
{
// End of k loop, reset
k = 0;
}
else
{
// Increment k
k = k + 1;
}
// j
// Increment j when k is done
if(k == 0) // Was N-1
{
if(j == N_MINUS_1)
{
// End of j loop, reset
j = 0;
}
else
{
// Increment j
j = j + 1;
}
}
// i
// Increment i when j and k are done
if(
(j == 0) & // Was N-1
(k == 0) // Was N-1
)
{
if (i == N_MINUS_1)
{
// End of i loop, reset
i = 0;
}
else
{
// Increment i
i = i + 1;
}
}
}
return rv;
}
Diagram: TBD