GPU Image Performance Tiles Test

CUDA Convolution test comparing image tiled and untiled blur filters.

Run the demo

This demo can be build for Windows and Linux.

Windows
(1) Use the Visual Studio solution.
(2) CMake
Create folder build_windows
cmake ../
cmake ../ -G "Visual Studio 15 2017"

Linux
Create folder build_linux .
cmake ../
make

Requirements:
CUDA Toolkit
CMake (>=3.26)

Results

Typical output:

Filter: 7 x 7
image size: 5200 x 6500
filter                                  time                FPS
---------------------------------------------------------------------
gauss_gpu_tiles                         9.57 [ms]           104.49
gauss_gpu                               8.82 [ms]           113.38

Untiled image filter

__global__ void conv_kernel(const float* __restrict__ in, float * __restrict__ out, int w, int h)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned int x = idx % w;
    unsigned int c = idx / w;   
    unsigned int w3 = 3 * w;

    if ((idx >= w3) || (y >= h))
        return;

    unsigned int id = y * w3 + c*w + x;   // image index RGB/NCWH

    if (x <= R-1 || y <= R-1 || x >= w-R || y >= h-R)
    {
        out[id] = in[id];
        return;
    }
    
    float sum = 0;
    for(int i = -R; i <=R; i++)
    for(int j = -R; j <=R; j++)
    {
        sum += in[id + j + w3*i] *kernel_blur[(i + R) * D + (j + R)];
    }

    out[id] = sum;
}

Tiled image filter

__global__ void conv_tiles_kernel(const float* __restrict__ in, float * __restrict__ out, int w, int h)
{
    const int BLOCK_W = (TILE_W + 2*R);
    const int BLOCK_H = (TILE_H + 2*R);
    __shared__ float smem[BLOCK_H][BLOCK_W];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int idy = blockIdx.y*blockDim.y+threadIdx.y;

    unsigned int x = idx % w;
    unsigned int c = idx / w;
    unsigned int y = idy;

    int w3 = w * 3;
    int idx_out = y * w3 + c * w + x;

    // load central block
    smem[ty+R][tx+R] = in[idx_out]; 
    // load left bar
    if ((x >= R) && (tx < R)) smem[ty+R][tx] = in[idx_out-R];
    // load right bar
    if ((x < w-R) && (tx >= blockDim.x-R) ) smem[ty+R][tx+2*R] = in[idx_out+R];
    // load top bar
    if ((y > R-1) && (ty < R)) smem[ty][tx+R] = in[idx_out-R*w3];
    // load bottom bar
    if ((y < h-R) && (ty >=blockDim.y-R)) smem[ty+2*R][tx+R] = in[idx_out+R*w3];
    // load UL corner
    if ((x > R-1) && (y > R-1) && (tx < R) && (ty < R)) smem[ty][tx] = in[idx_out - R*w3 - R];
    // load UR corner
    if ((x < w-R) && (y > R-1) && (tx >= blockDim.x-R) && (ty < R) ) smem[ty][tx+2*R] = in[idx_out - R * w3 + R];
    // load LL corner
    if ((x > R-1) && (y < h-R) && (tx < R) && (ty >= blockDim.y-R) ) smem[ty+2*R][tx] = in[idx_out + R * w3 - R];
    // load LR corner
    if ((x < w-R) && (y < h-R) && (tx >=blockDim.x-R) && (ty >= blockDim.y-R)) smem[ty+2*R][tx+2*R] = in[idx_out + R * w3 + R];
    
    __syncthreads();

    if (x <= R-1 || y <= R-1 || x >= w-R || y >= h-R)
    {
        out[idx_out] = in[idx_out];
        return;
    }

    float sum = 0;
    for(int i = -R; i <=R; i++)
    for(int j = -R; j <=R; j++)
    {
        sum += smem[ty+R+i][tx+R+j] * kernel_blur[(i+R)*D+(j+R)];
    }
    out[idx_out] = sum;
}

Benchmark comparison

The figure below shows the comparison of the tiled and untiled CUDA convolution image filter for kernel radius between 1-16.

These results shows that tiled implementation with CUDA, using effectively the shared memory is accually not better for filter size less than 11 x 11. For detailed discussion See.

Name		Name	Last commit message	Last commit date
Latest commit History 10 Commits
Sources		Sources
README.md		README.md
bulr_performance.png		bulr_performance.png

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

GPU Image Performance Tiles Test

Results

Untiled image filter

Tiled image filter

Benchmark comparison

About

Releases

Packages

Languages

antonmilev/GPU_Image_tiles_test

Folders and files

Latest commit

History

Repository files navigation

GPU Image Performance Tiles Test

Results

Untiled image filter

Tiled image filter

Benchmark comparison

About

Resources

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages