Skip to content

antonmilev/GPU_Image_tiles_test

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

10 Commits
 
 
 
 
 
 

Repository files navigation

GPU Image Performance Tiles Test

CUDA Convolution test comparing image tiled and untiled blur filters.

Run the demo

This demo can be build for Windows and Linux.

Windows
(1) Use the Visual Studio solution.
(2) CMake
Create folder build_windows
cmake ../
cmake ../ -G "Visual Studio 15 2017"

Linux
Create folder build_linux .
cmake ../
make

Requirements:
CUDA Toolkit
CMake (>=3.26)

Results

Typical output:

Filter: 7 x 7
image size: 5200 x 6500
filter                                  time                FPS
---------------------------------------------------------------------
gauss_gpu_tiles                         9.57 [ms]           104.49
gauss_gpu                               8.82 [ms]           113.38

Untiled image filter

__global__ void conv_kernel(const float* __restrict__ in, float * __restrict__ out, int w, int h)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned int x = idx % w;
    unsigned int c = idx / w;   
    unsigned int w3 = 3 * w;

    if ((idx >= w3) || (y >= h))
        return;

    unsigned int id = y * w3 + c*w + x;   // image index RGB/NCWH

    if (x <= R-1 || y <= R-1 || x >= w-R || y >= h-R)
    {
        out[id] = in[id];
        return;
    }
    
    float sum = 0;
    for(int i = -R; i <=R; i++)
    for(int j = -R; j <=R; j++)
    {
        sum += in[id + j + w3*i] *kernel_blur[(i + R) * D + (j + R)];
    }

    out[id] = sum;
}

Tiled image filter

__global__ void conv_tiles_kernel(const float* __restrict__ in, float * __restrict__ out, int w, int h)
{
    const int BLOCK_W = (TILE_W + 2*R);
    const int BLOCK_H = (TILE_H + 2*R);
    __shared__ float smem[BLOCK_H][BLOCK_W];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int idy = blockIdx.y*blockDim.y+threadIdx.y;

    unsigned int x = idx % w;
    unsigned int c = idx / w;
    unsigned int y = idy;

    int w3 = w * 3;
    int idx_out = y * w3 + c * w + x;

    // load central block
    smem[ty+R][tx+R] = in[idx_out]; 
    // load left bar
    if ((x >= R) && (tx < R)) smem[ty+R][tx] = in[idx_out-R];
    // load right bar
    if ((x < w-R) && (tx >= blockDim.x-R) ) smem[ty+R][tx+2*R] = in[idx_out+R];
    // load top bar
    if ((y > R-1) && (ty < R)) smem[ty][tx+R] = in[idx_out-R*w3];
    // load bottom bar
    if ((y < h-R) && (ty >=blockDim.y-R)) smem[ty+2*R][tx+R] = in[idx_out+R*w3];
    // load UL corner
    if ((x > R-1) && (y > R-1) && (tx < R) && (ty < R)) smem[ty][tx] = in[idx_out - R*w3 - R];
    // load UR corner
    if ((x < w-R) && (y > R-1) && (tx >= blockDim.x-R) && (ty < R) ) smem[ty][tx+2*R] = in[idx_out - R * w3 + R];
    // load LL corner
    if ((x > R-1) && (y < h-R) && (tx < R) && (ty >= blockDim.y-R) ) smem[ty+2*R][tx] = in[idx_out + R * w3 - R];
    // load LR corner
    if ((x < w-R) && (y < h-R) && (tx >=blockDim.x-R) && (ty >= blockDim.y-R)) smem[ty+2*R][tx+2*R] = in[idx_out + R * w3 + R];
    
    __syncthreads();

    if (x <= R-1 || y <= R-1 || x >= w-R || y >= h-R)
    {
        out[idx_out] = in[idx_out];
        return;
    }

    float sum = 0;
    for(int i = -R; i <=R; i++)
    for(int j = -R; j <=R; j++)
    {
        sum += smem[ty+R+i][tx+R+j] * kernel_blur[(i+R)*D+(j+R)];
    }
    out[idx_out] = sum;
}

Benchmark comparison

The figure below shows the comparison of the tiled and untiled CUDA convolution image filter for kernel radius between 1-16.

These results shows that tiled implementation with CUDA, using effectively the shared memory is accually not better for filter size less than 11 x 11. For detailed discussion See.

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages