// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file contains an (incomplete) header to interpret OpenCL kernels as CUDA kernels.
//
// =================================================================================================

// Replaces the OpenCL keywords with CUDA equivalent. Note that '__kernel' expands (through the
// intermediate '__placeholder__' macro) to 'extern "C" __global__', whereas the OpenCL address
// space qualifier '__global' expands to nothing.
#define __kernel __placeholder__
#define __global
#define __placeholder__ extern "C" __global__
#define __local __shared__
#define restrict __restrict__
#define __constant const
#define inline __device__ inline // assumes all device functions are annotated with inline in OpenCL

// Replaces OpenCL synchronisation with CUDA synchronisation. The argument (the OpenCL memory-fence
// flags) is discarded: every barrier becomes a plain __syncthreads().
#define barrier(x) __syncthreads()

// Replaces the OpenCL get_xxx_ID with CUDA equivalents. In each of the functions below, 'x' is the
// requested dimension: 0 selects the .x component, 1 selects the .y component, and any other value
// selects the .z component. The unsigned CUDA built-ins are returned as 'int'.

// Index of the calling thread within its thread-block for dimension 'x'
__device__ int get_local_id(int x) {
  if (x == 0) { return threadIdx.x; }
  if (x == 1) { return threadIdx.y; }
  return threadIdx.z;
}

// Index of the calling thread's block within the grid for dimension 'x'
__device__ int get_group_id(int x) {
  if (x == 0) { return blockIdx.x; }
  if (x == 1) { return blockIdx.y; }
  return blockIdx.z;
}

// Index of the calling thread within the whole grid for dimension 'x', computed as
// blockIdx * blockDim + threadIdx for the selected component
__device__ int get_global_id(int x) {
  if (x == 0) { return blockIdx.x*blockDim.x + threadIdx.x; }
  if (x == 1) { return blockIdx.y*blockDim.y + threadIdx.y; } // dimension 1 maps to .y
  return blockIdx.z*blockDim.z + threadIdx.z;
}

// Adds the data-types which are not available natively under CUDA. These are plain structs of
// floats with the fields named s0, s1, ... in order; no operators or alignment are defined here.
typedef struct { float s0; float s1; float s2; float s3;
                 float s4; float s5; float s6; float s7; } float8;
typedef struct { float s0; float s1; float s2; float s3;
                 float s4; float s5; float s6; float s7;
                 float s8; float s9; float s10; float s11;
                 float s12; float s13; float s14; float s15; } float16;

// =================================================================================================