From 313fc796b2a3063cab7b5847864a524efb69aee4 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 14 Oct 2017 16:01:12 +0200 Subject: Fixed several (not all) CUDA kernel compilation issues --- src/kernels/common.opencl | 27 +++++++++++++----------- src/kernels/opencl_to_cuda.h | 49 +++++++++++++++++++++++++++++--------------- 2 files changed, 48 insertions(+), 28 deletions(-) (limited to 'src/kernels') diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 9481881e..a34877d9 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -23,15 +23,18 @@ R"( #endif // ================================================================================================= +#ifndef CUDA -// Enable support for double-precision -#if PRECISION == 16 - #pragma OPENCL EXTENSION cl_khr_fp16: enable -#endif + // Enable support for double-precision + #if PRECISION == 16 + #pragma OPENCL EXTENSION cl_khr_fp16: enable + #endif + + // Enable support for double-precision + #if PRECISION == 64 || PRECISION == 6464 + #pragma OPENCL EXTENSION cl_khr_fp64: enable + #endif -// Enable support for double-precision -#if PRECISION == 64 || PRECISION == 6464 - #pragma OPENCL EXTENSION cl_khr_fp64: enable #endif // Half-precision @@ -254,18 +257,18 @@ R"( // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf // More details: https://github.com/CNugteren/CLBlast/issues/53 #if USE_STAGGERED_INDICES == 1 - INLINE_FUNC size_t GetGroupIDFlat() { + INLINE_FUNC int GetGroupIDFlat() { return get_group_id(0) + get_num_groups(0) * get_group_id(1); } - INLINE_FUNC size_t GetGroupID1() { + INLINE_FUNC int GetGroupID1() { return (GetGroupIDFlat()) % get_num_groups(1); } - INLINE_FUNC size_t GetGroupID0() { + INLINE_FUNC int GetGroupID0() { return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0); } #else - INLINE_FUNC size_t GetGroupID1() { return get_group_id(1); } - INLINE_FUNC size_t GetGroupID0() { return get_group_id(0); } + INLINE_FUNC int GetGroupID1() { return get_group_id(1); } + INLINE_FUNC int GetGroupID0() { return get_group_id(0); } #endif // ================================================================================================= diff --git a/src/kernels/opencl_to_cuda.h b/src/kernels/opencl_to_cuda.h index 2e46bc2b..94a1549e 100644 --- a/src/kernels/opencl_to_cuda.h +++ b/src/kernels/opencl_to_cuda.h @@ -16,32 +16,28 @@ R"( // ================================================================================================= -// Replaces the OpenCL keywords with CUDA equivalent -#define __kernel __placeholder__ -#define __global -#define __placeholder__ extern "C" __global__ -#define __local __shared__ -#define restrict __restrict__ -#define __constant const -#define inline __device__ inline // assumes all device functions are annotated with inline in OpenCL - -// Replaces OpenCL synchronisation with CUDA synchronisation -#define barrier(x) __syncthreads() +// CLBlast specific additions +#define CUDA 1 // Replaces the OpenCL get_xxx_ID with CUDA equivalents -__device__ int get_local_id(int x) { +__device__ int get_local_id(const int x) { if (x == 0) { return threadIdx.x; } if (x == 1) { return threadIdx.y; } return threadIdx.z; } -__device__ int get_group_id(int x) { +__device__ int get_group_id(const int x) { if (x == 0) { return blockIdx.x; } - if (x == 1) { return blockIdx.y;} + if (x == 1) { return blockIdx.y; } return blockIdx.z; } -__device__ int get_global_id(int x) { +__device__ int get_global_size(const int x) { + if (x == 0) { return gridDim.x; } + if (x == 1) { return gridDim.y; } + return gridDim.z; +} +__device__ int get_global_id(const int x) { if (x == 0) { return blockIdx.x*blockDim.x + threadIdx.x; } - if (y == 0) { return blockIdx.y*blockDim.y + threadIdx.y; } + if (x == 1) { return blockIdx.y*blockDim.y + threadIdx.y; } return blockIdx.z*blockDim.z + threadIdx.z; } @@ -52,6 +48,27 @@ typedef struct { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float s10; float s11; float s12; float s13; float s14; float s15; } float16; +typedef struct { double s0; double s1; double s2; double s3; + double s4; double s5; double s6; double s7; } double8; +typedef struct { double s0; double s1; double s2; double s3; + double s4; double s5; double s6; double s7; + double s8; double s9; double s10; double s11; + double s12; double s13; double s14; double s15; } double16; + +// Replaces the OpenCL keywords with CUDA equivalent +#define __kernel __placeholder__ +#define __global +#define __placeholder__ extern "C" __global__ +#define __local __shared__ +#define restrict __restrict__ +#define __constant const +#define inline __device__ // assumes all device functions are annotated with inline in OpenCL + +// Kernel attributes (don't replace currently) +#define reqd_work_group_size(x, y, z) + +// Replaces OpenCL synchronisation with CUDA synchronisation +#define barrier(x) __syncthreads() // ================================================================================================= -- cgit v1.2.3