summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-10-14 10:49:25 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2017-10-14 10:49:25 +0200
commit2d7b648a243a97d18899677a51c9e441d6edf508 (patch)
tree5e440ccdb1985d619e878a04a44826068b0147e0 /src
parentcc5b4754250b3c03b9b0f8d72f32d1eacac15b18 (diff)
Added OpenCL to CUDA translation header for the kernels
Diffstat (limited to 'src')
-rw-r--r--src/kernels/opencl_to_cuda.h51
-rw-r--r--src/routine.cpp7
2 files changed, 58 insertions, 0 deletions
diff --git a/src/kernels/opencl_to_cuda.h b/src/kernels/opencl_to_cuda.h
new file mode 100644
index 00000000..43a26a2f
--- /dev/null
+++ b/src/kernels/opencl_to_cuda.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains an (incomplete) header to interpret OpenCL kernels as CUDA kernels.
+//
+// =================================================================================================
+
+// Replaces the OpenCL keywords with CUDA equivalent
+#define __kernel __placeholder__
+#define __global
+#define __placeholder__ extern "C" __global__
+#define __local __shared__
+#define restrict __restrict__
+#define __constant const
+#define inline __device__ inline // assumes all device functions are annotated with inline in OpenCL
+
+// Replaces OpenCL synchronisation with CUDA synchronisation
+#define barrier(x) __syncthreads()
+
+// Replaces the OpenCL get_xxx_ID with CUDA equivalents
+__device__ int get_local_id(int x) {
+ if (x == 0) { return threadIdx.x; }
+ if (x == 1) { return threadIdx.y; }
+ return threadIdx.z;
+}
+__device__ int get_group_id(int x) {
+ if (x == 0) { return blockIdx.x; }
+ if (x == 1) { return blockIdx.y;}
+ return blockIdx.z;
+}
+__device__ int get_global_id(int x) {
+ if (x == 0) { return blockIdx.x*blockDim.x + threadIdx.x; }
+ if (y == 0) { return blockIdx.y*blockDim.y + threadIdx.y; }
+ return blockIdx.z*blockDim.z + threadIdx.z;
+}
+
+// Adds the data-types which are not available natively under CUDA
+typedef struct { float s0; float s1; float s2; float s3;
+ float s4; float s5; float s6; float s7; } float8;
+typedef struct { float s0; float s1; float s2; float s3;
+ float s4; float s5; float s6; float s7;
+ float s8; float s9; float s10; float s11;
+ float s12; float s13; float s14; float s15; } float16;
+
+// =================================================================================================
diff --git a/src/routine.cpp b/src/routine.cpp
index aaa85fde..0f9fe360 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -167,6 +167,13 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
source_string += "#define GLOBAL_MEM_FENCE 1\n";
}
+ // Optionally adds a translation header from OpenCL kernels to CUDA kernels
+ #ifdef CUDA_API
+ source_string +=
+ #include "kernels/opencl_to_cuda.h"
+ ;
+ #endif
+
// Loads the common header (typedefs and defines and such)
source_string +=
#include "kernels/common.opencl"