From 313fc796b2a3063cab7b5847864a524efb69aee4 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 14 Oct 2017 16:01:12 +0200
Subject: Fixed several (not all) CUDA kernel compilation issues

---
 src/kernels/common.opencl    | 27 +++++++++++++-----------
 src/kernels/opencl_to_cuda.h | 49 +++++++++++++++++++++++++++++---------------
 2 files changed, 48 insertions(+), 28 deletions(-)

(limited to 'src/kernels')

diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 9481881e..a34877d9 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -23,15 +23,18 @@ R"(
 #endif
 
 // =================================================================================================
+#ifndef CUDA
 
-// Enable support for double-precision
-#if PRECISION == 16
-  #pragma OPENCL EXTENSION cl_khr_fp16: enable
-#endif
+  // Enable support for double-precision
+  #if PRECISION == 16
+    #pragma OPENCL EXTENSION cl_khr_fp16: enable
+  #endif
+
+  // Enable support for double-precision
+  #if PRECISION == 64 || PRECISION == 6464
+    #pragma OPENCL EXTENSION cl_khr_fp64: enable
+  #endif
 
-// Enable support for double-precision
-#if PRECISION == 64 || PRECISION == 6464
-   #pragma OPENCL EXTENSION cl_khr_fp64: enable
 #endif
 
 // Half-precision
@@ -254,18 +257,18 @@ R"(
 // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
 // More details: https://github.com/CNugteren/CLBlast/issues/53
 #if USE_STAGGERED_INDICES == 1
-  INLINE_FUNC size_t GetGroupIDFlat() {
+  INLINE_FUNC int GetGroupIDFlat() {
     return get_group_id(0) + get_num_groups(0) * get_group_id(1);
   }
-  INLINE_FUNC size_t GetGroupID1() {
+  INLINE_FUNC int GetGroupID1() {
     return (GetGroupIDFlat()) % get_num_groups(1);
   }
-  INLINE_FUNC size_t GetGroupID0() {
+  INLINE_FUNC int GetGroupID0() {
     return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0);
   }
 #else
-  INLINE_FUNC size_t GetGroupID1() { return get_group_id(1); }
-  INLINE_FUNC size_t GetGroupID0() { return get_group_id(0); }
+  INLINE_FUNC int GetGroupID1() { return get_group_id(1); }
+  INLINE_FUNC int GetGroupID0() { return get_group_id(0); }
 #endif
 
 // =================================================================================================
diff --git a/src/kernels/opencl_to_cuda.h b/src/kernels/opencl_to_cuda.h
index 2e46bc2b..94a1549e 100644
--- a/src/kernels/opencl_to_cuda.h
+++ b/src/kernels/opencl_to_cuda.h
@@ -16,32 +16,28 @@
 R"(
 // =================================================================================================
 
-// Replaces the OpenCL keywords with CUDA equivalent
-#define __kernel __placeholder__
-#define __global
-#define __placeholder__ extern "C" __global__
-#define __local __shared__
-#define restrict __restrict__
-#define __constant const
-#define inline __device__ inline // assumes all device functions are annotated with inline in OpenCL
-
-// Replaces OpenCL synchronisation with CUDA synchronisation
-#define barrier(x) __syncthreads()
+// CLBlast specific additions
+#define CUDA 1
 
 // Replaces the OpenCL get_xxx_ID with CUDA equivalents
-__device__ int get_local_id(int x) {
+__device__ int get_local_id(const int x) {
   if (x == 0) { return threadIdx.x; }
   if (x == 1) { return threadIdx.y; }
   return threadIdx.z;
 }
-__device__ int get_group_id(int x) {
+__device__ int get_group_id(const int x) {
   if (x == 0) { return blockIdx.x; }
-  if (x == 1) { return blockIdx.y;}
+  if (x == 1) { return blockIdx.y; }
   return blockIdx.z;
 }
-__device__ int get_global_id(int x) {
+__device__ int get_global_size(const int x) {
+  if (x == 0) { return gridDim.x; }
+  if (x == 1) { return gridDim.y; }
+  return gridDim.z;
+}
+__device__ int get_global_id(const int x) {
   if (x == 0) { return blockIdx.x*blockDim.x + threadIdx.x; }
-  if (y == 0) { return blockIdx.y*blockDim.y + threadIdx.y; }
+  if (x == 1) { return blockIdx.y*blockDim.y + threadIdx.y; }
   return blockIdx.z*blockDim.z + threadIdx.z;
 }
 
@@ -52,6 +48,27 @@ typedef struct { float s0; float s1; float s2; float s3;
                  float s4; float s5; float s6; float s7;
                  float s8; float s9; float s10; float s11;
                  float s12; float s13; float s14; float s15; } float16;
+typedef struct { double s0; double s1; double s2; double s3;
+                 double s4; double s5; double s6; double s7; } double8;
+typedef struct { double s0; double s1; double s2; double s3;
+                 double s4; double s5; double s6; double s7;
+                 double s8; double s9; double s10; double s11;
+                 double s12; double s13; double s14; double s15; } double16;
+
+// Replaces the OpenCL keywords with CUDA equivalent
+#define __kernel __placeholder__
+#define __global
+#define __placeholder__ extern "C" __global__
+#define __local __shared__
+#define restrict __restrict__
+#define __constant const
+#define inline __device__ // assumes all device functions are annotated with inline in OpenCL
+
+// Kernel attributes (don't replace currently)
+#define reqd_work_group_size(x, y, z)
+
+// Replaces OpenCL synchronisation with CUDA synchronisation
+#define barrier(x) __syncthreads()
 
 // =================================================================================================
 
-- 
cgit v1.2.3