diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-28 17:32:37 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-28 17:32:37 +0200 |
commit | 12b08ae49154379f7471a40809ace6418857b387 (patch) | |
tree | ef958197db0bb8a67c9a5840f828b3f6c72bd8fc /src/kernels/common.opencl | |
parent | 2949e156f5bfdd724987e67477da3e3608e4aaf9 (diff) | |
parent | fa6e5e67f585b77d34c3031c176de9a0f7904aa9 (diff) |
Merge branch 'master' into android_support
Diffstat (limited to 'src/kernels/common.opencl')
-rw-r--r-- | src/kernels/common.opencl | 33 |
1 files changed, 20 insertions, 13 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 9481881e..01c411bc 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -24,14 +24,16 @@ R"( // ================================================================================================= -// Enable support for double-precision -#if PRECISION == 16 - #pragma OPENCL EXTENSION cl_khr_fp16: enable -#endif +#ifndef CUDA + // Enable support for double-precision + #if PRECISION == 16 + #pragma OPENCL EXTENSION cl_khr_fp16: enable + #endif -// Enable support for double-precision -#if PRECISION == 64 || PRECISION == 6464 - #pragma OPENCL EXTENSION cl_khr_fp64: enable + // Enable support for double-precision + #if PRECISION == 64 || PRECISION == 6464 + #pragma OPENCL EXTENSION cl_khr_fp64: enable + #endif #endif // Half-precision @@ -117,10 +119,15 @@ R"( #define GetRealArg(x) x #endif +// Pointers to local memory objects (using a define because CUDA doesn't need them) +#ifndef LOCAL_PTR + #define LOCAL_PTR __local +#endif + // ================================================================================================= // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific -// devices, this is enabled (see src/routine.cc). +// devices, this is enabled (see src/routine.cpp). #ifndef USE_CL_MAD #define USE_CL_MAD 0 #endif @@ -254,18 +261,18 @@ R"( // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf // More details: https://github.com/CNugteren/CLBlast/issues/53 #if USE_STAGGERED_INDICES == 1 - INLINE_FUNC size_t GetGroupIDFlat() { + INLINE_FUNC int GetGroupIDFlat() { return get_group_id(0) + get_num_groups(0) * get_group_id(1); } - INLINE_FUNC size_t GetGroupID1() { + INLINE_FUNC int GetGroupID1() { return (GetGroupIDFlat()) % get_num_groups(1); } - INLINE_FUNC size_t GetGroupID0() { + INLINE_FUNC int GetGroupID0() { return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0); } #else - INLINE_FUNC size_t GetGroupID1() { return get_group_id(1); } - INLINE_FUNC size_t GetGroupID0() { return get_group_id(0); } + INLINE_FUNC int GetGroupID1() { return get_group_id(1); } + INLINE_FUNC int GetGroupID0() { return get_group_id(0); } #endif // ================================================================================================= |