summaryrefslogtreecommitdiff
path: root/src/kernels/level3/xgemm_direct_batched.opencl
diff options
context:
space:
mode:
Diffstat (limited to 'src/kernels/level3/xgemm_direct_batched.opencl')
-rw-r--r--src/kernels/level3/xgemm_direct_batched.opencl48
1 files changed, 40 insertions, 8 deletions
diff --git a/src/kernels/level3/xgemm_direct_batched.opencl b/src/kernels/level3/xgemm_direct_batched.opencl
index d15ed31e..ec0b008b 100644
--- a/src/kernels/level3/xgemm_direct_batched.opencl
+++ b/src/kernels/level3/xgemm_direct_batched.opencl
@@ -20,7 +20,11 @@ R"(
#if defined(ROUTINE_GEMMBATCHED)
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#elif
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -41,7 +45,11 @@ void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#elif
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -62,7 +70,11 @@ void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#elif
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -83,7 +95,11 @@ void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#elif
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -108,7 +124,11 @@ void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
#if defined(ROUTINE_GEMMSTRIDEDBATCHED)
// Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#elif
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -127,7 +147,11 @@ void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int k
}
// Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#elif
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -146,7 +170,11 @@ void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int k
}
// Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#elif
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -165,7 +193,11 @@ void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int k
}
// Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#elif
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,