summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author CNugteren <web@cedricnugteren.nl> 2016-02-06 12:09:21 +0100
committer CNugteren <web@cedricnugteren.nl> 2016-02-06 12:09:21 +0100
commit 40346bb3a551f14afa5465d7708d8d31102e475e (patch)
tree 3a2e1f336d2cbeef4c122eb3a7cf6e44e52e4545 /src
parent fbf071ba6299e053f4cf4011168d80bf877f3a07 (diff)
Reduced unrolling factor in xgemv kernel to reduce compilation times
Diffstat (limited to 'src')
-rw-r--r-- src/kernels/level2/xgemv.opencl 33
-rw-r--r-- src/tuning/xgemv.cc 4
2 files changed, 22 insertions, 15 deletions
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 8ed0e9e4..908d7d13 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -27,6 +27,9 @@ R"(
#ifndef WPT1
#define WPT1 1 // The amount of work-per-thread
#endif
+#ifndef UNROLL1
+ #define UNROLL1 32 // Unroll factor (must be a divider of WGS1)
+#endif
// 2: For the fast version
#ifndef WGS2
@@ -301,28 +304,31 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
barrier(CLK_LOCAL_MEM_FENCE);
// Loops over the work per thread, and checks whether in bounds
- #pragma unroll
for (int w=0; w<WPT1; ++w) {
const int gid = w*get_global_size(0) + get_global_id(0);
if (gid < m) {
// The multiply-add function for the main part (divisable by WGS1)
if (a_rotated == 0) { // Not rotated
- #pragma unroll
- for (int kloop=0; kloop<WGS1; ++kloop) {
- const int k = kwg + kloop;
- real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
- if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
- MultiplyAdd(acc[w], xlm[kloop], value);
+ for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
+ #pragma unroll
+ for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
+ const int k = kwg + kloop + kunroll;
+ real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
+ if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+ MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
+ }
}
}
else { // Transposed
- #pragma unroll
- for (int kloop=0; kloop<WGS1; ++kloop) {
- const int k = kwg + kloop;
- real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
- if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
- MultiplyAdd(acc[w], xlm[kloop], value);
+ for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
+ #pragma unroll
+ for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
+ const int k = kwg + kloop + kunroll;
+ real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
+ if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+ MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
+ }
}
}
}
@@ -563,3 +569,4 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
)"
// =================================================================================================
+
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 60c73d76..c3cf9b7f 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -60,8 +60,8 @@ class TuneXgemv {
// Sets the tuning parameters and their possible values
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
- tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256, 512, 1024, 1536, 2048});
- tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8});
+ tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256, 512});
+ tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
}