diff options
Diffstat (limited to 'src/kernels')
-rw-r--r-- | src/kernels/common.opencl | 7 | ||||
-rw-r--r-- | src/kernels/level2/level2.opencl | 102 | ||||
-rw-r--r-- | src/kernels/level2/xger.opencl | 106 | ||||
-rw-r--r-- | src/kernels/level2/xher.opencl | 73 |
4 files changed, 288 insertions, 0 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index f2a2e7a7..973c123e 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -147,6 +147,13 @@ R"( #define AXPBY(e, a, b, c, d) e = a*b + c*d #endif +// The scalar GER function +#if PRECISION == 3232 || PRECISION == 6464 + #define GER(e, a, b, c, d) real ab; ab.x = MulReal(a,b); ab.y = MulImag(a,b); e.x = MulReal(ab,c) + d.x; e.y = MulImag(ab,c) + d.y +#else + #define GER(e, a, b, c, d) e = a*b*c + d +#endif + // The complex conjugate operation for complex transforms #if PRECISION == 3232 || PRECISION == 6464 #define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y diff --git a/src/kernels/level2/level2.opencl b/src/kernels/level2/level2.opencl new file mode 100644 index 00000000..ad92595a --- /dev/null +++ b/src/kernels/level2/level2.opencl @@ -0,0 +1,102 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains common functions for matrix update kernels (Xger, Xher). +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. + +#ifndef WGS1 + #define WGS1 8 // The local work-group size in first dimension +#endif +#ifndef WGS2 + #define WGS2 8 // The local work-group size in second dimension +#endif +#ifndef WPT + #define WPT 1 // The amount of work-per-thread in both dimensions +#endif + +// ================================================================================================= + +// Returns an element from a vector +inline real LoadVector(const int id, const int max, + __global real* gm, const int offset, const int inc, + const int do_conjugate) { + if (id < max) { + real result = gm[id*inc + offset]; + if (do_conjugate) { + #if defined(ROUTINE_GERC) + COMPLEX_CONJUGATE(result); + #endif + #if defined(ROUTINE_HER) || defined(ROUTINE_HPR) + COMPLEX_CONJUGATE(result); + #endif + } + return result; + } + else { + real default_result; + SetToZero(default_result); + return default_result; + } +} + +// Performs the rank-1 matrix update +inline void MatrixUpdate(const int id1, const int id2, const int max1, const int max2, + __global real* agm, const int a_offset, const int a_ld, + const real alpha, const real xvalue, const real yvalue, + const int is_upper) { + + // Bounds of a regular matrix + if (id1 < max1 && id2 < max2) { + + #if defined(ROUTINE_SPR) || defined(ROUTINE_HPR) + int a_index; + if (is_upper) { + a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2; + } + else { + a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2; + } + a_index += a_offset; + #else + const int a_index = id2*a_ld + id1 + a_offset; + #endif + + // Loads the current value of the A matrix + const real avalue = agm[a_index]; + + // Computes result = alpha * x[i] * y[j] + a[i][j] + real result; + GER(result, alpha, xvalue, yvalue, avalue); + + // For hermetian matrices + #if defined(ROUTINE_HER) || defined(ROUTINE_HPR) + if (id1 == id2) { result.y = ZERO; } + #endif + + // Stores the final result + agm[a_index] = result; + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl new file mode 100644 index 00000000..d377fbb0 --- /dev/null +++ b/src/kernels/level2/xger.opencl @@ -0,0 +1,106 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the Xger kernels for rank-1 matrix update. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC) +__attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +__kernel void Xger(const int max1, const int max2, const real alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + const __global real* ygm, const int y_offset, const int y_inc, + __global real* restrict agm, const int a_offset, const int a_ld, + const int is_rowmajor) { + + // Register storage for X and Y + real xvalues[WPT]; + real yvalues[WPT]; + + // Row-major version + if (is_rowmajor) { + + // Loads the X-vector + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int id2 = w*get_global_size(1) + get_global_id(1); + xvalues[w] = LoadVector(id2, max2, xgm, x_offset, x_inc, false); + } + + // Loads the Y-vector + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int id1 = w*get_global_size(0) + get_global_id(0); + yvalues[w] = LoadVector(id1, max1, ygm, y_offset, y_inc, true); + } + + // Loops over the work per thread twice + #pragma unroll + for (int w1=0; w1<WPT; ++w1) { + #pragma unroll + for (int w2=0; w2<WPT; ++w2) { + + // Global thread IDs + const int id1 = w1*get_global_size(0) + get_global_id(0); + const int id2 = w2*get_global_size(1) + get_global_id(1); + + // Loads A, performs the operation, and stores the result into A + MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld, + alpha, xvalues[w2], yvalues[w1], false); + } + } + } + + // Col-major version + else { + + // Loads the X-vector + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int id1 = w*get_global_size(0) + get_global_id(0); + xvalues[w] = LoadVector(id1, max1, xgm, x_offset, x_inc, false); + } + + // Loads the Y-vector + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int id2 = w*get_global_size(1) + get_global_id(1); + yvalues[w] = LoadVector(id2, max2, ygm, y_offset, y_inc, true); + } + + // Loops over the work per thread twice + #pragma unroll + for (int w1=0; w1<WPT; ++w1) { + #pragma unroll + for (int w2=0; w2<WPT; ++w2) { + + // Global thread IDs + const int id1 = w1*get_global_size(0) + get_global_id(0); + const int id2 = w2*get_global_size(1) + get_global_id(1); + + // Loads A, performs the operation, and stores the result into A + MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld, + alpha, xvalues[w1], yvalues[w2], false); + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl new file mode 100644 index 00000000..edb94ca8 --- /dev/null +++ b/src/kernels/level2/xher.opencl @@ -0,0 +1,73 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the Xher kernels for rank-1 matrix update. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR) +__attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +__kernel void Xher(const int n, const real alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* restrict agm, const int a_offset, const int a_ld, + const int is_upper, const int is_rowmajor) { + + // Register storage for X and XT + real xvalues[WPT]; + real xtvalues[WPT]; + + // Loads the X-vector + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int id2 = w*get_global_size(1) + get_global_id(1); + xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor); + } + + // Loads the X-transposed-vector + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int id1 = w*get_global_size(0) + get_global_id(0); + xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor); + } + + // Loops over the work per thread twice + #pragma unroll + for (int w1=0; w1<WPT; ++w1) { + #pragma unroll + for (int w2=0; w2<WPT; ++w2) { + + // Global thread IDs + const int id1 = w1*get_global_size(0) + get_global_id(0); + const int id2 = w2*get_global_size(1) + get_global_id(1); + + // Skip these threads if they do not contain threads contributing to the matrix-triangle + if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) { + // Do nothing + } + + // Loads A, performs the operation, and stores the result into A + else { + MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[w2], xtvalues[w1], is_upper); + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= |