diff options
-rw-r--r-- | src/kernels/level1/xamax.opencl | 2 | ||||
-rw-r--r-- | src/kernels/level1/xasum.opencl | 2 | ||||
-rw-r--r-- | src/kernels/level1/xcopy.opencl | 5 | ||||
-rw-r--r-- | src/kernels/level1/xnrm2.opencl | 2 | ||||
-rw-r--r-- | src/kernels/level1/xscal.opencl | 5 | ||||
-rw-r--r-- | src/kernels/level1/xswap.opencl | 5 | ||||
-rw-r--r-- | src/kernels/level2/xher.opencl | 24 | ||||
-rw-r--r-- | src/kernels/level2/xher2.opencl | 40 | ||||
-rw-r--r-- | src/kernels/level3/convert_hermitian.opencl | 20 | ||||
-rw-r--r-- | src/kernels/level3/convert_symmetric.opencl | 16 | ||||
-rw-r--r-- | src/kernels/level3/convert_triangular.opencl | 16 | ||||
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks.opencl | 44 | ||||
-rw-r--r-- | src/kernels/levelx/im2col.opencl | 2 | ||||
-rw-r--r-- | test/correctness/misc/preprocessor.cpp | 9 |
14 files changed, 97 insertions, 95 deletions
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 2bd2f714..27add015 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -75,7 +75,6 @@ void Xamax(const int n, barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory - #pragma unroll for (int s=WGS1/2; s>0; s=s>>1) { if (lid < s) { if (maxlm[lid + s] >= maxlm[lid]) { @@ -117,7 +116,6 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm, barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory - #pragma unroll for (int s=WGS2/2; s>0; s=s>>1) { if (lid < s) { if (maxlm[lid + s] >= maxlm[lid]) { diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl index 1fc91be8..29e7fa3e 100644 --- a/src/kernels/level1/xasum.opencl +++ b/src/kernels/level1/xasum.opencl @@ -56,7 +56,6 @@ void Xasum(const int n, barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory - #pragma unroll for (int s=WGS1/2; s>0; s=s>>1) { if (lid < s) { Add(lm[lid], lm[lid], lm[lid + s]); @@ -85,7 +84,6 @@ void XasumEpilogue(const __global real* restrict input, barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory - #pragma unroll for (int s=WGS2/2; s>0; s=s>>1) { if (lid < s) { Add(lm[lid], lm[lid], lm[lid + s]); diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl index 228e0735..aed80fc2 100644 --- a/src/kernels/level1/xcopy.opencl +++ b/src/kernels/level1/xcopy.opencl @@ -28,7 +28,6 @@ void Xcopy(const int n, __global real* ygm, const int y_offset, const int y_inc) { // Loops over the work that needs to be done (allows for an arbitrary number of threads) - #pragma unroll for (int id = get_global_id(0); id<n; id += get_global_size(0)) { ygm[id*y_inc + y_offset] = xgm[id*x_inc + x_offset]; } @@ -43,8 +42,8 @@ void XcopyFast(const int n, const __global realV* restrict xgm, __global realV* ygm) { #pragma unroll - for (int w=0; w<WPT; ++w) { - const int id = w*get_global_size(0) + get_global_id(0); + for (int _w = 0; _w < WPT; _w += 1) { + const int id = _w*get_global_size(0) + get_global_id(0); ygm[id] = xgm[id]; } } diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl index f6d869cb..6a81c150 100644 --- a/src/kernels/level1/xnrm2.opencl +++ b/src/kernels/level1/xnrm2.opencl @@ -54,7 +54,6 @@ void Xnrm2(const int n, barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory - #pragma unroll for (int s=WGS1/2; s>0; s=s>>1) { if (lid < s) { Add(lm[lid], lm[lid], lm[lid + s]); @@ -83,7 +82,6 @@ void Xnrm2Epilogue(const __global real* restrict input, barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory - #pragma unroll for (int s=WGS2/2; s>0; s=s>>1) { if (lid < s) { Add(lm[lid], lm[lid], lm[lid + s]); diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl index 3da9c2fd..cb133e88 100644 --- a/src/kernels/level1/xscal.opencl +++ b/src/kernels/level1/xscal.opencl @@ -28,7 +28,6 @@ void Xscal(const int n, const real_arg arg_alpha, const real alpha = GetRealArg(arg_alpha); // Loops over the work that needs to be done (allows for an arbitrary number of threads) - #pragma unroll for (int id = get_global_id(0); id<n; id += get_global_size(0)) { real xvalue = xgm[id*x_inc + x_offset]; real result; @@ -47,8 +46,8 @@ void XscalFast(const int n, const real_arg arg_alpha, const real alpha = GetRealArg(arg_alpha); #pragma unroll - for (int w=0; w<WPT; ++w) { - const int id = w*get_global_size(0) + get_global_id(0); + for (int _w = 0; _w < WPT; _w += 1) { + const int id = _w*get_global_size(0) + get_global_id(0); realV xvalue = xgm[id]; realV result; result = MultiplyVector(result, alpha, xvalue); diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl index 267271c0..bf5b6194 100644 --- a/src/kernels/level1/xswap.opencl +++ b/src/kernels/level1/xswap.opencl @@ -28,7 +28,6 @@ void Xswap(const int n, __global real* ygm, const int y_offset, const int y_inc) { // Loops over the work that needs to be done (allows for an arbitrary number of threads) - #pragma unroll for (int id = get_global_id(0); id<n; id += get_global_size(0)) { real temp = xgm[id*x_inc + x_offset]; xgm[id*x_inc + x_offset] = ygm[id*y_inc + y_offset]; @@ -45,8 +44,8 @@ void XswapFast(const int n, __global realV* xgm, __global realV* ygm) { #pragma unroll - for (int w=0; w<WPT; ++w) { - const int id = w*get_global_size(0) + get_global_id(0); + for (int _w = 0; _w < WPT; _w += 1) { + const int id = _w*get_global_size(0) + get_global_id(0); realV temp = xgm[id]; xgm[id] = ygm[id]; ygm[id] = temp; diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl index b0772218..8a57bdfc 100644 --- a/src/kernels/level2/xher.opencl +++ b/src/kernels/level2/xher.opencl @@ -27,32 +27,34 @@ void Xher(const int n, const real alpha = GetRealArg(arg_alpha); // Register storage for X and XT + #pragma promote_to_registers real xvalues[WPT]; + #pragma promote_to_registers real xtvalues[WPT]; // Loads the X-vector #pragma unroll - for (int w=0; w<WPT; ++w) { - const int id2 = w*get_global_size(1) + get_global_id(1); - xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor); + for (int _w = 0; _w < WPT; _w += 1) { + const int id2 = _w*get_global_size(1) + get_global_id(1); + xvalues[_w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor); } // Loads the X-transposed-vector #pragma unroll - for (int w=0; w<WPT; ++w) { - const int id1 = w*get_global_size(0) + get_global_id(0); - xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor); + for (int _w = 0; _w < WPT; _w += 1) { + const int id1 = _w*get_global_size(0) + get_global_id(0); + xtvalues[_w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor); } // Loops over the work per thread twice #pragma unroll - for (int w1=0; w1<WPT; ++w1) { + for (int _w1 = 0; _w1 < WPT; _w1 += 1) { #pragma unroll - for (int w2=0; w2<WPT; ++w2) { + for (int _w2 = 0; _w2 < WPT; _w2 += 1) { // Global thread IDs - const int id1 = w1*get_global_size(0) + get_global_id(0); - const int id2 = w2*get_global_size(1) + get_global_id(1); + const int id1 = _w1*get_global_size(0) + get_global_id(0); + const int id2 = _w2*get_global_size(1) + get_global_id(1); // Skip these threads if they do not contain threads contributing to the matrix-triangle if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) { @@ -61,7 +63,7 @@ void Xher(const int n, // Loads A, performs the operation, and stores the result into A else { - MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[w2], xtvalues[w1], is_upper); + MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[_w2], xtvalues[_w1], is_upper); } } } diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl index 00a756c9..73305149 100644 --- a/src/kernels/level2/xher2.opencl +++ b/src/kernels/level2/xher2.opencl @@ -28,37 +28,41 @@ void Xher2(const int n, const real alpha = GetRealArg(arg_alpha); // Register storage for X and Y + #pragma promote_to_registers real xvalues[WPT]; + #pragma promote_to_registers real yvalues[WPT]; + #pragma promote_to_registers real xtvalues[WPT]; + #pragma promote_to_registers real ytvalues[WPT]; // Loads the X-vector #pragma unroll - for (int w=0; w<WPT; ++w) { - const int id2 = w*get_global_size(1) + get_global_id(1); - xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor); + for (int _w = 0; _w < WPT; _w += 1) { + const int id2 = _w*get_global_size(1) + get_global_id(1); + xvalues[_w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor); } // Loads the X-transposed-vector #pragma unroll - for (int w=0; w<WPT; ++w) { - const int id1 = w*get_global_size(0) + get_global_id(0); - xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor); + for (int _w = 0; _w < WPT; _w += 1) { + const int id1 = _w*get_global_size(0) + get_global_id(0); + xtvalues[_w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor); } // Loads the Y-vector #pragma unroll - for (int w=0; w<WPT; ++w) { - const int id1 = w*get_global_size(0) + get_global_id(0); - yvalues[w] = LoadVector(id1, n, ygm, y_offset, y_inc, is_rowmajor); + for (int _w = 0; _w < WPT; _w += 1) { + const int id1 = _w*get_global_size(0) + get_global_id(0); + yvalues[_w] = LoadVector(id1, n, ygm, y_offset, y_inc, is_rowmajor); } // Loads the Y-transposed-vector #pragma unroll - for (int w=0; w<WPT; ++w) { - const int id2 = w*get_global_size(1) + get_global_id(1); - ytvalues[w] = LoadVector(id2, n, ygm, y_offset, y_inc, !is_rowmajor); + for (int _w = 0; _w < WPT; _w += 1) { + const int id2 = _w*get_global_size(1) + get_global_id(1); + ytvalues[_w] = LoadVector(id2, n, ygm, y_offset, y_inc, !is_rowmajor); } // Sets the proper value of alpha in case conjugation is needed @@ -75,13 +79,13 @@ void Xher2(const int n, // Loops over the work per thread twice #pragma unroll - for (int w1=0; w1<WPT; ++w1) { + for (int _w1 = 0; _w1 < WPT; _w1 += 1) { #pragma unroll - for (int w2=0; w2<WPT; ++w2) { + for (int _w2 = 0; _w2 < WPT; _w2 += 1) { // Global thread IDs - const int id1 = w1*get_global_size(0) + get_global_id(0); - const int id2 = w2*get_global_size(1) + get_global_id(1); + const int id1 = _w1*get_global_size(0) + get_global_id(0); + const int id2 = _w2*get_global_size(1) + get_global_id(1); // Skip these threads if they do not contain threads contributing to the matrix-triangle if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) { @@ -91,8 +95,8 @@ void Xher2(const int n, // Loads A, performs the operation, and stores the result into A else { MatrixUpdate2(id1, id2, n, n, agm, a_offset, a_ld, - alpha1, xvalues[w2], yvalues[w1], - alpha2, xtvalues[w1], ytvalues[w2], is_upper); + alpha1, xvalues[_w2], yvalues[_w1], + alpha2, xtvalues[_w1], ytvalues[_w2], is_upper); } } } diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl index ed2ded98..0e89b78b 100644 --- a/src/kernels/level3/convert_hermitian.opencl +++ b/src/kernels/level3/convert_hermitian.opencl @@ -16,7 +16,8 @@ R"( // ================================================================================================= -#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464) +#if defined(ROUTINE_HEMM) +#if PRECISION == 3232 || PRECISION == 6464 // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. @@ -30,11 +31,11 @@ void HermLowerToSquared(const int src_dim, // Loops over the work per thread in both dimensions #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { + const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { + const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the lower-hermitian matrix @@ -69,11 +70,11 @@ void HermUpperToSquared(const int src_dim, // Loops over the work per thread in both dimensions #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { + const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { + const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the upper-hermitian matrix @@ -98,6 +99,7 @@ void HermUpperToSquared(const int src_dim, } #endif +#endif // ================================================================================================= // End of the C++11 raw string literal diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl index 8ae53b37..83ecdd65 100644 --- a/src/kernels/level3/convert_symmetric.opencl +++ b/src/kernels/level3/convert_symmetric.opencl @@ -30,11 +30,11 @@ void SymmLowerToSquared(const int src_dim, // Loops over the work per thread in both dimensions #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { + const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { + const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the lower-symmetric matrix @@ -63,11 +63,11 @@ void SymmUpperToSquared(const int src_dim, // Loops over the work per thread in both dimensions #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { + const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { + const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the upper-symmetric matrix diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl index f848dcc1..a9d5e769 100644 --- a/src/kernels/level3/convert_triangular.opencl +++ b/src/kernels/level3/convert_triangular.opencl @@ -31,11 +31,11 @@ void TriaLowerToSquared(const int src_dim, // Loops over the work per thread in both dimensions #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { + const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { + const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the lower-triangular matrix @@ -66,11 +66,11 @@ void TriaUpperToSquared(const int src_dim, // Loops over the work per thread in both dimensions #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { + const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { + const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the upper-triangular matrix diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl index 281fdcff..db1513c1 100644 --- a/src/kernels/level3/invert_diagonal_blocks.opencl +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -82,14 +82,14 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // Loads the source lower triangle into local memory. Any values in the upper triangle or // outside of the matrix are set to zero #pragma unroll - for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { - const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) : - (thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n); + for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) { + const bool condition = (is_upper) ? (thread_index <= _j && block_index*INTERNAL_BLOCK_SIZE + _j < n) : + (thread_index >= _j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n); if (condition) { - lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset]; + lm[thread_index][_j] = src[_j*src_ld + thread_index + src_block_offset]; } else { - SetToZero(lm[thread_index][j]); + SetToZero(lm[thread_index][_j]); } } barrier(CLK_LOCAL_MEM_FENCE); @@ -116,7 +116,6 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src real sum; if (thread_index < j) { SetToZero(sum); - #pragma unroll for (int k = 0; k < j; ++k) { MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); } @@ -139,7 +138,6 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src real sum; if (thread_index > j) { SetToZero(sum); - #pragma unroll for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) { MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); } @@ -156,7 +154,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // Writes the result to global memory #pragma unroll - for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) { dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j]; } } @@ -188,19 +186,17 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, // Initializes the result registers real cpm[16]; #pragma unroll - for (int j = 0; j < 16; ++j) { - SetToZero(cpm[j]); + for (int _j = 0; _j < 16; _j += 1) { + SetToZero(cpm[_j]); } // Computes NT x 16 block of C, each thread computes one 1 x 16 row for (int k = 0; k < current_size; k += 16) { // Loads a 16 x 16 block of B into local memory using NX x 4 threads - #pragma unroll - for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0) - #pragma unroll - for( int j=0; j < 16; j += 4 ) { // += get_local_size(1) - blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb]; + for (int i = 0; i < 16; i += (size/4) ) { // += get_local_size(0) + for (int _j = 0; _j < 16; _j += 4 ) { // += get_local_size(1) + blm[(lidx + i) * LOCALX + (lidy + _j)] = bgm[k + i + _j*ldb]; } } barrier(CLK_LOCAL_MEM_FENCE); @@ -210,11 +206,11 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, // Performs 16 x 16 multiply-add operations #pragma unroll - for (int i = 0; i < 16; ++i) { + for (int _i = 0; _i < 16; _i += 1) { if (part == 2 || col++ < n) { #pragma unroll - for (int j = 0; j < 16; ++j) { - MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]); + for (int _j = 0; _j < 16; _j += 1) { + MultiplyAdd(cpm[_j], agm[(_i + k) * lda], blm[_i * LOCALX + _j]); } } } @@ -226,10 +222,10 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, // Performs 16 x 16 multiply-add operations #pragma unroll - for (int i = 0; i < 16; ++i) { + for (int _i = 0; _i < 16; _i += 1) { #pragma unroll - for (int j = 0; j < 16; ++j) { - MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]); + for (int _j = 0; _j < 16; _j += 1) { + MultiplyAdd(cpm[_j], agm[(_i + k) * lda], blm[_i * LOCALX + _j]); } } } @@ -240,9 +236,9 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, // Stores NT x 16 results: each thread writes one 16 x 1 row #pragma unroll - for (int i = 0; i < 16; ++i) { - if (part == 2) { Negate(cpm[i]); } - cgm[0] = cpm[i]; + for (int _i = 0; _i < 16; _i += 1) { + if (part == 2) { Negate(cpm[_i]); } + cgm[0] = cpm[_i]; cgm += ldc; } } diff --git a/src/kernels/levelx/im2col.opencl b/src/kernels/levelx/im2col.opencl index e7f69420..301e076b 100644 --- a/src/kernels/levelx/im2col.opencl +++ b/src/kernels/levelx/im2col.opencl @@ -41,9 +41,7 @@ void im2col(const int input_h, const int input_w, const int channels, const int c_id = ((int)get_global_id(1)) / output_h; // input channels if (h_id < output_h && w_id < output_w && c_id < channels) { - #pragma unroll for (int kh_id = 0; kh_id < kernel_h; ++kh_id) { // kernel height - #pragma unroll for (int kw_id = 0; kw_id < kernel_w; ++kw_id) { // kernel width // Retrieves the input value diff --git a/test/correctness/misc/preprocessor.cpp b/test/correctness/misc/preprocessor.cpp index 92ca2490..8b5f16f5 100644 --- a/test/correctness/misc/preprocessor.cpp +++ b/test/correctness/misc/preprocessor.cpp @@ -243,6 +243,14 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio ; if (TestKernel(device, context, "XgemmDirectTN", gemm_direct_sources, precision)) { passed++; } else { errors++; } + // HEMM + const auto herm_sources = + "#define ROUTINE_HEMM\n" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/convert_hermitian.opencl" + ; + if (TestKernel(device, context, "HermLowerToSquared", herm_sources, precision)) { passed++; } else { errors++; } + // Prints and returns the statistics std::cout << std::endl; std::cout << " " << passed << " test(s) passed" << std::endl; @@ -258,6 +266,7 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunPreprocessor(argc, argv, false, clblast::Precision::kSingle); + errors += clblast::RunPreprocessor(argc, argv, true, clblast::Precision::kComplexDouble); if (errors > 0) { return 1; } else { return 0; } } |