From 75517353d505de1d3979866060261a666aebfd36 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sat, 22 Aug 2015 14:33:48 +0200 Subject: Re-organized level1 xaxpy kernel --- src/kernels/level1/level1.opencl | 96 +++++++++++++++++++++++++++++ src/kernels/level1/xaxpy.opencl | 55 +++++++++++++++++ src/kernels/xaxpy.opencl | 128 --------------------------------------- src/routines/level1/xaxpy.cc | 3 +- src/tuning/xaxpy.cc | 3 +- 5 files changed, 155 insertions(+), 130 deletions(-) create mode 100644 src/kernels/level1/level1.opencl create mode 100644 src/kernels/level1/xaxpy.opencl delete mode 100644 src/kernels/xaxpy.opencl (limited to 'src') diff --git a/src/kernels/level1/level1.opencl b/src/kernels/level1/level1.opencl new file mode 100644 index 00000000..449a20a2 --- /dev/null +++ b/src/kernels/level1/level1.opencl @@ -0,0 +1,96 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the common functions and parameters specific for level 1 BLAS kernels. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef WGS + #define WGS 64 // The local work-group size +#endif +#ifndef WPT + #define WPT 1 // The amount of work-per-thread +#endif +#ifndef VW + #define VW 1 // Vector width of vectors X and Y +#endif + +// ================================================================================================= + +// Data-widths +#if VW == 1 + typedef real realV; +#elif VW == 2 + typedef real2 realV; +#elif VW == 4 + typedef real4 realV; +#elif VW == 8 + typedef real8 realV; +#elif VW == 16 + typedef real16 realV; +#endif + +// ================================================================================================= + +// The vectorized multiply-add function +inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { + #if VW == 1 + MultiplyAdd(cvec, aval, bvec); + #elif VW == 2 + MultiplyAdd(cvec.x, aval, bvec.x); + MultiplyAdd(cvec.y, aval, bvec.y); + #elif VW == 4 + MultiplyAdd(cvec.x, aval, bvec.x); + MultiplyAdd(cvec.y, aval, bvec.y); + MultiplyAdd(cvec.z, aval, bvec.z); + MultiplyAdd(cvec.w, aval, bvec.w); + #elif VW == 8 + MultiplyAdd(cvec.s0, aval, bvec.s0); + MultiplyAdd(cvec.s1, aval, bvec.s1); + MultiplyAdd(cvec.s2, aval, bvec.s2); + MultiplyAdd(cvec.s3, aval, bvec.s3); + MultiplyAdd(cvec.s4, aval, bvec.s4); + MultiplyAdd(cvec.s5, aval, bvec.s5); + MultiplyAdd(cvec.s6, aval, bvec.s6); + MultiplyAdd(cvec.s7, aval, bvec.s7); + #elif VW == 16 + MultiplyAdd(cvec.s0, aval, bvec.s0); + MultiplyAdd(cvec.s1, aval, bvec.s1); + MultiplyAdd(cvec.s2, aval, bvec.s2); + MultiplyAdd(cvec.s3, aval, bvec.s3); + MultiplyAdd(cvec.s4, aval, bvec.s4); + MultiplyAdd(cvec.s5, aval, bvec.s5); + MultiplyAdd(cvec.s6, aval, bvec.s6); + MultiplyAdd(cvec.s7, aval, bvec.s7); + MultiplyAdd(cvec.s8, aval, bvec.s8); + MultiplyAdd(cvec.s9, aval, bvec.s9); + MultiplyAdd(cvec.sA, aval, bvec.sA); + MultiplyAdd(cvec.sB, aval, bvec.sB); + MultiplyAdd(cvec.sC, aval, bvec.sC); + MultiplyAdd(cvec.sD, aval, bvec.sD); + MultiplyAdd(cvec.sE, aval, bvec.sE); + MultiplyAdd(cvec.sF, aval, bvec.sF); + #endif + return cvec; +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl new file mode 100644 index 00000000..3d926d9e --- /dev/null +++ b/src/kernels/level1/xaxpy.opencl @@ -0,0 +1,55 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit +// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't +// support vector data-types. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Full version of the kernel with offsets and strided accesses +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void Xaxpy(const int n, const real alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id -// -// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit -// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't -// support vector data-types. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef WGS - #define WGS 64 // The local work-group size -#endif -#ifndef WPT - #define WPT 1 // The amount of work-per-thread -#endif -#ifndef VW - #define VW 1 // Vector width of vectors X and Y -#endif - -// ================================================================================================= - -// Data-widths -#if VW == 1 - typedef real realV; -#elif VW == 2 - typedef real2 realV; -#elif VW == 4 - typedef real4 realV; -#elif VW == 8 - typedef real8 realV; -#elif VW == 16 - typedef real16 realV; -#endif - -// ================================================================================================= - -// The vectorized multiply-add function -inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { - #if VW == 1 - MultiplyAdd(cvec, aval, bvec); - #elif VW == 2 - MultiplyAdd(cvec.x, aval, bvec.x); - MultiplyAdd(cvec.y, aval, bvec.y); - #elif VW == 4 - MultiplyAdd(cvec.x, aval, bvec.x); - MultiplyAdd(cvec.y, aval, bvec.y); - MultiplyAdd(cvec.z, aval, bvec.z); - MultiplyAdd(cvec.w, aval, bvec.w); - #elif VW == 8 - MultiplyAdd(cvec.s0, aval, bvec.s0); - MultiplyAdd(cvec.s1, aval, bvec.s1); - MultiplyAdd(cvec.s2, aval, bvec.s2); - MultiplyAdd(cvec.s3, aval, bvec.s3); - MultiplyAdd(cvec.s4, aval, bvec.s4); - MultiplyAdd(cvec.s5, aval, bvec.s5); - MultiplyAdd(cvec.s6, aval, bvec.s6); - MultiplyAdd(cvec.s7, aval, bvec.s7); - #elif VW == 16 - MultiplyAdd(cvec.s0, aval, bvec.s0); - MultiplyAdd(cvec.s1, aval, bvec.s1); - MultiplyAdd(cvec.s2, aval, bvec.s2); - MultiplyAdd(cvec.s3, aval, bvec.s3); - MultiplyAdd(cvec.s4, aval, bvec.s4); - MultiplyAdd(cvec.s5, aval, bvec.s5); - MultiplyAdd(cvec.s6, aval, bvec.s6); - MultiplyAdd(cvec.s7, aval, bvec.s7); - MultiplyAdd(cvec.s8, aval, bvec.s8); - MultiplyAdd(cvec.s9, aval, bvec.s9); - MultiplyAdd(cvec.sA, aval, bvec.sA); - MultiplyAdd(cvec.sB, aval, bvec.sB); - MultiplyAdd(cvec.sC, aval, bvec.sC); - MultiplyAdd(cvec.sD, aval, bvec.sD); - MultiplyAdd(cvec.sE, aval, bvec.sE); - MultiplyAdd(cvec.sF, aval, bvec.sF); - #endif - return cvec; -} - -// ================================================================================================= - -// Full version of the kernel with offsets and strided accesses -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xaxpy(const int n, const real alpha, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* ygm, const int y_offset, const int y_inc) { - - // Loops over the work that needs to be done (allows for an arbitrary number of threads) - #pragma unroll - for (int id = get_global_id(0); id Xaxpy::Xaxpy(Queue &queue, Event &event): Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) { source_string_ = - #include "../../kernels/xaxpy.opencl" + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xaxpy.opencl" ; } diff --git a/src/tuning/xaxpy.cc b/src/tuning/xaxpy.cc index cc9e81d3..7715b128 100644 --- a/src/tuning/xaxpy.cc +++ b/src/tuning/xaxpy.cc @@ -31,7 +31,8 @@ class TuneXaxpy { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/xaxpy.opencl" + #include "../src/kernels/level1/level1.opencl" + #include "../src/kernels/level1/xaxpy.opencl" ; } -- cgit v1.2.3