diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-05-12 20:01:33 -0700 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-05-12 20:01:33 -0700 |
commit | f151e56daa617e3327826f06f0765d1673fa8cfd (patch) | |
tree | 66396978988720155adf4f6eb21b921758ccd8aa | |
parent | 86e8df60f1598760511b059b42a9e4f9dddfa150 (diff) |
Added the IxAMIN routines: absolute minimum version of IxAMAX
-rw-r--r-- | CHANGELOG | 2 | ||||
-rw-r--r-- | README.md | 1 | ||||
-rw-r--r-- | doc/clblast.md | 53 | ||||
-rw-r--r-- | include/clblast.h | 7 | ||||
-rw-r--r-- | include/clblast_c.h | 22 | ||||
-rw-r--r-- | include/clblast_netlib_c.h | 10 | ||||
-rwxr-xr-x | scripts/generator/generator.py | 3 | ||||
-rw-r--r-- | src/clblast.cpp | 37 | ||||
-rw-r--r-- | src/clblast_c.cpp | 67 | ||||
-rw-r--r-- | src/clblast_netlib_c.cpp | 90 | ||||
-rw-r--r-- | src/kernels/level1/xamax.opencl | 4 | ||||
-rw-r--r-- | src/routines/level1/xamin.hpp | 49 |
12 files changed, 343 insertions, 2 deletions
@@ -4,6 +4,8 @@ Development (next version) - Performance reports are now external at https://cnugteren.github.io/clblast - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) +- Added non-BLAS level-1 routines: + * iSAMIN/iDAMIN/iCAMIN/iZAMIN (absolute minimum version of the ixAMAX BLAS routines) Version 0.11.0 - Improved the internal program source and binary caches for scalability and speed (thanks to 'intelfx') @@ -289,6 +289,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif | Level-X | S | D | C | Z | H | | -----------|---|---|---|---|---| | xSUM | ✔ | ✔ | ✔ | ✔ | ✔ | +| IxAMIN | ✔ | ✔ | ✔ | ✔ | ✔ | | IxMAX | ✔ | ✔ | ✔ | ✔ | ✔ | | IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ | | xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ | diff --git a/doc/clblast.md b/doc/clblast.md index 6ff5f7d0..fbd0461e 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -573,6 +573,59 @@ Arguments to AMAX: +xAMIN: Index of absolute minimum value in a vector (non-BLAS function) +------------- + +Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. + +C++ API: +``` +template <typename T> +StatusCode Amin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +CLBlastStatusCode CLBlastiSamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastiDamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastiCamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastiZamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastiHamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to AMIN: + +* `const size_t n`: Integer size argument. This value must be positive. +* `cl_mem imin_buffer`: OpenCL buffer to store the output imin vector. +* `const size_t imin_offset`: The offset in elements from the start of the output imin vector. +* `cl_mem imin_buffer`: OpenCL buffer to store the output imin vector. +* `const size_t imin_offset`: The offset in elements from the start of the output imin vector. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + xMAX: Index of maximum value in a vector (non-BLAS function) ------------- diff --git a/include/clblast.h b/include/clblast.h index 54944ea2..354ca591 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -240,6 +240,13 @@ StatusCode Amax(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template <typename T> +StatusCode Amin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template <typename T> StatusCode Max(const size_t n, diff --git a/include/clblast_c.h b/include/clblast_c.h index b0ef5f34..323a28df 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -403,6 +403,28 @@ CLBlastStatusCode PUBLIC_API CLBlastiHamax(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +CLBlastStatusCode PUBLIC_API CLBlastiSamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastiDamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastiCamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastiZamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastiHamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX CLBlastStatusCode PUBLIC_API CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h index 384fab20..4d0d8e1f 100644 --- a/include/clblast_netlib_c.h +++ b/include/clblast_netlib_c.h @@ -240,6 +240,16 @@ int PUBLIC_API cblas_icamax(const int n, int PUBLIC_API cblas_izamax(const int n, const void* x, const int x_inc); +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +int PUBLIC_API cblas_isamin(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idamin(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icamin(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izamin(const int n, + const void* x, const int x_inc); + // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX int PUBLIC_API cblas_ismax(const int n, const float* x, const int x_inc); diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 8c13b2ff..0d0ee29c 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -42,7 +42,7 @@ FILES = [ "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [122, 77, 126, 24, 29, 41, 29, 65, 32] +HEADER_LINES = [122, 78, 126, 24, 29, 41, 29, 65, 32] FOOTER_LINES = [25, 139, 27, 38, 6, 6, 6, 9, 2] HEADER_LINES_DOC = 0 FOOTER_LINES_DOC = 63 @@ -117,6 +117,7 @@ ROUTINES = [ Routine(True, True, False, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), Routine(True, False, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), Routine(True, True, False, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, False, "1", "amin", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of absolute minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.", []), Routine(True, False, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), Routine(True, False, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], diff --git a/src/clblast.cpp b/src/clblast.cpp index 78548eba..1eb1ddd7 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -30,6 +30,7 @@ #include "routines/level1/xasum.hpp" #include "routines/level1/xsum.hpp" // non-BLAS routine #include "routines/level1/xamax.hpp" +#include "routines/level1/xamin.hpp" // non-BLAS routine #include "routines/level1/xmax.hpp" // non-BLAS routine #include "routines/level1/xmin.hpp" // non-BLAS routine @@ -550,6 +551,42 @@ template StatusCode PUBLIC_API Amax<half>(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template <typename T> +StatusCode Amin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xamin<T>(queue_cpp, event); + routine.DoAmin(n, + Buffer<unsigned int>(imin_buffer), imin_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Amin<float>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amin<double>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amin<float2>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amin<double2>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amin<half>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template <typename T> StatusCode Max(const size_t n, diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index b6a64749..d2656274 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -820,6 +820,73 @@ CLBlastStatusCode CLBlastiHamax(const size_t n, } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } } +// AMIN +CLBlastStatusCode CLBlastiSamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + try { + return static_cast<CLBlastStatusCode>( + clblast::Amin<float>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastiDamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + try { + return static_cast<CLBlastStatusCode>( + clblast::Amin<double>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastiCamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + try { + return static_cast<CLBlastStatusCode>( + clblast::Amin<float2>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastiZamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + try { + return static_cast<CLBlastStatusCode>( + clblast::Amin<double2>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastiHamin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + try { + return static_cast<CLBlastStatusCode>( + clblast::Amin<half>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} + // MAX CLBlastStatusCode CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index 3fbabd43..d3b9b5e6 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -1191,6 +1191,96 @@ int cblas_izamax(const int n, return imax[0]; } +// AMIN +int cblas_isamin(const int n, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n * x_inc; + const auto imin_size = 1; + auto x_buffer = clblast::Buffer<float>(context, x_size); + auto imin_buffer = clblast::Buffer<int>(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); + auto queue_cl = queue(); + auto s = clblast::Amin<float>(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin)); + return imin[0]; +} +int cblas_idamin(const int n, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n * x_inc; + const auto imin_size = 1; + auto x_buffer = clblast::Buffer<double>(context, x_size); + auto imin_buffer = clblast::Buffer<int>(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); + auto queue_cl = queue(); + auto s = clblast::Amin<double>(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin)); + return imin[0]; +} +int cblas_icamin(const int n, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n * x_inc; + const auto imin_size = 1; + auto x_buffer = clblast::Buffer<float2>(context, x_size); + auto imin_buffer = clblast::Buffer<int>(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); + auto queue_cl = queue(); + auto s = clblast::Amin<float2>(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin)); + return imin[0]; +} +int cblas_izamin(const int n, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n * x_inc; + const auto imin_size = 1; + auto x_buffer = clblast::Buffer<double2>(context, x_size); + auto imin_buffer = clblast::Buffer<int>(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); + auto queue_cl = queue(); + auto s = clblast::Amin<double2>(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin)); + return imin[0]; +} + // MAX int cblas_ismax(const int n, const float* x, const int x_inc) { diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 48ad2e75..2bd2f714 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file contains the Xamax kernel. It implements an index of absolute max computation using +// This file contains the Xamax kernel. It implements index of (absolute) min/max computation using // reduction kernels. Reduction is split in two parts. In the first (main) kernel the X vector is // loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel // is executed with a single workgroup only, computing the final result. @@ -59,6 +59,8 @@ void Xamax(const int n, // nothing special here #elif defined(ROUTINE_MIN) // non-absolute minimum version x = -x; + #elif defined(ROUTINE_AMIN) // absolute minimum version + x = -fabs(x); #else x = fabs(x); #endif diff --git a/src/routines/level1/xamin.hpp b/src/routines/level1/xamin.hpp new file mode 100644 index 00000000..6622e220 --- /dev/null +++ b/src/routines/level1/xamin.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xamin routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XAMIN_H_ +#define CLBLAST_ROUTINES_XAMIN_H_ + +#include "routine.hpp" +#include "routines/level1/xamax.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xamin: public Xamax<T> { + public: + + // Members and methods from the base class + using Xamax<T>::DoAmax; + + // Constructor + Xamin(Queue &queue, EventPointer event, const std::string &name = "AMIN"): + Xamax<T>(queue, event, name) { + } + + // Forwards to the regular max-absolute version. The implementation difference is realised in the + // kernel through a pre-processor macro based on the name of the routine. + void DoAmin(const size_t n, + const Buffer<unsigned int> &imin_buffer, const size_t imin_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XAMIN_H_ +#endif |