diff options
author | CNugteren <web@cedricnugteren.nl> | 2015-06-10 08:44:30 +0200 |
---|---|---|
committer | CNugteren <web@cedricnugteren.nl> | 2015-06-10 08:44:30 +0200 |
commit | 85c1db93221bf9d71083c6725a33ccbcd1b61de4 (patch) | |
tree | e53e4d62d53cc85c4383bccea12904f27c4ac7bc /src | |
parent | 3c17c1c13313022879c8caf289d0f47ea5d7d22d (diff) |
Added initial naive version of Xgemv kernel
Diffstat (limited to 'src')
-rw-r--r-- | src/kernels/xgemv.opencl | 60 | ||||
-rw-r--r-- | src/tuning/tuning.cc | 58 | ||||
-rw-r--r-- | src/tuning/xgemv.cc | 89 |
3 files changed, 207 insertions, 0 deletions
diff --git a/src/kernels/xgemv.opencl b/src/kernels/xgemv.opencl new file mode 100644 index 00000000..c90bc26e --- /dev/null +++ b/src/kernels/xgemv.opencl @@ -0,0 +1,60 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the Xgemv kernel for matrix-vector multiplication. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef WGS + #define WGS 64 // The local work-group size +#endif +#ifndef WPT + #define WPT 1 // The amount of work-per-thread +#endif +#ifndef VW + #define VW 1 // Vector width of vectors X and Y +#endif + +// ================================================================================================= + +// The gemv kernel +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void Xgemv(const int m, const int n, const real alpha, const real beta, + const __global real* restrict agm, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id<m; id += get_global_size(0)) { + + // Loop over the elements of the matrix A + real acc; + SetToZero(acc); + for (int k=0; k<n; ++k) { + MultiplyAdd(acc, agm[id + m*k], xgm[k*x_inc + x_offset]); + } + AXPBY(ygm[id*y_inc + y_offset], alpha, acc, beta, ygm[id*y_inc + y_offset]); + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)"; + +// ================================================================================================= diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc index bb93c053..94333089 100644 --- a/src/tuning/tuning.cc +++ b/src/tuning/tuning.cc @@ -73,6 +73,64 @@ template void TunerXY<double2>(int, char**, const Tuner2<double2>&); // ================================================================================================= // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect +// the results. Used for matrix-vector-vector routines. +template <typename T> +void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) { + + // Sets the parameters and platform/device for which to tune (command-line options) + auto help = std::string{"* Options given/available:\n"}; + auto args = Arguments<T>{}; + args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); + args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); + args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); + args.m = GetArgument(argc, argv, help, kArgM, size_t{1024}); + args.n = GetArgument(argc, argv, help, kArgN, size_t{1024}); + args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); + args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); + fprintf(stdout, "%s\n", help.c_str()); + + // Creates input buffers with random data + auto a_mat = std::vector<T>(args.m * args.n); + auto x_vec = std::vector<T>(args.n); + auto y_vec = std::vector<T>(args.m); + PopulateVector(a_mat); + PopulateVector(x_vec); + PopulateVector(y_vec); + + // Initializes the tuner for the chosen device + cltune::Tuner tuner(args.platform_id, args.device_id); + + // Use full-search to explore all parameter combinations. + tuner.UseFullSearch(); + + // Configures the tuning parameters (kernel specific) + tune_function(args, a_mat, x_vec, y_vec, tuner); + + // Starts the tuning process + tuner.Tune(); + + // Prints the results to screen + auto time_ms = tuner.PrintToScreen(); + tuner.PrintFormatted(); + + // Also prints the performance of the best-case in terms of GB/s and GFLOPS + const auto mega_bytes = ((args.m*args.n + 2*args.m + args.n)*GetBytes(args.precision)) * 1.0e-6; + const auto mega_flops = (2*args.m*args.n) * 1.0e-6; + if (time_ms != 0.0) { + printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n", + time_ms, mega_bytes/time_ms, mega_flops/time_ms); + } +} + +// Compiles the above function +template void TunerAXY<float>(int, char**, const Tuner3<float>&); +template void TunerAXY<double>(int, char**, const Tuner3<double>&); +template void TunerAXY<float2>(int, char**, const Tuner3<float2>&); +template void TunerAXY<double2>(int, char**, const Tuner3<double2>&); + +// ================================================================================================= + +// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect // the results. Used for matrix-matrix routines. template <typename T> void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function) { diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc new file mode 100644 index 00000000..1ee7c7bf --- /dev/null +++ b/src/tuning/xgemv.cc @@ -0,0 +1,89 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements an auto-tuner to tune the Xgemv OpenCL kernel. It uses the CLTune library. +// +// ================================================================================================= + +#include <string> +#include <vector> +#include <stdexcept> + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// The Xgemv auto-tuner +template <typename T> +void XgemvTune(const Arguments<T> &args, + const std::vector<T> &a_mat, const std::vector<T> &x_vec, std::vector<T> &y_vec, + cltune::Tuner &tuner) { + + // This points to the Xgemv kernel as found in the CLBlast library + std::string common_source = + #include "../src/kernels/common.opencl" + std::string kernel_source = + #include "../src/kernels/xgemv.opencl" + auto sources = common_source + kernel_source; + auto id = tuner.AddKernelFromString(sources, "Xgemv", {args.m}, {1}); + tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64}); + + // Sets the tunable parameters and their possible values + tuner.AddParameter(id, "WGS", {64, 128}); + tuner.AddParameter(id, "WPT", {1}); + tuner.AddParameter(id, "VW", {1}); + + // Tests for a specific precision + tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); + tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + + // Modifies the thread-sizes (local) based on the parameters + tuner.MulLocalSize(id, {"WGS"}); + tuner.DivGlobalSize(id, {"WPT"}); + tuner.DivGlobalSize(id, {"VW"}); + + // Sets the function's arguments + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentScalar(static_cast<int>(args.n)); + tuner.AddArgumentScalar(args.alpha); + tuner.AddArgumentScalar(args.beta); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentOutput(y_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); +} + +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. +void TunerXgemv(int argc, char *argv[]) { + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: TunerAXY<float>(argc, argv, XgemvTune<float>); break; + case Precision::kDouble: TunerAXY<double>(argc, argv, XgemvTune<double>); break; + case Precision::kComplexSingle: TunerAXY<float2>(argc, argv, XgemvTune<float2>); break; + case Precision::kComplexDouble: TunerAXY<double2>(argc, argv, XgemvTune<double2>); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::TunerXgemv(argc, argv); + return 0; +} + +// ================================================================================================= |