summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCNugteren <web@cedricnugteren.nl>2015-06-10 08:44:30 +0200
committerCNugteren <web@cedricnugteren.nl>2015-06-10 08:44:30 +0200
commit85c1db93221bf9d71083c6725a33ccbcd1b61de4 (patch)
treee53e4d62d53cc85c4383bccea12904f27c4ac7bc /src
parent3c17c1c13313022879c8caf289d0f47ea5d7d22d (diff)
Added initial naive version of Xgemv kernel
Diffstat (limited to 'src')
-rw-r--r--src/kernels/xgemv.opencl60
-rw-r--r--src/tuning/tuning.cc58
-rw-r--r--src/tuning/xgemv.cc89
3 files changed, 207 insertions, 0 deletions
diff --git a/src/kernels/xgemv.opencl b/src/kernels/xgemv.opencl
new file mode 100644
index 00000000..c90bc26e
--- /dev/null
+++ b/src/kernels/xgemv.opencl
@@ -0,0 +1,60 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xgemv kernel for matrix-vector multiplication.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef WGS
+ #define WGS 64 // The local work-group size
+#endif
+#ifndef WPT
+ #define WPT 1 // The amount of work-per-thread
+#endif
+#ifndef VW
+ #define VW 1 // Vector width of vectors X and Y
+#endif
+
+// =================================================================================================
+
+// The gemv kernel
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
+ const __global real* restrict agm,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
+
+ // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+ #pragma unroll
+ for (int id = get_global_id(0); id<m; id += get_global_size(0)) {
+
+ // Loop over the elements of the matrix A
+ real acc;
+ SetToZero(acc);
+ for (int k=0; k<n; ++k) {
+ MultiplyAdd(acc, agm[id + m*k], xgm[k*x_inc + x_offset]);
+ }
+ AXPBY(ygm[id*y_inc + y_offset], alpha, acc, beta, ygm[id*y_inc + y_offset]);
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc
index bb93c053..94333089 100644
--- a/src/tuning/tuning.cc
+++ b/src/tuning/tuning.cc
@@ -73,6 +73,64 @@ template void TunerXY<double2>(int, char**, const Tuner2<double2>&);
// =================================================================================================
// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
+// the results. Used for matrix-vector-vector routines.
+template <typename T>
+void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) {
+
+ // Sets the parameters and platform/device for which to tune (command-line options)
+ auto help = std::string{"* Options given/available:\n"};
+ auto args = Arguments<T>{};
+ args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+ args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+ args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+ args.m = GetArgument(argc, argv, help, kArgM, size_t{1024});
+ args.n = GetArgument(argc, argv, help, kArgN, size_t{1024});
+ args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>());
+ args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>());
+ fprintf(stdout, "%s\n", help.c_str());
+
+ // Creates input buffers with random data
+ auto a_mat = std::vector<T>(args.m * args.n);
+ auto x_vec = std::vector<T>(args.n);
+ auto y_vec = std::vector<T>(args.m);
+ PopulateVector(a_mat);
+ PopulateVector(x_vec);
+ PopulateVector(y_vec);
+
+ // Initializes the tuner for the chosen device
+ cltune::Tuner tuner(args.platform_id, args.device_id);
+
+ // Use full-search to explore all parameter combinations.
+ tuner.UseFullSearch();
+
+ // Configures the tuning parameters (kernel specific)
+ tune_function(args, a_mat, x_vec, y_vec, tuner);
+
+ // Starts the tuning process
+ tuner.Tune();
+
+ // Prints the results to screen
+ auto time_ms = tuner.PrintToScreen();
+ tuner.PrintFormatted();
+
+ // Also prints the performance of the best-case in terms of GB/s and GFLOPS
+ const auto mega_bytes = ((args.m*args.n + 2*args.m + args.n)*GetBytes(args.precision)) * 1.0e-6;
+ const auto mega_flops = (2*args.m*args.n) * 1.0e-6;
+ if (time_ms != 0.0) {
+ printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n",
+ time_ms, mega_bytes/time_ms, mega_flops/time_ms);
+ }
+}
+
+// Compiles the above function
+template void TunerAXY<float>(int, char**, const Tuner3<float>&);
+template void TunerAXY<double>(int, char**, const Tuner3<double>&);
+template void TunerAXY<float2>(int, char**, const Tuner3<float2>&);
+template void TunerAXY<double2>(int, char**, const Tuner3<double2>&);
+
+// =================================================================================================
+
+// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
// the results. Used for matrix-matrix routines.
template <typename T>
void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function) {
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
new file mode 100644
index 00000000..1ee7c7bf
--- /dev/null
+++ b/src/tuning/xgemv.cc
@@ -0,0 +1,89 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the Xgemv OpenCL kernel. It uses the CLTune library.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The Xgemv auto-tuner
+template <typename T>
+void XgemvTune(const Arguments<T> &args,
+ const std::vector<T> &a_mat, const std::vector<T> &x_vec, std::vector<T> &y_vec,
+ cltune::Tuner &tuner) {
+
+ // This points to the Xgemv kernel as found in the CLBlast library
+ std::string common_source =
+ #include "../src/kernels/common.opencl"
+ std::string kernel_source =
+ #include "../src/kernels/xgemv.opencl"
+ auto sources = common_source + kernel_source;
+ auto id = tuner.AddKernelFromString(sources, "Xgemv", {args.m}, {1});
+ tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});
+
+ // Sets the tunable parameters and their possible values
+ tuner.AddParameter(id, "WGS", {64, 128});
+ tuner.AddParameter(id, "WPT", {1});
+ tuner.AddParameter(id, "VW", {1});
+
+ // Tests for a specific precision
+ tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+ tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+ // Modifies the thread-sizes (local) based on the parameters
+ tuner.MulLocalSize(id, {"WGS"});
+ tuner.DivGlobalSize(id, {"WPT"});
+ tuner.DivGlobalSize(id, {"VW"});
+
+ // Sets the function's arguments
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(args.alpha);
+ tuner.AddArgumentScalar(args.beta);
+ tuner.AddArgumentInput(a_mat);
+ tuner.AddArgumentInput(x_vec);
+ tuner.AddArgumentScalar(0);
+ tuner.AddArgumentScalar(1);
+ tuner.AddArgumentOutput(y_vec);
+ tuner.AddArgumentScalar(0);
+ tuner.AddArgumentScalar(1);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void TunerXgemv(int argc, char *argv[]) {
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: TunerAXY<float>(argc, argv, XgemvTune<float>); break;
+ case Precision::kDouble: TunerAXY<double>(argc, argv, XgemvTune<double>); break;
+ case Precision::kComplexSingle: TunerAXY<float2>(argc, argv, XgemvTune<float2>); break;
+ case Precision::kComplexDouble: TunerAXY<double2>(argc, argv, XgemvTune<double2>); break;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::TunerXgemv(argc, argv);
+ return 0;
+}
+
+// =================================================================================================