summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-10-01 13:45:08 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-10-01 13:45:08 +0200
commita9d35cf04ceb2ba2185c7520dbff79580abbd785 (patch)
treed12dabf8c65ee699a78cee8313ccb19377ab2832 /src
parent73d135c2cef9763b47d410b125eb8bb89ece8432 (diff)
parentd59e5c570b0bbdb8348d2f9ee6fc5850e606db27 (diff)
Merge branch 'development' into gemm_direct
Diffstat (limited to 'src')
-rw-r--r--src/clpp11.hpp48
-rw-r--r--src/tuning/kernels/xgemm.cpp6
-rw-r--r--src/tuning/tuning.hpp18
-rw-r--r--src/utilities.cpp20
-rw-r--r--src/utilities.hpp2
5 files changed, 54 insertions, 40 deletions
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index d57223dd..aaa76cb4 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -12,8 +12,8 @@
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
-// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore
-// contains the following header copyright notice:
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
//
// =================================================================================================
//
@@ -97,14 +97,12 @@ class Event {
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
float GetElapsedTime() const {
WaitForCompletion();
- auto bytes = size_t{0};
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
- auto time_start = size_t{0};
+ const auto bytes = sizeof(cl_ulong);
+ auto time_start = cl_ulong{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
- auto time_end = size_t{0};
+ auto time_end = cl_ulong{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
- return (time_end - time_start) * 1.0e-6f;
+ return static_cast<float>(time_end - time_start) * 1.0e-6f;
}
// Accessor to the private data-member
@@ -152,6 +150,17 @@ class Platform {
cl_platform_id platform_;
};
+// Retrieves a vector with all platforms
+inline std::vector<Platform> GetAllPlatforms() {
+ auto num_platforms = cl_uint{0};
+ CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
+ auto all_platforms = std::vector<Platform>();
+ for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
+ all_platforms.push_back(Platform(platform_id));
+ }
+ return all_platforms;
+}
+
// =================================================================================================
// C++11 version of 'cl_device_id'
@@ -201,8 +210,8 @@ class Device {
std::vector<size_t> MaxWorkItemSizes() const {
return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
}
- cl_ulong LocalMemSize() const {
- return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
+ unsigned long LocalMemSize() const {
+ return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
}
std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
size_t CoreClock() const {
@@ -238,9 +247,11 @@ class Device {
// Query for a specific type of device or brand
bool IsCPU() const { return Type() == "CPU"; }
bool IsGPU() const { return Type() == "GPU"; }
- bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+ bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
+                            Vendor() == "AuthenticAMD"; }
bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
- bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
+ bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
+ Vendor() == "GenuineIntel"; }
bool IsARM() const { return Vendor() == "ARM"; }
// Accessor to the private data-member
@@ -606,8 +617,7 @@ class Buffer {
// Retrieves the actual allocated size in bytes
size_t GetSize() const {
- auto bytes = size_t{0};
- CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
+ const auto bytes = sizeof(size_t);
auto result = size_t{0};
CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
return result;
@@ -658,17 +668,16 @@ class Kernel {
}
// Retrieves the amount of local memory used per work-group for this kernel
- cl_ulong LocalMemUsage(const Device &device) const {
- auto bytes = size_t{0};
+ unsigned long LocalMemUsage(const Device &device) const {
+ const auto bytes = sizeof(cl_ulong);
auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
- CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
auto result = cl_ulong{0};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
- return result;
+ return static_cast<unsigned long>(result);
}
// Retrieves the name of the kernel
- std::string GetFunctionName() {
+ std::string GetFunctionName() const {
auto bytes = size_t{0};
CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
auto result = std::string{};
@@ -689,6 +698,7 @@ class Kernel {
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
const std::vector<Event> &waitForEvents) {
+
// Builds a plain version of the events waiting list
auto waitForEventsPlain = std::vector<cl_event>();
for (auto &waitEvent : waitForEvents) {
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 4cb7fd00..1abc5e8a 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -126,10 +126,10 @@ class TuneXgemm {
// Sets the local memory size
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
auto LocalMemorySize = [args] (std::vector<size_t> v) {
- return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision));
+ return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision));
};
- tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM",
- "SB", "KWG", "NWG", "VWN"});
+ tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG",
+ "SB", "KWG", "NWG"});
}
// Sets the base thread configuration
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 19df5f9a..8fa93efc 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -30,6 +30,7 @@ namespace clblast {
// that it is automatically compiled for the various kernels (given as the 'C' template argument).
template <typename C, typename T>
void Tuner(int argc, char* argv[]) {
+ constexpr auto kSeed = 42; // fixed seed for reproducibility
// Sets the parameters and platform/device for which to tune (command-line options)
auto help = std::string{"* Options given/available:\n"};
@@ -45,6 +46,8 @@ void Tuner(int argc, char* argv[]) {
if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
}
+ const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{1});
+
fprintf(stdout, "%s\n", help.c_str());
// Tests validity of the given arguments
@@ -73,12 +76,12 @@ void Tuner(int argc, char* argv[]) {
auto b_mat = std::vector<T>(C::GetSizeB(args));
auto c_mat = std::vector<T>(C::GetSizeC(args));
auto temp = std::vector<T>(C::GetSizeTemp(args));
- PopulateVector(x_vec);
- PopulateVector(y_vec);
- PopulateVector(a_mat);
- PopulateVector(b_mat);
- PopulateVector(c_mat);
- PopulateVector(temp);
+ PopulateVector(x_vec, kSeed);
+ PopulateVector(y_vec, kSeed);
+ PopulateVector(a_mat, kSeed);
+ PopulateVector(b_mat, kSeed);
+ PopulateVector(c_mat, kSeed);
+ PopulateVector(temp, kSeed);
// Initializes the tuner for the chosen device
cltune::Tuner tuner(args.platform_id, args.device_id);
@@ -126,6 +129,7 @@ void Tuner(int argc, char* argv[]) {
C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
// Starts the tuning process
+ tuner.SetNumRuns(num_runs);
tuner.Tune();
// Prints the results to screen
@@ -134,7 +138,7 @@ void Tuner(int argc, char* argv[]) {
// Also prints the performance of the best-case in terms of GB/s or GFLOPS
if (time_ms != 0.0) {
- printf("[ -------> ] %.1lf ms", time_ms);
+ printf("[ -------> ] %.2lf ms", time_ms);
printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
}
diff --git a/src/utilities.cpp b/src/utilities.cpp
index 77bc72d7..86cc2d13 100644
--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -270,40 +270,40 @@ unsigned int GetRandomSeed() {
// Create a random number generator and populates a vector with samples from a random distribution
template <typename T>
-void PopulateVector(std::vector<T> &vector) {
+void PopulateVector(std::vector<T> &vector, const unsigned int seed) {
auto lower_limit = static_cast<T>(kTestDataLowerLimit);
auto upper_limit = static_cast<T>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<T> dist(lower_limit, upper_limit);
for (auto &element: vector) { element = dist(mt); }
}
-template void PopulateVector<float>(std::vector<float>&);
-template void PopulateVector<double>(std::vector<double>&);
+template void PopulateVector<float>(std::vector<float>&, const unsigned int);
+template void PopulateVector<double>(std::vector<double>&, const unsigned int);
// Specialized versions of the above for complex data-types
template <>
-void PopulateVector(std::vector<float2> &vector) {
+void PopulateVector(std::vector<float2> &vector, const unsigned int seed) {
auto lower_limit = static_cast<float>(kTestDataLowerLimit);
auto upper_limit = static_cast<float>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
}
template <>
-void PopulateVector(std::vector<double2> &vector) {
+void PopulateVector(std::vector<double2> &vector, const unsigned int seed) {
auto lower_limit = static_cast<double>(kTestDataLowerLimit);
auto upper_limit = static_cast<double>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<double> dist(lower_limit, upper_limit);
for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
}
// Specialized versions of the above for half-precision
template <>
-void PopulateVector(std::vector<half> &vector) {
+void PopulateVector(std::vector<half> &vector, const unsigned int seed) {
const auto lower_limit = static_cast<float>(kTestDataLowerLimit);
const auto upper_limit = static_cast<float>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
for (auto &element: vector) { element = FloatToHalf(dist(mt)); }
}
diff --git a/src/utilities.hpp b/src/utilities.hpp
index 75bd5a69..71bfc1af 100644
--- a/src/utilities.hpp
+++ b/src/utilities.hpp
@@ -219,7 +219,7 @@ constexpr auto kTestDataUpperLimit = 2.0;
// Populates a vector with random data
template <typename T>
-void PopulateVector(std::vector<T> &vector);
+void PopulateVector(std::vector<T> &vector, const unsigned int seed);
// =================================================================================================