diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-10-01 13:45:08 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-10-01 13:45:08 +0200 |
commit | a9d35cf04ceb2ba2185c7520dbff79580abbd785 (patch) | |
tree | d12dabf8c65ee699a78cee8313ccb19377ab2832 /src | |
parent | 73d135c2cef9763b47d410b125eb8bb89ece8432 (diff) | |
parent | d59e5c570b0bbdb8348d2f9ee6fc5850e606db27 (diff) |
Merge branch 'development' into gemm_direct
Diffstat (limited to 'src')
-rw-r--r-- | src/clpp11.hpp | 48 | ||||
-rw-r--r-- | src/tuning/kernels/xgemm.cpp | 6 | ||||
-rw-r--r-- | src/tuning/tuning.hpp | 18 | ||||
-rw-r--r-- | src/utilities.cpp | 20 | ||||
-rw-r--r-- | src/utilities.hpp | 2 |
5 files changed, 54 insertions, 40 deletions
diff --git a/src/clpp11.hpp b/src/clpp11.hpp index d57223dd..aaa76cb4 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -12,8 +12,8 @@ // Portability here means that a similar header exists for CUDA with the same classes and // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. // -// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore -// contains the following header copyright notice: +// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and +// therefore contains the following header copyright notice: // // ================================================================================================= // @@ -97,14 +97,12 @@ class Event { // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx float GetElapsedTime() const { WaitForCompletion(); - auto bytes = size_t{0}; - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); - auto time_start = size_t{0}; + const auto bytes = sizeof(cl_ulong); + auto time_start = cl_ulong{0}; clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); - auto time_end = size_t{0}; + auto time_end = cl_ulong{0}; clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); - return (time_end - time_start) * 1.0e-6f; + return static_cast<float>(time_end - time_start) * 1.0e-6f; } // Accessor to the private data-member @@ -152,6 +150,17 @@ class Platform { cl_platform_id platform_; }; +// Retrieves a vector with all platforms +inline std::vector<Platform> GetAllPlatforms() { + auto num_platforms = cl_uint{0}; + CheckError(clGetPlatformIDs(0, nullptr, &num_platforms)); + auto all_platforms = std::vector<Platform>(); + for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) { + all_platforms.push_back(Platform(platform_id)); + } + return all_platforms; +} + // ================================================================================================= // C++11 version of 'cl_device_id' @@ -201,8 +210,8 @@ class Device { std::vector<size_t> MaxWorkItemSizes() const { return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES); } - cl_ulong LocalMemSize() const { - return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE); + unsigned long LocalMemSize() const { + return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE)); } std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } size_t CoreClock() const { @@ -238,9 +247,11 @@ class Device { // Query for a specific type of device or brand bool IsCPU() const { return Type() == "CPU"; } bool IsGPU() const { return Type() == "GPU"; } - bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; } + bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." || + Vendor() == "AuthenticAMD";; } bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; } - bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; } + bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" || + Vendor() == "GenuineIntel"; } bool IsARM() const { return Vendor() == "ARM"; } // Accessor to the private data-member @@ -606,8 +617,7 @@ class Buffer { // Retrieves the actual allocated size in bytes size_t GetSize() const { - auto bytes = size_t{0}; - CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes)); + const auto bytes = sizeof(size_t); auto result = size_t{0}; CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr)); return result; @@ -658,17 +668,16 @@ class Kernel { } // Retrieves the amount of local memory used per work-group for this kernel - cl_ulong LocalMemUsage(const Device &device) const { - auto bytes = size_t{0}; + unsigned long LocalMemUsage(const Device &device) const { + const auto bytes = sizeof(cl_ulong); auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; - CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes)); auto result = cl_ulong{0}; CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); - return result; + return static_cast<unsigned long>(result); } // Retrieves the name of the kernel - std::string GetFunctionName() { + std::string GetFunctionName() const { auto bytes = size_t{0}; CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes)); auto result = std::string{}; @@ -689,6 +698,7 @@ class Kernel { void Launch(const Queue &queue, const std::vector<size_t> &global, const std::vector<size_t> &local, EventPointer event, const std::vector<Event> &waitForEvents) { + // Builds a plain version of the events waiting list auto waitForEventsPlain = std::vector<cl_event>(); for (auto &waitEvent : waitForEvents) { diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 4cb7fd00..1abc5e8a 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -126,10 +126,10 @@ class TuneXgemm { // Sets the local memory size static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { auto LocalMemorySize = [args] (std::vector<size_t> v) { - return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision)); + return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision)); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM", - "SB", "KWG", "NWG", "VWN"}); + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", + "SB", "KWG", "NWG"}); } // Sets the base thread configuration diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index 19df5f9a..8fa93efc 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -30,6 +30,7 @@ namespace clblast { // that it is automatically compiled for the various kernels (given as the 'C' template argument). template <typename C, typename T> void Tuner(int argc, char* argv[]) { + constexpr auto kSeed = 42; // fixed seed for reproducibility // Sets the parameters and platform/device for which to tune (command-line options) auto help = std::string{"* Options given/available:\n"}; @@ -45,6 +46,8 @@ void Tuner(int argc, char* argv[]) { if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); } if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); } } + const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{1}); + fprintf(stdout, "%s\n", help.c_str()); // Tests validity of the given arguments @@ -73,12 +76,12 @@ void Tuner(int argc, char* argv[]) { auto b_mat = std::vector<T>(C::GetSizeB(args)); auto c_mat = std::vector<T>(C::GetSizeC(args)); auto temp = std::vector<T>(C::GetSizeTemp(args)); - PopulateVector(x_vec); - PopulateVector(y_vec); - PopulateVector(a_mat); - PopulateVector(b_mat); - PopulateVector(c_mat); - PopulateVector(temp); + PopulateVector(x_vec, kSeed); + PopulateVector(y_vec, kSeed); + PopulateVector(a_mat, kSeed); + PopulateVector(b_mat, kSeed); + PopulateVector(c_mat, kSeed); + PopulateVector(temp, kSeed); // Initializes the tuner for the chosen device cltune::Tuner tuner(args.platform_id, args.device_id); @@ -126,6 +129,7 @@ void Tuner(int argc, char* argv[]) { C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp); // Starts the tuning process + tuner.SetNumRuns(num_runs); tuner.Tune(); // Prints the results to screen @@ -134,7 +138,7 @@ void Tuner(int argc, char* argv[]) { // Also prints the performance of the best-case in terms of GB/s or GFLOPS if (time_ms != 0.0) { - printf("[ -------> ] %.1lf ms", time_ms); + printf("[ -------> ] %.2lf ms", time_ms); printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str()); } diff --git a/src/utilities.cpp b/src/utilities.cpp index 77bc72d7..86cc2d13 100644 --- a/src/utilities.cpp +++ b/src/utilities.cpp @@ -270,40 +270,40 @@ unsigned int GetRandomSeed() { // Create a random number generator and populates a vector with samples from a random distribution template <typename T> -void PopulateVector(std::vector<T> &vector) { +void PopulateVector(std::vector<T> &vector, const unsigned int seed) { auto lower_limit = static_cast<T>(kTestDataLowerLimit); auto upper_limit = static_cast<T>(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); + std::mt19937 mt(seed); std::uniform_real_distribution<T> dist(lower_limit, upper_limit); for (auto &element: vector) { element = dist(mt); } } -template void PopulateVector<float>(std::vector<float>&); -template void PopulateVector<double>(std::vector<double>&); +template void PopulateVector<float>(std::vector<float>&, const unsigned int); +template void PopulateVector<double>(std::vector<double>&, const unsigned int); // Specialized versions of the above for complex data-types template <> -void PopulateVector(std::vector<float2> &vector) { +void PopulateVector(std::vector<float2> &vector, const unsigned int seed) { auto lower_limit = static_cast<float>(kTestDataLowerLimit); auto upper_limit = static_cast<float>(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); + std::mt19937 mt(seed); std::uniform_real_distribution<float> dist(lower_limit, upper_limit); for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } } template <> -void PopulateVector(std::vector<double2> &vector) { +void PopulateVector(std::vector<double2> &vector, const unsigned int seed) { auto lower_limit = static_cast<double>(kTestDataLowerLimit); auto upper_limit = static_cast<double>(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); + std::mt19937 mt(seed); std::uniform_real_distribution<double> dist(lower_limit, upper_limit); for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } } // Specialized versions of the above for half-precision template <> -void PopulateVector(std::vector<half> &vector) { +void PopulateVector(std::vector<half> &vector, const unsigned int seed) { const auto lower_limit = static_cast<float>(kTestDataLowerLimit); const auto upper_limit = static_cast<float>(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); + std::mt19937 mt(seed); std::uniform_real_distribution<float> dist(lower_limit, upper_limit); for (auto &element: vector) { element = FloatToHalf(dist(mt)); } } diff --git a/src/utilities.hpp b/src/utilities.hpp index 75bd5a69..71bfc1af 100644 --- a/src/utilities.hpp +++ b/src/utilities.hpp @@ -219,7 +219,7 @@ constexpr auto kTestDataUpperLimit = 2.0; // Populates a vector with random data template <typename T> -void PopulateVector(std::vector<T> &vector); +void PopulateVector(std::vector<T> &vector, const unsigned int seed); // ================================================================================================= |