diff options
Diffstat (limited to 'src')
54 files changed, 617 insertions, 430 deletions
diff --git a/src/cache.cpp b/src/cache.cpp index a2e51792..c5cc6a4d 100644 --- a/src/cache.cpp +++ b/src/cache.cpp @@ -15,108 +15,84 @@ #include <vector> #include <mutex> +#include "database/database.hpp" #include "cache.hpp" namespace clblast { // ================================================================================================= -// Stores the compiled binary or IR in the cache -void StoreBinaryToCache(const std::string &binary, const std::string &device_name, - const Precision &precision, const std::string &routine_name) { - #ifdef VERBOSE - printf("[DEBUG] Storing binary '%s' in cache\n", routine_name.c_str()); - #endif - binary_cache_mutex_.lock(); - binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name}); - binary_cache_mutex_.unlock(); -} - -// Stores the compiled program in the cache -void StoreProgramToCache(const Program &program, const Context &context, - const Precision &precision, const std::string &routine_name) { - #ifdef VERBOSE - printf("[DEBUG] Storing program '%s' in cache\n", routine_name.c_str()); - #endif - program_cache_mutex_.lock(); - program_cache_.push_back(ProgramCache{program, context(), precision, routine_name}); - program_cache_mutex_.unlock(); -} +template <typename Key, typename Value> +template <typename U> +Value Cache<Key, Value>::Get(const U &key, bool *in_cache) const { + std::lock_guard<std::mutex> lock(cache_mutex_); -// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws -// otherwise. -const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name) { - #ifdef VERBOSE - printf("[DEBUG] Retrieving binary '%s' from cache\n", routine_name.c_str()); - #endif - binary_cache_mutex_.lock(); - for (auto &cached_binary: binary_cache_) { - if (cached_binary.MatchInCache(device_name, precision, routine_name)) { - binary_cache_mutex_.unlock(); - return cached_binary.binary; +#if __cplusplus >= 201402L + // generalized std::map::find() of C++14 + auto it = cache_.find(key); +#else + // O(n) lookup in a vector + auto it = std::find_if(cache_.begin(), cache_.end(), [&] (const std::pair<Key, Value> &pair) { + return pair.first == key; + }); +#endif + if (it == cache_.end()) { + if (in_cache) { + *in_cache = false; } + return Value(); } - binary_cache_mutex_.unlock(); - throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none"); -} -// Queries the cache and retrieves a matching program. Assumes that the match is available, throws -// otherwise. -const Program& GetProgramFromCache(const Context &context, const Precision &precision, - const std::string &routine_name) { - #ifdef VERBOSE - printf("[DEBUG] Retrieving program '%s' from cache\n", routine_name.c_str()); - #endif - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(context(), precision, routine_name)) { - program_cache_mutex_.unlock(); - return cached_program.program; - } + if (in_cache) { + *in_cache = true; } - program_cache_mutex_.unlock(); - throw LogicError("GetProgramFromCache: Expected program in cache, but found none"); + return it->second; } -// Queries the cache to see whether or not the compiled kernel is already there -bool BinaryIsInCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name) { - binary_cache_mutex_.lock(); - for (auto &cached_binary: binary_cache_) { - if (cached_binary.MatchInCache(device_name, precision, routine_name)) { - binary_cache_mutex_.unlock(); - return true; - } +template <typename Key, typename Value> +void Cache<Key, Value>::Store(Key &&key, Value &&value) { + std::lock_guard<std::mutex> lock(cache_mutex_); + +#if __cplusplus >= 201402L + // emplace() into a map + auto r = cache_.emplace(std::move(key), std::move(value)); + if (!r.second) { + throw LogicError("Cache::Store: object already in cache"); } - binary_cache_mutex_.unlock(); - return false; +#else + // emplace_back() into a vector + cache_.emplace_back(std::move(key), std::move(value)); +#endif } -// Queries the cache to see whether or not the compiled kernel is already there -bool ProgramIsInCache(const Context &context, const Precision &precision, - const std::string &routine_name) { - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(context(), precision, routine_name)) { - program_cache_mutex_.unlock(); - return true; - } - } - program_cache_mutex_.unlock(); - return false; +template <typename Key, typename Value> +void Cache<Key, Value>::Invalidate() { + std::lock_guard<std::mutex> lock(cache_mutex_); + + cache_.clear(); } +template <typename Key, typename Value> +Cache<Key, Value> &Cache<Key, Value>::Instance() { + return instance_; +} + +template <typename Key, typename Value> +Cache<Key, Value> Cache<Key, Value>::instance_; + // ================================================================================================= -// Clears the cache of stored binaries and programs -void CacheClearAll() { - binary_cache_mutex_.lock(); - binary_cache_.clear(); - binary_cache_mutex_.unlock(); - program_cache_mutex_.lock(); - program_cache_.clear(); - program_cache_mutex_.unlock(); -} +template class Cache<BinaryKey, std::string>; +template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const; + +// ================================================================================================= + +template class Cache<ProgramKey, Program>; +template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const; + +// ================================================================================================= + +template class Cache<DatabaseKey, Database>; +template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const; // ================================================================================================= } // namespace clblast diff --git a/src/cache.hpp b/src/cache.hpp index 9ecb0f1e..c3675f07 100644 --- a/src/cache.hpp +++ b/src/cache.hpp @@ -15,81 +15,89 @@ #define CLBLAST_CACHE_H_ #include <string> -#include <vector> #include <mutex> +#include <map> #include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= -// The cache of compiled OpenCL binaries, along with some meta-data -struct BinaryCache { - std::string binary; - std::string device_name; - Precision precision; - std::string routine_name_; - - // Finds out whether the properties match - bool MatchInCache(const std::string &ref_device, const Precision &ref_precision, - const std::string &ref_routine) { - return (device_name == ref_device && - precision == ref_precision && - routine_name_ == ref_routine); - } -}; - -// The actual cache, implemented as a vector of the above data-type, and its mutex -static std::vector<BinaryCache> binary_cache_; -static std::mutex binary_cache_mutex_; +// The generic thread-safe cache. We assume that the Key may be a heavyweight struct that is not +// normally used by the caller, while the Value is either lightweight or ref-counted. +// Hence, searching by non-Key is supported (if there is a corresponding operator<()), and +// on Store() the Key instance is moved from the caller (because it will likely be constructed +// as temporary at the time of Store()). +template <typename Key, typename Value> +class Cache { +public: + // Cached object is returned by-value to avoid racing with Invalidate(). + // Due to lack of std::optional<>, in case of a cache miss we return a default-constructed + // Value and set the flag to false. + template <typename U> + Value Get(const U &key, bool *in_cache) const; + + // We do not return references to just stored object to avoid racing with Invalidate(). + // Caller is expected to store a temporary. + void Store(Key &&key, Value &&value); + void Invalidate(); + + static Cache<Key, Value> &Instance(); + +private: +#if __cplusplus >= 201402L + // The std::less<void> allows to search in cache by an object comparable with Key, without + // constructing a temporary Key + // (see http://en.cppreference.com/w/cpp/utility/functional/less_void, + // http://www.open-std.org/JTC1/SC22/WG21/docs/papers/2013/n3657.htm, + // http://stackoverflow.com/questions/10536788/avoiding-key-construction-for-stdmapfind) + std::map<Key, Value, std::less<void>> cache_; +#else + std::vector<std::pair<Key, Value>> cache_; +#endif + mutable std::mutex cache_mutex_; + + static Cache<Key, Value> instance_; +}; // class Cache // ================================================================================================= -// The cache of compiled OpenCL programs, along with some meta-data -struct ProgramCache { - Program program; - cl_context context; - Precision precision; - std::string routine_name_; - - // Finds out whether the properties match - bool MatchInCache(const cl_context ref_context, const Precision &ref_precision, - const std::string &ref_routine) { - return (context == ref_context && - precision == ref_precision && - routine_name_ == ref_routine); - } -}; - -// The actual cache, implemented as a vector of the above data-type, and its mutex -static std::vector<ProgramCache> program_cache_; -static std::mutex program_cache_mutex_; +// The key struct for the cache of compiled OpenCL binaries +// Order of fields: precision, routine_name, device_name (smaller fields first) +typedef std::tuple<Precision, std::string, std::string> BinaryKey; +typedef std::tuple<const Precision &, const std::string &, const std::string &> BinaryKeyRef; + +typedef Cache<BinaryKey, std::string> BinaryCache; + +extern template class Cache<BinaryKey, std::string>; +extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const; + // ================================================================================================= -// Stores the compiled binary or program in the cache -void StoreBinaryToCache(const std::string &binary, const std::string &device_name, - const Precision &precision, const std::string &routine_name); -void StoreProgramToCache(const Program &program, const Context &context, - const Precision &precision, const std::string &routine_name); - -// Queries the cache and retrieves a matching binary or program. Assumes that the match is -// available, throws otherwise. -const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name); -const Program& GetProgramFromCache(const Context &context, const Precision &precision, - const std::string &routine_name); - -// Queries the cache to see whether or not the compiled kernel is already there -bool BinaryIsInCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name); -bool ProgramIsInCache(const Context &context, const Precision &precision, - const std::string &routine_name); +// The key struct for the cache of compiled OpenCL programs (context-dependent) +// Order of fields: context, precision, routine_name (smaller fields first) +typedef std::tuple<cl_context, Precision, std::string> ProgramKey; +typedef std::tuple<const cl_context &, const Precision &, const std::string &> ProgramKeyRef; + +typedef Cache<ProgramKey, Program> ProgramCache; + +extern template class Cache<ProgramKey, Program>; +extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const; // ================================================================================================= -// Clears the cache of stored binaries -void CacheClearAll(); +class Database; + +// The key struct for the cache of database maps. +// Order of fields: precision, device_name, routines (smaller fields first) +typedef std::tuple<Precision, std::string, std::vector<std::string>> DatabaseKey; +typedef std::tuple<const Precision &, const std::string &, const std::vector<std::string> &> DatabaseKeyRef; + +typedef Cache<DatabaseKey, Database> DatabaseCache; + +extern template class Cache<DatabaseKey, Database>; +extern template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const; // ================================================================================================= } // namespace clblast diff --git a/src/clblast.cpp b/src/clblast.cpp index ef1cedf9..20ce1ba4 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -15,8 +15,8 @@ #include <string> -#include "clblast.h" #include "cache.hpp" +#include "clblast.h" // BLAS level-1 includes #include "routines/level1/xswap.hpp" @@ -2184,11 +2184,77 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose, // Clears the cache of stored binaries StatusCode ClearCache() { try { - CacheClearAll(); + ProgramCache::Instance().Invalidate(); + BinaryCache::Instance().Invalidate(); } catch (...) { return DispatchException(); } return StatusCode::kSuccess; } +template <typename Real, typename Complex> +void FillCacheForPrecision(Queue &queue) { + try { + + // Runs all the level 1 set-up functions + Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr); + Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr); + Xscal<Real>(queue, nullptr); Xscal<Complex>(queue, nullptr); + Xcopy<Real>(queue, nullptr); Xcopy<Complex>(queue, nullptr); + Xaxpy<Real>(queue, nullptr); Xaxpy<Complex>(queue, nullptr); + Xdot<Real>(queue, nullptr); + Xdotu<Complex>(queue, nullptr); + Xdotc<Complex>(queue, nullptr); + Xnrm2<Real>(queue, nullptr); Xnrm2<Complex>(queue, nullptr); + Xasum<Real>(queue, nullptr); Xasum<Complex>(queue, nullptr); + Xsum<Real>(queue, nullptr); Xsum<Complex>(queue, nullptr); + Xamax<Real>(queue, nullptr); Xamax<Complex>(queue, nullptr); + Xmax<Real>(queue, nullptr); Xmax<Complex>(queue, nullptr); + Xmin<Real>(queue, nullptr); Xmin<Complex>(queue, nullptr); + + // Runs all the level 2 set-up functions + Xgemv<Real>(queue, nullptr); Xgemv<Complex>(queue, nullptr); + Xgbmv<Real>(queue, nullptr); Xgbmv<Complex>(queue, nullptr); + Xhemv<Complex>(queue, nullptr); + Xhbmv<Complex>(queue, nullptr); + Xhpmv<Complex>(queue, nullptr); + Xsymv<Real>(queue, nullptr); + Xsbmv<Real>(queue, nullptr); + Xspmv<Real>(queue, nullptr); + Xtrmv<Real>(queue, nullptr); Xtrmv<Complex>(queue, nullptr); + Xtbmv<Real>(queue, nullptr); Xtbmv<Complex>(queue, nullptr); + Xtpmv<Real>(queue, nullptr); Xtpmv<Complex>(queue, nullptr); + Xger<Real>(queue, nullptr); + Xgeru<Complex>(queue, nullptr); + Xgerc<Complex>(queue, nullptr); + Xher<Complex,Real>(queue, nullptr); + Xhpr<Complex,Real>(queue, nullptr); + Xher2<Complex>(queue, nullptr); + Xhpr2<Complex>(queue, nullptr); + Xsyr<Real>(queue, nullptr); + Xspr<Real>(queue, nullptr); + Xsyr2<Real>(queue, nullptr); + Xspr2<Real>(queue, nullptr); + + // Runs all the level 3 set-up functions + Xgemm<Real>(queue, nullptr); Xgemm<Complex>(queue, nullptr); + Xsymm<Real>(queue, nullptr); Xsymm<Complex>(queue, nullptr); + Xhemm<Complex>(queue, nullptr); + Xsyrk<Real>(queue, nullptr); Xsyrk<Complex>(queue, nullptr); + Xherk<Complex,Real>(queue, nullptr); + Xsyr2k<Real>(queue, nullptr); Xsyr2k<Complex>(queue, nullptr); + Xher2k<Complex,Real>(queue, nullptr); + Xtrmm<Real>(queue, nullptr); Xtrmm<Complex>(queue, nullptr); + + // Runs all the non-BLAS set-up functions + Xomatcopy<Real>(queue, nullptr); Xomatcopy<Complex>(queue, nullptr); + + } catch(const RuntimeErrorCode &e) { + if (e.status() != StatusCode::kNoDoublePrecision && + e.status() != StatusCode::kNoHalfPrecision) { + throw; + } + } +} + // Fills the cache with all binaries for a specific device // TODO: Add half-precision FP16 set-up calls StatusCode FillCache(const cl_device_id device) { @@ -2199,58 +2265,8 @@ StatusCode FillCache(const cl_device_id device) { auto context = Context(device_cpp); auto queue = Queue(context, device_cpp); - // Runs all the level 1 set-up functions - Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr); - Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr); - Xscal<float>(queue, nullptr); Xscal<double>(queue, nullptr); Xscal<float2>(queue, nullptr); Xscal<double2>(queue, nullptr); - Xcopy<float>(queue, nullptr); Xcopy<double>(queue, nullptr); Xcopy<float2>(queue, nullptr); Xcopy<double2>(queue, nullptr); - Xaxpy<float>(queue, nullptr); Xaxpy<double>(queue, nullptr); Xaxpy<float2>(queue, nullptr); Xaxpy<double2>(queue, nullptr); - Xdot<float>(queue, nullptr); Xdot<double>(queue, nullptr); - Xdotu<float2>(queue, nullptr); Xdotu<double2>(queue, nullptr); - Xdotc<float2>(queue, nullptr); Xdotc<double2>(queue, nullptr); - Xnrm2<float>(queue, nullptr); Xnrm2<double>(queue, nullptr); Xnrm2<float2>(queue, nullptr); Xnrm2<double2>(queue, nullptr); - Xasum<float>(queue, nullptr); Xasum<double>(queue, nullptr); Xasum<float2>(queue, nullptr); Xasum<double2>(queue, nullptr); - Xsum<float>(queue, nullptr); Xsum<double>(queue, nullptr); Xsum<float2>(queue, nullptr); Xsum<double2>(queue, nullptr); - Xamax<float>(queue, nullptr); Xamax<double>(queue, nullptr); Xamax<float2>(queue, nullptr); Xamax<double2>(queue, nullptr); - Xmax<float>(queue, nullptr); Xmax<double>(queue, nullptr); Xmax<float2>(queue, nullptr); Xmax<double2>(queue, nullptr); - Xmin<float>(queue, nullptr); Xmin<double>(queue, nullptr); Xmin<float2>(queue, nullptr); Xmin<double2>(queue, nullptr); - - // Runs all the level 2 set-up functions - Xgemv<float>(queue, nullptr); Xgemv<double>(queue, nullptr); Xgemv<float2>(queue, nullptr); Xgemv<double2>(queue, nullptr); - Xgbmv<float>(queue, nullptr); Xgbmv<double>(queue, nullptr); Xgbmv<float2>(queue, nullptr); Xgbmv<double2>(queue, nullptr); - Xhemv<float2>(queue, nullptr); Xhemv<double2>(queue, nullptr); - Xhbmv<float2>(queue, nullptr); Xhbmv<double2>(queue, nullptr); - Xhpmv<float2>(queue, nullptr); Xhpmv<double2>(queue, nullptr); - Xsymv<float>(queue, nullptr); Xsymv<double>(queue, nullptr); - Xsbmv<float>(queue, nullptr); Xsbmv<double>(queue, nullptr); - Xspmv<float>(queue, nullptr); Xspmv<double>(queue, nullptr); - Xtrmv<float>(queue, nullptr); Xtrmv<double>(queue, nullptr); Xtrmv<float2>(queue, nullptr); Xtrmv<double2>(queue, nullptr); - Xtbmv<float>(queue, nullptr); Xtbmv<double>(queue, nullptr); Xtbmv<float2>(queue, nullptr); Xtbmv<double2>(queue, nullptr); - Xtpmv<float>(queue, nullptr); Xtpmv<double>(queue, nullptr); Xtpmv<float2>(queue, nullptr); Xtpmv<double2>(queue, nullptr); - Xger<float>(queue, nullptr); Xger<double>(queue, nullptr); - Xgeru<float2>(queue, nullptr); Xgeru<double2>(queue, nullptr); - Xgerc<float2>(queue, nullptr); Xgerc<double2>(queue, nullptr); - Xher<float2,float>(queue, nullptr); Xher<double2,double>(queue, nullptr); - Xhpr<float2,float>(queue, nullptr); Xhpr<double2,double>(queue, nullptr); - Xher2<float2>(queue, nullptr); Xher2<double2>(queue, nullptr); - Xhpr2<float2>(queue, nullptr); Xhpr2<double2>(queue, nullptr); - Xsyr<float>(queue, nullptr); Xsyr<double>(queue, nullptr); - Xspr<float>(queue, nullptr); Xspr<double>(queue, nullptr); - Xsyr2<float>(queue, nullptr); Xsyr2<double>(queue, nullptr); - Xspr2<float>(queue, nullptr); Xspr2<double>(queue, nullptr); - - // Runs all the level 3 set-up functions - Xgemm<float>(queue, nullptr); Xgemm<double>(queue, nullptr); Xgemm<float2>(queue, nullptr); Xgemm<double2>(queue, nullptr); - Xsymm<float>(queue, nullptr); Xsymm<double>(queue, nullptr); Xsymm<float2>(queue, nullptr); Xsymm<double2>(queue, nullptr); - Xhemm<float2>(queue, nullptr); Xhemm<double2>(queue, nullptr); - Xsyrk<float>(queue, nullptr); Xsyrk<double>(queue, nullptr); Xsyrk<float2>(queue, nullptr); Xsyrk<double2>(queue, nullptr); - Xherk<float2,float>(queue, nullptr); Xherk<double2,double>(queue, nullptr); - Xsyr2k<float>(queue, nullptr); Xsyr2k<double>(queue, nullptr); Xsyr2k<float2>(queue, nullptr); Xsyr2k<double2>(queue, nullptr); - Xher2k<float2,float>(queue, nullptr); Xher2k<double2,double>(queue, nullptr); - Xtrmm<float>(queue, nullptr); Xtrmm<double>(queue, nullptr); Xtrmm<float2>(queue, nullptr); Xtrmm<double2>(queue, nullptr); - - // Runs all the level 3 set-up functions - Xomatcopy<float>(queue, nullptr); Xomatcopy<double>(queue, nullptr); Xomatcopy<float2>(queue, nullptr); Xomatcopy<double2>(queue, nullptr); + FillCacheForPrecision<float, float2>(queue); + FillCacheForPrecision<double, double2>(queue); } catch (...) { return DispatchException(); } return StatusCode::kSuccess; diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index 59e4cd16..e4f2b3ed 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -13,9 +13,9 @@ #include <string> +#include "utilities/utilities.hpp" #include "clblast_c.h" #include "clblast.h" -#include "utilities/utilities.hpp" // Shortcuts to the clblast namespace using float2 = clblast::float2; diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 0383f53a..41af28da 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -333,7 +333,10 @@ class Context { // Regular constructor with memory management explicit Context(const Device &device): - context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) { + context_(new cl_context, [](cl_context* c) { + if (*c) { CheckErrorDtor(clReleaseContext(*c)); } + delete c; + }) { auto status = CL_SUCCESS; const cl_device_id dev = device(); *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status); @@ -355,33 +358,37 @@ using ContextPointer = cl_context*; // Enumeration of build statuses of the run-time compilation process enum class BuildStatus { kSuccess, kError, kInvalid }; -// C++11 version of 'cl_program'. Additionally holds the program's source code. +// C++11 version of 'cl_program'. class Program { public: - // Note that there is no constructor based on the regular OpenCL data-type because of extra state + Program() = default; // Source-based constructor with memory management - explicit Program(const Context &context, std::string source): - program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }), - length_(source.length()), - source_(std::move(source)), - source_ptr_(&source_[0]) { + explicit Program(const Context &context, const std::string &source): + program_(new cl_program, [](cl_program* p) { + if (*p) { CheckErrorDtor(clReleaseProgram(*p)); } + delete p; + }) { + const char *source_ptr = &source[0]; + size_t length = source.length(); auto status = CL_SUCCESS; - *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status); + *program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status); CLError::Check(status, "clCreateProgramWithSource"); } // Binary-based constructor with memory management - explicit Program(const Device &device, const Context &context, const std::string& binary): - program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }), - length_(binary.length()), - source_(binary), - source_ptr_(&source_[0]) { + explicit Program(const Device &device, const Context &context, const std::string &binary): + program_(new cl_program, [](cl_program* p) { + if (*p) { CheckErrorDtor(clReleaseProgram(*p)); } + delete p; + }) { + const char *binary_ptr = &binary[0]; + size_t length = binary.length(); auto status1 = CL_SUCCESS; auto status2 = CL_SUCCESS; const cl_device_id dev = device(); - *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_, - reinterpret_cast<const unsigned char**>(&source_ptr_), + *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length, + reinterpret_cast<const unsigned char**>(&binary_ptr), &status1, &status2); CLError::Check(status1, "clCreateProgramWithBinary (binary status)"); CLError::Check(status2, "clCreateProgramWithBinary"); @@ -421,9 +428,6 @@ class Program { const cl_program& operator()() const { return *program_; } private: std::shared_ptr<cl_program> program_; - size_t length_; - std::string source_; // Note: the source can also be a binary or IR - const char* source_ptr_; }; // ================================================================================================= @@ -440,8 +444,10 @@ class Queue { // Regular constructor with memory management explicit Queue(const Context &context, const Device &device): - queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s)); - delete s; }) { + queue_(new cl_command_queue, [](cl_command_queue* s) { + if (*s) { CheckErrorDtor(clReleaseCommandQueue(*s)); } + delete s; + }) { auto status = CL_SUCCESS; *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); CLError::Check(status, "clCreateCommandQueue"); @@ -665,7 +671,10 @@ class Kernel { // Regular constructor with memory management explicit Kernel(const Program &program, const std::string &name): - kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) { + kernel_(new cl_kernel, [](cl_kernel* k) { + if (*k) { CheckErrorDtor(clReleaseKernel(*k)); } + delete k; + }) { auto status = CL_SUCCESS; *kernel_ = clCreateKernel(program(), name.c_str(), &status); CLError::Check(status, "clCreateKernel"); diff --git a/src/database/database.cpp b/src/database/database.cpp index c000b0b7..aff6490d 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -67,12 +67,11 @@ const std::unordered_map<std::string, std::string> Database::kVendorNames{ // Constructor, computing device properties and populating the parameter-vector from the database. // This takes an optional overlay database in case of custom tuning or custom kernels. -Database::Database(const Queue &queue, const std::vector<std::string> &kernels, +Database::Database(const Device &device, const std::vector<std::string> &kernels, const Precision precision, const std::vector<const DatabaseEntry*> &overlay): - parameters_{} { + parameters_(std::make_shared<Parameters>()) { // Finds information of the current device - auto device = queue.GetDevice(); auto device_type = device.Type(); auto device_vendor = device.Vendor(); auto device_name = device.Name(); @@ -91,7 +90,7 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels, for (auto &db: { database, overlay}) { search_result = Search(kernel, device_type, device_vendor, device_name, precision, db); if (search_result) { - parameters_.insert(search_result->begin(), search_result->end()); + parameters_->insert(search_result->begin(), search_result->end()); break; } } @@ -105,7 +104,7 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels, // Returns a list of OpenCL pre-processor defines in string form std::string Database::GetDefines() const { std::string defines{}; - for (auto ¶meter: parameters_) { + for (auto ¶meter: *parameters_) { defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n"; } return defines; diff --git a/src/database/database.hpp b/src/database/database.hpp index 7c05a20b..87c12293 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -72,12 +72,14 @@ class Database { // The database consists of separate database entries, stored together in a vector static const std::vector<const DatabaseEntry*> database; + Database() = default; + // The constructor with a user-provided database overlay (potentially an empty vector) - explicit Database(const Queue &queue, const std::vector<std::string> &routines, + explicit Database(const Device &device, const std::vector<std::string> &routines, const Precision precision, const std::vector<const DatabaseEntry*> &overlay); // Accessor of values by key - size_t operator[](const std::string key) const { return parameters_.find(key)->second; } + size_t operator[](const std::string key) const { return parameters_->find(key)->second; } // Obtain a list of OpenCL pre-processor defines based on the parameters std::string GetDefines() const; @@ -90,7 +92,7 @@ class Database { const std::vector<const DatabaseEntry*> &db) const; // Found parameters suitable for this device/kernel - Parameters parameters_; + std::shared_ptr<Parameters> parameters_; }; // ================================================================================================= diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index 1bc63691..f0431933 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -44,6 +44,7 @@ const Database::DatabaseEntry CopySingle = { { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Tonga", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, + { "Turks", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, } }, @@ -55,10 +56,12 @@ const Database::DatabaseEntry CopySingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } }, } }, { // Intel GPUs @@ -83,6 +86,7 @@ const Database::DatabaseEntry CopySingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, @@ -117,13 +121,16 @@ const Database::DatabaseEntry CopyComplexSingle = { { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Turks", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } @@ -150,6 +157,7 @@ const Database::DatabaseEntry CopyComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 1070", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 1080", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, @@ -164,7 +172,7 @@ const Database::DatabaseEntry CopyComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, } }, } @@ -193,10 +201,12 @@ const Database::DatabaseEntry CopyDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -209,6 +219,7 @@ const Database::DatabaseEntry CopyDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, @@ -225,7 +236,7 @@ const Database::DatabaseEntry CopyDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, } }, } @@ -254,10 +265,12 @@ const Database::DatabaseEntry CopyComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -270,6 +283,7 @@ const Database::DatabaseEntry CopyComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",4} } }, + { "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index 310d3a78..3378709c 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -44,7 +44,8 @@ const Database::DatabaseEntry PadSingle = { { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "Turks", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -55,8 +56,10 @@ const Database::DatabaseEntry PadSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, } @@ -83,6 +86,7 @@ const Database::DatabaseEntry PadSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 1070", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1080", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, { "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, @@ -94,7 +98,7 @@ const Database::DatabaseEntry PadSingle = { { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, } }, { // Default @@ -117,7 +121,8 @@ const Database::DatabaseEntry PadComplexSingle = { { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Turks", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // ARM GPUs @@ -128,10 +133,12 @@ const Database::DatabaseEntry PadComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, } }, { // Intel GPUs @@ -156,6 +163,7 @@ const Database::DatabaseEntry PadComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1080", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, @@ -172,7 +180,7 @@ const Database::DatabaseEntry PadComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, } @@ -201,8 +209,10 @@ const Database::DatabaseEntry PadDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } @@ -217,6 +227,7 @@ const Database::DatabaseEntry PadDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1080", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, @@ -262,10 +273,12 @@ const Database::DatabaseEntry PadComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -278,6 +291,7 @@ const Database::DatabaseEntry PadComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, + { "GeForce GTX 1080", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index 8ef09e85..212723c7 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -44,6 +44,7 @@ const Database::DatabaseEntry PadtransposeSingle = { { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Turks", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, } }, @@ -55,8 +56,10 @@ const Database::DatabaseEntry PadtransposeSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, } @@ -83,6 +86,7 @@ const Database::DatabaseEntry PadtransposeSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 1070", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1080", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, @@ -117,6 +121,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = { { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Turks", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, @@ -128,10 +133,12 @@ const Database::DatabaseEntry PadtransposeComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, } }, { // Intel GPUs @@ -156,6 +163,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1080", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, @@ -201,8 +209,10 @@ const Database::DatabaseEntry PadtransposeDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } @@ -217,6 +227,7 @@ const Database::DatabaseEntry PadtransposeDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1080", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, @@ -262,10 +273,12 @@ const Database::DatabaseEntry PadtransposeComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // Intel accelerators @@ -278,6 +291,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1080", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index 23fecb49..f33f2a04 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -44,7 +44,8 @@ const Database::DatabaseEntry TransposeSingle = { { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Turks", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, { // ARM GPUs @@ -55,8 +56,10 @@ const Database::DatabaseEntry TransposeSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } @@ -83,6 +86,7 @@ const Database::DatabaseEntry TransposeSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "GeForce GTX 1080", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, @@ -117,6 +121,7 @@ const Database::DatabaseEntry TransposeComplexSingle = { { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tonga", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Turks", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, @@ -128,8 +133,10 @@ const Database::DatabaseEntry TransposeComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } @@ -150,6 +157,7 @@ const Database::DatabaseEntry TransposeComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 1070", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 1080", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, @@ -166,7 +174,7 @@ const Database::DatabaseEntry TransposeComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, } @@ -195,10 +203,12 @@ const Database::DatabaseEntry TransposeDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } }, { // Intel accelerators @@ -211,6 +221,7 @@ const Database::DatabaseEntry TransposeDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 1080", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, @@ -256,16 +267,19 @@ const Database::DatabaseEntry TransposeComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 1080", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 52845e96..e4e3c621 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -44,6 +44,7 @@ const Database::DatabaseEntry XaxpySingle = { { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, + { "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, { "default", { {"VW",2}, {"WGS",64}, {"WPT",2} } }, } }, @@ -55,10 +56,12 @@ const Database::DatabaseEntry XaxpySingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, } }, { // Intel GPUs @@ -83,6 +86,7 @@ const Database::DatabaseEntry XaxpySingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, + { "GeForce GTX 1080", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 480", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, @@ -94,7 +98,7 @@ const Database::DatabaseEntry XaxpySingle = { { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, } }, { // Default @@ -117,6 +121,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = { { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, + { "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, @@ -128,8 +133,10 @@ const Database::DatabaseEntry XaxpyComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, { "default", { {"VW",8}, {"WGS",1024}, {"WPT",1} } }, } @@ -156,6 +163,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, + { "GeForce GTX 1080", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, @@ -201,10 +209,12 @@ const Database::DatabaseEntry XaxpyDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",8}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } }, - { "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, } }, { // Intel accelerators @@ -217,6 +227,7 @@ const Database::DatabaseEntry XaxpyDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, + { "GeForce GTX 1080", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -262,8 +273,10 @@ const Database::DatabaseEntry XaxpyComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",8}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "default", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, } @@ -278,6 +291,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, + { "GeForce GTX 1080", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -294,7 +308,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, } }, } diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index 8b07c539..30d98e5d 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -43,13 +43,16 @@ const Database::DatabaseEntry XdotSingle = { { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, { "Tonga", { {"WGS1",64}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",32} } }, + { "Turks", { {"WGS1",128}, {"WGS2",64} } }, + { "default", { {"WGS1",128}, {"WGS2",64} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",32} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",1024}, {"WGS2",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, { // Intel GPUs @@ -67,6 +70,7 @@ const Database::DatabaseEntry XdotSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",1024} } }, + { "GeForce GTX 1080", { {"WGS1",512}, {"WGS2",64} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, { "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } }, @@ -76,12 +80,12 @@ const Database::DatabaseEntry XdotSingle = { { "GeForce GTX TITAN Black", { {"WGS1",512}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",256}, {"WGS2",256} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, } @@ -98,13 +102,16 @@ const Database::DatabaseEntry XdotComplexSingle = { { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, { "Tonga", { {"WGS1",256}, {"WGS2",64} } }, - { "default", { {"WGS1",256}, {"WGS2",64} } }, + { "Turks", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",64} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",1024}, {"WGS2",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, { // Intel GPUs @@ -122,6 +129,7 @@ const Database::DatabaseEntry XdotComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, { "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, @@ -136,7 +144,7 @@ const Database::DatabaseEntry XdotComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",256}, {"WGS2",64} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, } @@ -158,14 +166,17 @@ const Database::DatabaseEntry XdotDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WGS2",128} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } }, - { "default", { {"WGS1",512}, {"WGS2",64} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",512} } }, + { "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",128} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, { "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, @@ -175,7 +186,7 @@ const Database::DatabaseEntry XdotDouble = { { "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",64} } }, + { "default", { {"WGS1",128}, {"WGS2",128} } }, } }, { // Default @@ -202,14 +213,17 @@ const Database::DatabaseEntry XdotComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",128} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",1024}, {"WGS2",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",64} } }, + { "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, { "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } }, { "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } }, @@ -224,7 +238,7 @@ const Database::DatabaseEntry XdotComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",256}, {"WGS2",64} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, } diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index 66ac8a9f..d9414f8b 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -43,7 +43,8 @@ const Database::DatabaseEntry XgemmSingle = { { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, + { "Turks", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, } }, { // ARM GPUs @@ -54,10 +55,12 @@ const Database::DatabaseEntry XgemmSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, } }, { // Intel GPUs @@ -82,6 +85,7 @@ const Database::DatabaseEntry XgemmSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, + { "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, @@ -93,7 +97,7 @@ const Database::DatabaseEntry XgemmSingle = { { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, } }, { // Default @@ -116,7 +120,8 @@ const Database::DatabaseEntry XgemmComplexSingle = { { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, + { "Turks", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, } }, { // ARM GPUs @@ -127,10 +132,12 @@ const Database::DatabaseEntry XgemmComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, } }, { // Intel GPUs @@ -155,6 +162,7 @@ const Database::DatabaseEntry XgemmComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, + { "GeForce GTX 1080", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, @@ -171,7 +179,7 @@ const Database::DatabaseEntry XgemmComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, } @@ -200,10 +208,12 @@ const Database::DatabaseEntry XgemmDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, - { "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, } }, { // Intel accelerators @@ -216,6 +226,7 @@ const Database::DatabaseEntry XgemmDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, @@ -227,12 +238,12 @@ const Database::DatabaseEntry XgemmDouble = { { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, } }, } @@ -261,10 +272,12 @@ const Database::DatabaseEntry XgemmComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, } }, { // Intel accelerators @@ -277,6 +290,7 @@ const Database::DatabaseEntry XgemmComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 1070", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, + { "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -287,12 +301,12 @@ const Database::DatabaseEntry XgemmComplexDouble = { { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, } }, } diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp index 4413cf1b..c0cd2c04 100644 --- a/src/database/kernels/xgemm_direct.hpp +++ b/src/database/kernels/xgemm_direct.hpp @@ -39,9 +39,17 @@ const Database::DatabaseEntry XgemmDirectSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, { "Tonga", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + { "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",8}, {"WGD",64} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",64} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } }, @@ -51,6 +59,7 @@ const Database::DatabaseEntry XgemmDirectSingle = { }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, @@ -72,9 +81,17 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, } }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, @@ -84,6 +101,7 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = { }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"KWID",8}, {"MDIMAD",8}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, { "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } }, { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, @@ -91,7 +109,7 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } }, } }, } @@ -108,16 +126,24 @@ const Database::DatabaseEntry XgemmDirectDouble = { { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + } + }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, { "GeForce GTX TITAN Black", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, - { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, } }, } @@ -134,8 +160,16 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = { { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } }, + } + }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } }, { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, @@ -143,7 +177,7 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, } diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 5f25f210..52b17d94 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -44,13 +44,16 @@ const Database::DatabaseEntry XgemvSingle = { { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1} } }, { "Tonga", { {"WGS1",128}, {"WPT1",2} } }, + { "Turks", { {"WGS1",32}, {"WPT1",1} } }, { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WPT1",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } }, { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, @@ -76,6 +79,7 @@ const Database::DatabaseEntry XgemvSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX 1070", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } }, @@ -110,14 +114,17 @@ const Database::DatabaseEntry XgemvComplexSingle = { { "Pitcairn", { {"WGS1",64}, {"WPT1",1} } }, { "Tahiti", { {"WGS1",64}, {"WPT1",1} } }, { "Tonga", { {"WGS1",32}, {"WPT1",1} } }, + { "Turks", { {"WGS1",64}, {"WPT1",1} } }, { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WPT1",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",2} } }, } }, { // Intel GPUs @@ -142,6 +149,7 @@ const Database::DatabaseEntry XgemvComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } }, @@ -177,8 +185,10 @@ const Database::DatabaseEntry XgemvDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WPT1",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } }, { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, @@ -192,6 +202,7 @@ const Database::DatabaseEntry XgemvDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WPT1",1} } }, { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } }, @@ -208,7 +219,7 @@ const Database::DatabaseEntry XgemvDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",128}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } @@ -231,8 +242,10 @@ const Database::DatabaseEntry XgemvComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WPT1",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",32}, {"WPT1",4} } }, { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index 994a220c..2dd400bc 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -44,14 +44,17 @@ const Database::DatabaseEntry XgemvFastSingle = { { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Tonga", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } }, + { "Turks", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, - { "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",2}, {"WGS2",16}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, } }, { // Intel GPUs @@ -76,6 +79,7 @@ const Database::DatabaseEntry XgemvFastSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, { "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 1080", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "GeForce GTX 480", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, { "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, @@ -110,13 +114,16 @@ const Database::DatabaseEntry XgemvFastComplexSingle = { { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } }, + { "Turks", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",4}, {"WGS2",16}, {"WPT2",4} } }, { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } }, } }, @@ -173,8 +180,10 @@ const Database::DatabaseEntry XgemvFastDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } }, { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, } }, @@ -188,6 +197,7 @@ const Database::DatabaseEntry XgemvFastDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 1080", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } }, { "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, @@ -227,9 +237,11 @@ const Database::DatabaseEntry XgemvFastComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",4}, {"WGS2",32}, {"WPT2",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, - { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",1}, {"WGS2",16}, {"WPT2",2} } }, + { "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, } }, { // Intel accelerators diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index da8bcfeb..36a435b5 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -39,13 +39,16 @@ const Database::DatabaseEntry XgemvFastRotSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, { "Tonga", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } }, + { "Turks", { {"VW3",8}, {"WGS3",128}, {"WPT3",16} } }, { "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, - { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, } }, { // Intel GPUs @@ -60,6 +63,7 @@ const Database::DatabaseEntry XgemvFastRotSingle = { }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, { "GeForce GTX TITAN Black", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, @@ -68,7 +72,7 @@ const Database::DatabaseEntry XgemvFastRotSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, } @@ -82,13 +86,16 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, { "Tonga", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, - { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "Turks", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, - { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, } }, { // Intel GPUs @@ -122,12 +129,15 @@ const Database::DatabaseEntry XgemvFastRotDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, - { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, { "GeForce GTX TITAN Black", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, @@ -155,13 +165,15 @@ const Database::DatabaseEntry XgemvFastRotComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, - { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } }, } diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index 5e2be6a9..f99b7632 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -44,6 +44,7 @@ const Database::DatabaseEntry XgerSingle = { { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, { "Tonga", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, + { "Turks", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, { "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } }, } }, @@ -55,7 +56,9 @@ const Database::DatabaseEntry XgerSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",4}, {"WPT",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",4}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, { "default", { {"WGS1",128}, {"WGS2",8}, {"WPT",4} } }, } @@ -75,6 +78,7 @@ const Database::DatabaseEntry XgerSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "GeForce GTX 1070", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } }, + { "GeForce GTX 1080", { {"WGS1",16}, {"WGS2",4}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, { "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, @@ -105,7 +109,8 @@ const Database::DatabaseEntry XgerComplexSingle = { { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, { "Tonga", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "Turks", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, } }, { // ARM GPUs @@ -116,9 +121,11 @@ const Database::DatabaseEntry XgerComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",2}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, + { "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",4} } }, } }, { // Intel GPUs @@ -136,6 +143,7 @@ const Database::DatabaseEntry XgerComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX 1070", { {"WGS1",16}, {"WGS2",64}, {"WPT",2} } }, + { "GeForce GTX 1080", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } }, { "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, @@ -143,7 +151,7 @@ const Database::DatabaseEntry XgerComplexSingle = { { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, - { "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } }, + { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, } }, { // Default @@ -177,15 +185,18 @@ const Database::DatabaseEntry XgerDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",4}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX 1070", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } }, + { "GeForce GTX 1080", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, @@ -227,7 +238,9 @@ const Database::DatabaseEntry XgerComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",4}, {"WPT",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",512}, {"WGS2",2}, {"WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, } @@ -236,6 +249,7 @@ const Database::DatabaseEntry XgerComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX 1070", { {"WGS1",8}, {"WGS2",128}, {"WPT",1} } }, + { "GeForce GTX 1080", { {"WGS1",8}, {"WGS2",4}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, { "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index c052e94f..0ce4f367 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -31,9 +31,7 @@ R"( // Enable support for double-precision #if PRECISION == 64 || PRECISION == 6464 - #if __OPENCL_VERSION__ <= CL_VERSION_1_1 - #pragma OPENCL EXTENSION cl_khr_fp64: enable - #endif + #pragma OPENCL EXTENSION cl_khr_fp64: enable #endif // Half-precision diff --git a/src/routine.cpp b/src/routine.cpp index acafb0d2..4fe04a60 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -32,11 +32,34 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name, event_(event), context_(queue_.GetContext()), device_(queue_.GetDevice()), - device_name_(device_.Name()), - db_(queue_, routines, precision_, userDatabase) { + device_name_(device_.Name()) { + + InitDatabase(routines, userDatabase); + InitProgram(source); +} + +void Routine::InitDatabase(const std::vector<std::string> &routines, + const std::vector<const Database::DatabaseEntry*> &userDatabase) { + + // Queries the cache to see whether or not the kernel parameter database is already there + bool has_db; + db_ = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision_, device_name_, routines }, + &has_db); + if (has_db) { return; } + + // Builds the parameter database for this device and routine set and stores it in the cache + db_ = Database(device_, routines, precision_, userDatabase); + DatabaseCache::Instance().Store(DatabaseKey{ precision_, device_name_, routines }, + Database{ db_ }); +} + +void Routine::InitProgram(std::initializer_list<const char *> source) { // Queries the cache to see whether or not the program (context-specific) is already there - if (ProgramIsInCache(context_, precision_, routine_name_)) { return; } + bool has_program; + program_ = ProgramCache::Instance().Get(ProgramKeyRef{ context_(), precision_, routine_name_ }, + &has_program); + if (has_program) { return; } // Sets the build options from an environmental variable (if set) auto options = std::vector<std::string>(); @@ -47,29 +70,29 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name, // Queries the cache to see whether or not the binary (device-specific) is already there. If it // is, a program is created and stored in the cache - if (BinaryIsInCache(device_name_, precision_, routine_name_)) { - auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_); - auto program = Program(device_, context_, binary); - program.Build(device_, options); - StoreProgramToCache(program, context_, precision_, routine_name_); + bool has_binary; + auto binary = BinaryCache::Instance().Get(BinaryKeyRef{ precision_, routine_name_, device_name_ }, + &has_binary); + if (has_binary) { + program_ = Program(device_, context_, binary); + program_.Build(device_, options); + ProgramCache::Instance().Store(ProgramKey{ context_(), precision_, routine_name_ }, + Program{ program_ }); + return; } // Otherwise, the kernel will be compiled and program will be built. Both the binary and the // program will be added to the cache. // Inspects whether or not cl_khr_fp64 is supported in case of double precision - const auto extensions = device_.Capabilities(); - if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) { - if (extensions.find(kKhronosDoublePrecision) == std::string::npos) { - throw RuntimeErrorCode(StatusCode::kNoDoublePrecision); - } + if ((precision_ == Precision::kDouble && !PrecisionSupported<double>(device_)) || + (precision_ == Precision::kComplexDouble && !PrecisionSupported<double2>(device_))) { + throw RuntimeErrorCode(StatusCode::kNoDoublePrecision); } // As above, but for cl_khr_fp16 (half precision) - if (precision_ == Precision::kHalf) { - if (extensions.find(kKhronosHalfPrecision) == std::string::npos) { - throw RuntimeErrorCode(StatusCode::kNoHalfPrecision); - } + if (precision_ == Precision::kHalf && !PrecisionSupported<half>(device_)) { + throw RuntimeErrorCode(StatusCode::kNoHalfPrecision); } // Collects the parameters for this device in the form of defines, and adds the precision @@ -114,21 +137,23 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name, #endif // Compiles the kernel - auto program = Program(context_, source_string); + program_ = Program(context_, source_string); try { - program.Build(device_, options); + program_.Build(device_, options); } catch (const CLError &e) { if (e.status() == CL_BUILD_PROGRAM_FAILURE) { fprintf(stdout, "OpenCL compiler error/warning: %s\n", - program.GetBuildInfo(device_).c_str()); + program_.GetBuildInfo(device_).c_str()); } throw; } // Store the compiled binary and program in the cache - const auto binary = program.GetIR(); - StoreBinaryToCache(binary, device_name_, precision_, routine_name_); - StoreProgramToCache(program, context_, precision_, routine_name_); + BinaryCache::Instance().Store(BinaryKey{ precision_, routine_name_, device_name_ }, + program_.GetIR()); + + ProgramCache::Instance().Store(ProgramKey{ context_(), precision_, routine_name_ }, + Program{ program_ }); // Prints the elapsed compilation time in case of debugging in verbose mode #ifdef VERBOSE diff --git a/src/routine.hpp b/src/routine.hpp index 2d8b2415..f366e4d9 100644 --- a/src/routine.hpp +++ b/src/routine.hpp @@ -35,11 +35,22 @@ class Routine { // Base class constructor. The user database is an optional extra database to override the // built-in database. // All heavy preparation work is done inside this constructor. + // NOTE: the caller must provide the same userDatabase for each combination of device, precision + // and routine list, otherwise the caching logic will break. explicit Routine(Queue &queue, EventPointer event, const std::string &name, const std::vector<std::string> &routines, const Precision precision, const std::vector<const Database::DatabaseEntry*> &userDatabase, std::initializer_list<const char *> source); + private: + + // Initializes program_, fetching cached program or building one + void InitProgram(std::initializer_list<const char *> source); + + // Initializes db_, fetching cached database or building one + void InitDatabase(const std::vector<std::string> &routines, + const std::vector<const Database::DatabaseEntry*> &userDatabase); + protected: // Non-static variable for the precision @@ -57,8 +68,11 @@ class Routine { // OpenCL device properties const std::string device_name_; + // Compiled program (either retrieved from cache or compiled in slow path) + Program program_; + // Connection to the database for all the device-specific parameters - const Database db_; + Database db_; }; // ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 8046c0be..bdea0086 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -19,8 +19,8 @@ #include <string> #include <vector> -#include "clblast.h" #include "clpp11.hpp" +#include "clblast.h" #include "database/database.hpp" namespace clblast { diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp index e9efa1a7..40a66517 100644 --- a/src/routines/level1/xamax.cpp +++ b/src/routines/level1/xamax.cpp @@ -43,9 +43,8 @@ void Xamax<T>::DoAmax(const size_t n, TestVectorIndex(1, imax_buffer, imax_offset); // Retrieves the Xamax kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xamax"); - auto kernel2 = Kernel(program, "XamaxEpilogue"); + auto kernel1 = Kernel(program_, "Xamax"); + auto kernel2 = Kernel(program_, "XamaxEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp index a242a5fa..b93b271c 100644 --- a/src/routines/level1/xasum.cpp +++ b/src/routines/level1/xasum.cpp @@ -43,9 +43,8 @@ void Xasum<T>::DoAsum(const size_t n, TestVectorScalar(1, asum_buffer, asum_offset); // Retrieves the Xasum kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xasum"); - auto kernel2 = Kernel(program, "XasumEpilogue"); + auto kernel1 = Kernel(program_, "Xasum"); + auto kernel2 = Kernel(program_, "XasumEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 5436c5b7..39f61ef4 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -52,8 +52,7 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; // Retrieves the Xaxpy kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp index d86200c0..62889764 100644 --- a/src/routines/level1/xcopy.cpp +++ b/src/routines/level1/xcopy.cpp @@ -52,8 +52,7 @@ void Xcopy<T>::DoCopy(const size_t n, auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy"; // Retrieves the Xcopy kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp index 9d718913..9f9c0590 100644 --- a/src/routines/level1/xdot.cpp +++ b/src/routines/level1/xdot.cpp @@ -46,9 +46,8 @@ void Xdot<T>::DoDot(const size_t n, TestVectorScalar(1, dot_buffer, dot_offset); // Retrieves the Xdot kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xdot"); - auto kernel2 = Kernel(program, "XdotEpilogue"); + auto kernel1 = Kernel(program_, "Xdot"); + auto kernel2 = Kernel(program_, "XdotEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp index 373820a4..aa341aff 100644 --- a/src/routines/level1/xnrm2.cpp +++ b/src/routines/level1/xnrm2.cpp @@ -43,9 +43,8 @@ void Xnrm2<T>::DoNrm2(const size_t n, TestVectorScalar(1, nrm2_buffer, nrm2_offset); // Retrieves the Xnrm2 kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xnrm2"); - auto kernel2 = Kernel(program, "Xnrm2Epilogue"); + auto kernel1 = Kernel(program_, "Xnrm2"); + auto kernel2 = Kernel(program_, "Xnrm2Epilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp index 0521b1e5..9bc096e5 100644 --- a/src/routines/level1/xscal.cpp +++ b/src/routines/level1/xscal.cpp @@ -49,8 +49,7 @@ void Xscal<T>::DoScal(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal"; // Retrieves the Xscal kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp index c9b97dc9..f046575f 100644 --- a/src/routines/level1/xswap.cpp +++ b/src/routines/level1/xswap.cpp @@ -52,8 +52,7 @@ void Xswap<T>::DoSwap(const size_t n, auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap"; // Retrieves the Xswap kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 52e66de6..7d2e5f60 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -123,8 +123,7 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, } // Retrieves the Xgemv kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m_real)); diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp index d16ebd11..9ec156a1 100644 --- a/src/routines/level2/xger.cpp +++ b/src/routines/level2/xger.cpp @@ -53,8 +53,7 @@ void Xger<T>::DoGer(const Layout layout, TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xger"); + auto kernel = Kernel(program_, "Xger"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(a_one)); diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp index 6c334e63..ba12a3ef 100644 --- a/src/routines/level2/xher.cpp +++ b/src/routines/level2/xher.cpp @@ -67,8 +67,7 @@ void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, const auto matching_alpha = GetAlpha(alpha); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xher"); + auto kernel = Kernel(program_, "Xher"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp index 11e2c871..a420e693 100644 --- a/src/routines/level2/xher2.cpp +++ b/src/routines/level2/xher2.cpp @@ -54,8 +54,7 @@ void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xher2"); + auto kernel = Kernel(program_, "Xher2"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp index b0e4c5ae..d5d009ff 100644 --- a/src/routines/level2/xtrsv.cpp +++ b/src/routines/level2/xtrsv.cpp @@ -37,9 +37,6 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); }; - // Retrieves the program from the cache - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV"); - // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1; const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) || @@ -52,7 +49,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, // Retrieves the kernel from the compiled binary const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward"; - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); @@ -94,9 +91,6 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle, TestMatrixA(n, n, a_buffer, a_offset, a_ld); TestVectorX(n, b_buffer, b_offset, b_inc); - // Retrieves the program from the cache - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV"); - // Creates a copy of B to avoid overwriting input while computing output // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels const auto x_offset = b_offset; @@ -108,7 +102,7 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle, // Fills the output buffer with zeros auto eventWaitList = std::vector<Event>(); auto fill_vector_event = Event(); - FillVector(queue_, device_, program, db_, fill_vector_event.pointer(), eventWaitList, + FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList, n, x_inc, x_offset, x_buffer, ConstantZero<T>()); fill_vector_event.WaitForCompletion(); diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp index dc3f32f0..67e626a1 100644 --- a/src/routines/level2/xtrsv.hpp +++ b/src/routines/level2/xtrsv.hpp @@ -27,11 +27,11 @@ class Xtrsv: public Xgemv<T> { public: // Uses the generic matrix-vector routine - using Xgemv<T>::routine_name_; using Xgemv<T>::queue_; using Xgemv<T>::context_; using Xgemv<T>::device_; using Xgemv<T>::db_; + using Xgemv<T>::program_; using Xgemv<T>::DoGemv; // Constructor diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 0015b629..7bd388c1 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -150,9 +150,6 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled; const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && a_do_transpose == false && a_conjugate == false; @@ -178,7 +175,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, a_one_i, a_two_i, a_one_i, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_do_transpose, a_conjugate); eventWaitList.push_back(eventProcessA); } @@ -189,7 +186,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, b_one_i, b_two_i, b_one_i, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, b_do_transpose, b_conjugate); eventWaitList.push_back(eventProcessB); } @@ -200,13 +197,13 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, c_one_i, c_two_i, c_one_i, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_do_transpose, false); eventWaitList.push_back(eventProcessC); } // Retrieves the Xgemm kernel from the compiled binary - auto kernel = Kernel(program, "Xgemm"); + auto kernel = Kernel(program_, "Xgemm"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m_ceiled)); @@ -236,7 +233,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, c_one_i, c_two_i, c_one_i, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_do_transpose, false); } } @@ -255,13 +252,10 @@ void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate) { - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Retrieves the proper XgemmDirect kernel from the compiled binary const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN"); - auto kernel = Kernel(program, name); + auto kernel = Kernel(program_, name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m)); diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp index e5b1502a..8629f3de 100644 --- a/src/routines/level3/xhemm.cpp +++ b/src/routines/level3/xhemm.cpp @@ -58,8 +58,7 @@ void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the hermitian-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp index 2385706e..7c011915 100644 --- a/src/routines/level3/xhemm.hpp +++ b/src/routines/level3/xhemm.hpp @@ -30,6 +30,7 @@ class Xhemm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index ee3bb8b8..2aed2781 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -81,9 +81,6 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false && ab_conjugate == false; @@ -116,7 +113,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessA1); } @@ -125,7 +122,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessA2); } @@ -134,7 +131,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessB1); } @@ -143,7 +140,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessB2); } @@ -154,12 +151,12 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -201,7 +198,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, true); } diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index ae8e9324..d982859e 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -79,9 +79,6 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false && a_conjugate == false; @@ -109,7 +106,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, a_conjugate); eventWaitList.push_back(eventProcessA); } @@ -118,7 +115,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, b_conjugate); eventWaitList.push_back(eventProcessB); } @@ -129,12 +126,12 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -163,7 +160,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, true); } diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp index d7f771d1..969edfc8 100644 --- a/src/routines/level3/xsymm.cpp +++ b/src/routines/level3/xsymm.cpp @@ -30,12 +30,12 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } @@ -58,8 +58,7 @@ void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the symmetric-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp index ee965364..7a584560 100644 --- a/src/routines/level3/xsymm.hpp +++ b/src/routines/level3/xsymm.hpp @@ -32,6 +32,7 @@ class Xsymm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index cb0e0461..fdef43dc 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -77,9 +77,6 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false; @@ -103,7 +100,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -112,7 +109,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessB); } @@ -123,12 +120,12 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -168,7 +165,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, false); } diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index bd6c4b25..9588c28c 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -74,9 +74,6 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false; @@ -97,7 +94,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -108,12 +105,12 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -142,7 +139,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, false); } diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp index ed810e72..02c295ac 100644 --- a/src/routines/level3/xtrmm.cpp +++ b/src/routines/level3/xtrmm.cpp @@ -70,8 +70,7 @@ void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the triangular-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp index 967bf132..e77b7214 100644 --- a/src/routines/level3/xtrmm.hpp +++ b/src/routines/level3/xtrmm.hpp @@ -31,6 +31,7 @@ class Xtrmm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp index 8061b508..3a910261 100644 --- a/src/routines/level3/xtrsm.cpp +++ b/src/routines/level3/xtrsm.cpp @@ -79,9 +79,8 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian // Fills the output buffer with zeros auto eventWaitList = std::vector<Event>(); - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSM"); auto fill_matrix_event = Event(); - FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), eventWaitList, + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList, x_one, x_ld, x_offset, x_buffer, ConstantZero<T>()); fill_matrix_event.WaitForCompletion(); diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp index 288e9d11..b9d5432a 100644 --- a/src/routines/level3/xtrsm.hpp +++ b/src/routines/level3/xtrsm.hpp @@ -26,11 +26,11 @@ class Xtrsm: public Xgemm<T> { public: // Uses methods and variables the Xgemm routine - using Xgemm<T>::routine_name_; using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; using Xgemm<T>::db_; + using Xgemm<T>::program_; using Xgemm<T>::DoGemm; // Constructor diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp index ffee9b7c..696e694a 100644 --- a/src/routines/levelx/xinvert.cpp +++ b/src/routines/levelx/xinvert.cpp @@ -69,18 +69,15 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle (triangle == Triangle::kLower && layout == Layout::kRowMajor)); const auto name_postfix = (is_upper) ? "Upper" : "Lower"; - // Retrieves the program from the cache - auto event_wait_list = std::vector<Event>(); - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "INVERT"); - // Fills the output buffer with zeros + auto event_wait_list = std::vector<Event>(); auto fill_matrix_event = Event(); - FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), event_wait_list, + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>()); event_wait_list.push_back(fill_matrix_event); // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group - auto kernel = Kernel(program, "InvertDiagonalBlock"); + auto kernel = Kernel(program_, "InvertDiagonalBlock"); kernel.SetArgument(0, static_cast<int>(n)); kernel.SetArgument(1, src()); kernel.SetArgument(2, static_cast<int>(offset)); @@ -110,7 +107,7 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]}; // Part 1 - auto kernel1 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); + auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); kernel1.SetArgument(0, static_cast<int>(n)); kernel1.SetArgument(1, src()); kernel1.SetArgument(2, static_cast<int>(offset)); @@ -125,7 +122,7 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle // Part 2 const bool is_last_kernel = (current_size * 2 >= block_size); - auto kernel2 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); + auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); kernel2.SetArgument(0, static_cast<int>(n)); kernel2.SetArgument(1, dest()); kernel2.SetArgument(2, static_cast<int>(current_size)); diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp index 875ca7d2..4ae8c056 100644 --- a/src/routines/levelx/xomatcopy.cpp +++ b/src/routines/levelx/xomatcopy.cpp @@ -65,14 +65,11 @@ void Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose, TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto emptyEventList = std::vector<Event>(); PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, b_one, b_two, b_ld, b_offset, b_buffer, - alpha, program, false, transpose, conjugate); + alpha, program_, false, transpose, conjugate); } // ================================================================================================= diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp index f3c7b9a3..0d0033b6 100644 --- a/src/utilities/clblast_exceptions.hpp +++ b/src/utilities/clblast_exceptions.hpp @@ -16,8 +16,8 @@ #ifndef CLBLAST_EXCEPTIONS_H_ #define CLBLAST_EXCEPTIONS_H_ -#include "clblast.h" #include "clpp11.hpp" +#include "clblast.h" namespace clblast { // ================================================================================================= diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index 3e408bb7..757f1b5e 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -21,9 +21,9 @@ #include <functional> #include <complex> +#include "clpp11.hpp" #include "clblast.h" #include "clblast_half.h" -#include "clpp11.hpp" #include "utilities/clblast_exceptions.hpp" #include "utilities/msvc.hpp" |