summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.travis.yml16
-rw-r--r--CHANGELOG6
-rw-r--r--CMakeLists.txt2
-rw-r--r--README.md4
-rw-r--r--samples/cache.c2
-rw-r--r--samples/dgemv.c2
-rw-r--r--samples/haxpy.c2
-rw-r--r--samples/sasum.c2
-rw-r--r--samples/sgemm.c2
-rw-r--r--samples/sgemm.cpp5
-rw-r--r--scripts/database/database/clblast.py2
-rwxr-xr-xscripts/generator/generator.py2
-rw-r--r--src/cache.cpp142
-rw-r--r--src/cache.hpp126
-rw-r--r--src/clblast.cpp124
-rw-r--r--src/clblast_c.cpp2
-rw-r--r--src/clpp11.hpp53
-rw-r--r--src/database/database.cpp9
-rw-r--r--src/database/database.hpp8
-rw-r--r--src/database/kernels/copy.hpp26
-rw-r--r--src/database/kernels/pad.hpp26
-rw-r--r--src/database/kernels/padtranspose.hpp18
-rw-r--r--src/database/kernels/transpose.hpp22
-rw-r--r--src/database/kernels/xaxpy.hpp22
-rw-r--r--src/database/kernels/xdot.hpp36
-rw-r--r--src/database/kernels/xgemm.hpp38
-rw-r--r--src/database/kernels/xgemm_direct.hpp42
-rw-r--r--src/database/kernels/xgemv.hpp17
-rw-r--r--src/database/kernels/xgemv_fast.hpp16
-rw-r--r--src/database/kernels/xgemv_fast_rot.hpp26
-rw-r--r--src/database/kernels/xger.hpp22
-rw-r--r--src/kernels/common.opencl4
-rw-r--r--src/routine.cpp71
-rw-r--r--src/routine.hpp16
-rw-r--r--src/routines/common.hpp2
-rw-r--r--src/routines/level1/xamax.cpp5
-rw-r--r--src/routines/level1/xasum.cpp5
-rw-r--r--src/routines/level1/xaxpy.cpp3
-rw-r--r--src/routines/level1/xcopy.cpp3
-rw-r--r--src/routines/level1/xdot.cpp5
-rw-r--r--src/routines/level1/xnrm2.cpp5
-rw-r--r--src/routines/level1/xscal.cpp3
-rw-r--r--src/routines/level1/xswap.cpp3
-rw-r--r--src/routines/level2/xgemv.cpp3
-rw-r--r--src/routines/level2/xger.cpp3
-rw-r--r--src/routines/level2/xher.cpp3
-rw-r--r--src/routines/level2/xher2.cpp3
-rw-r--r--src/routines/level2/xtrsv.cpp10
-rw-r--r--src/routines/level2/xtrsv.hpp2
-rw-r--r--src/routines/level3/xgemm.cpp18
-rw-r--r--src/routines/level3/xhemm.cpp3
-rw-r--r--src/routines/level3/xhemm.hpp1
-rw-r--r--src/routines/level3/xher2k.cpp17
-rw-r--r--src/routines/level3/xherk.cpp13
-rw-r--r--src/routines/level3/xsymm.cpp15
-rw-r--r--src/routines/level3/xsymm.hpp1
-rw-r--r--src/routines/level3/xsyr2k.cpp13
-rw-r--r--src/routines/level3/xsyrk.cpp11
-rw-r--r--src/routines/level3/xtrmm.cpp3
-rw-r--r--src/routines/level3/xtrmm.hpp1
-rw-r--r--src/routines/level3/xtrsm.cpp3
-rw-r--r--src/routines/level3/xtrsm.hpp2
-rw-r--r--src/routines/levelx/xinvert.cpp13
-rw-r--r--src/routines/levelx/xomatcopy.cpp5
-rw-r--r--src/utilities/clblast_exceptions.hpp2
-rw-r--r--src/utilities/utilities.hpp2
-rw-r--r--test/correctness/tester.cpp34
-rw-r--r--test/correctness/tester.hpp4
-rw-r--r--test/performance/client.hpp4
69 files changed, 681 insertions, 455 deletions
diff --git a/.travis.yml b/.travis.yml
index 0465afa4..6a47bbd7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,14 +2,6 @@ language: cpp
sudo: required
dist: trusty
-os:
- - linux
- - osx
-
-compiler:
- - gcc
- - clang
-
addons:
apt:
sources:
@@ -19,6 +11,14 @@ addons:
- cmake
- ocl-icd-opencl-dev
+matrix:
+ include:
+ - os: linux
+ compiler: gcc
+ - os: linux
+ compiler: clang
+ - os: osx
+
env:
global:
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
diff --git a/CHANGELOG b/CHANGELOG
index 089e3fd8..20f17807 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,12 @@
Development version (next release)
+- Improved the internal program source and binary caches for scalability and speed (thanks to 'intelfx')
+- Fixed a bug having to re-create the binary even if it was in the cache
- Fixed a bug when using offsets in the direct version of the GEMM kernels
+- Fixed a missing cl_khr_fp64 when running double-precision on Intel CPUs
+- Tests now also exit with an error code when OpenCL errors or compilation errors occur
+- Various minor fixes and enhancements
+- Added tuned parameters for various devices (see README)
Version 0.10.0
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 41982b21..1c35a4c5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -241,7 +241,7 @@ endif()
if(SAMPLES)
# Downloads the cl.hpp file from Khronos
- file(DOWNLOAD https://www.khronos.org/registry/cl/api/1.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
+ file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
# Adds sample programs (C++)
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
diff --git a/README.md b/README.md
index d550122f..35e79db8 100644
--- a/README.md
+++ b/README.md
@@ -115,6 +115,7 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- GeForce GTX 750 Ti
- GeForce GTX 980
- GeForce GTX 1070
+ - GeForce GTX 1080
- GeForce GTX TITAN
- GeForce GTX TITAN Black
- GeForce GTX TITAN X
@@ -127,6 +128,7 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Pitcairn
- Tahiti
- Tonga
+ - Turks
* Intel GPUs:
- HD Graphics 530
- HD Graphics 5500 BroadWell U-Processor GT2
@@ -137,7 +139,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Iris Pro
* Intel CPUs:
- Core i5-6200U
+ - Core i7-2670QM
- Core i7-3770K
+ - Core i7-4790K
- Core i7-5930K
* Other devices:
- ARM Mali-T628 GPU
diff --git a/samples/cache.c b/samples/cache.c
index 40f2163f..980c7cf3 100644
--- a/samples/cache.c
+++ b/samples/cache.c
@@ -20,6 +20,8 @@
#include <string.h>
#include <time.h>
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
diff --git a/samples/dgemv.c b/samples/dgemv.c
index dc2fe7db..975cb7ac 100644
--- a/samples/dgemv.c
+++ b/samples/dgemv.c
@@ -19,6 +19,8 @@
#include <stdio.h>
#include <string.h>
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
diff --git a/samples/haxpy.c b/samples/haxpy.c
index 8e0833f8..4f2bb400 100644
--- a/samples/haxpy.c
+++ b/samples/haxpy.c
@@ -18,6 +18,8 @@
#include <stdio.h>
#include <string.h>
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
diff --git a/samples/sasum.c b/samples/sasum.c
index c285dd14..78377336 100644
--- a/samples/sasum.c
+++ b/samples/sasum.c
@@ -19,6 +19,8 @@
#include <stdio.h>
#include <string.h>
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
diff --git a/samples/sgemm.c b/samples/sgemm.c
index 132dad81..92f3057d 100644
--- a/samples/sgemm.c
+++ b/samples/sgemm.c
@@ -19,6 +19,8 @@
#include <stdio.h>
#include <string.h>
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp
index 401ecff8..b960865b 100644
--- a/samples/sgemm.cpp
+++ b/samples/sgemm.cpp
@@ -20,6 +20,9 @@
#include <chrono>
#include <vector>
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
// Includes the C++ OpenCL API. If not yet available, it can be found here:
// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
#include "cl.hpp"
@@ -103,7 +106,7 @@ int main() {
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
// Example completed. See "clblast.h" for status codes (0 -> success).
- printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, status);
+ printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
return 0;
}
diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py
index d89b6350..8af3ab5b 100644
--- a/scripts/database/database/clblast.py
+++ b/scripts/database/database/clblast.py
@@ -123,7 +123,7 @@ def print_cpp_database(database, output_dir):
devices = sorted(set([s["device"] for s in type_database]))
for device_name in devices:
device_database = [s for s in type_database if s["device"] == device_name]
- device_name_quoted = "\"%s\"," % device_name
+ device_name_quoted = "\"%s\"," % device_name.strip()
device_name_cpp = " { %-50s { " % device_name_quoted
f.write(device_name_cpp)
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 1bd0b58e..8624938c 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -42,7 +42,7 @@ FILES = [
"/src/clblast_netlib_c.cpp",
]
HEADER_LINES = [117, 75, 118, 22, 29, 41, 65, 32]
-FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2]
+FOOTER_LINES = [17, 95, 19, 18, 6, 6, 9, 2]
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
diff --git a/src/cache.cpp b/src/cache.cpp
index a2e51792..c5cc6a4d 100644
--- a/src/cache.cpp
+++ b/src/cache.cpp
@@ -15,108 +15,84 @@
#include <vector>
#include <mutex>
+#include "database/database.hpp"
#include "cache.hpp"
namespace clblast {
// =================================================================================================
-// Stores the compiled binary or IR in the cache
-void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
- const Precision &precision, const std::string &routine_name) {
- #ifdef VERBOSE
- printf("[DEBUG] Storing binary '%s' in cache\n", routine_name.c_str());
- #endif
- binary_cache_mutex_.lock();
- binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name});
- binary_cache_mutex_.unlock();
-}
-
-// Stores the compiled program in the cache
-void StoreProgramToCache(const Program &program, const Context &context,
- const Precision &precision, const std::string &routine_name) {
- #ifdef VERBOSE
- printf("[DEBUG] Storing program '%s' in cache\n", routine_name.c_str());
- #endif
- program_cache_mutex_.lock();
- program_cache_.push_back(ProgramCache{program, context(), precision, routine_name});
- program_cache_mutex_.unlock();
-}
+template <typename Key, typename Value>
+template <typename U>
+Value Cache<Key, Value>::Get(const U &key, bool *in_cache) const {
+ std::lock_guard<std::mutex> lock(cache_mutex_);
-// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws
-// otherwise.
-const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
- const std::string &routine_name) {
- #ifdef VERBOSE
- printf("[DEBUG] Retrieving binary '%s' from cache\n", routine_name.c_str());
- #endif
- binary_cache_mutex_.lock();
- for (auto &cached_binary: binary_cache_) {
- if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
- binary_cache_mutex_.unlock();
- return cached_binary.binary;
+#if __cplusplus >= 201402L
+ // generalized std::map::find() of C++14
+ auto it = cache_.find(key);
+#else
+ // O(n) lookup in a vector
+ auto it = std::find_if(cache_.begin(), cache_.end(), [&] (const std::pair<Key, Value> &pair) {
+ return pair.first == key;
+ });
+#endif
+ if (it == cache_.end()) {
+ if (in_cache) {
+ *in_cache = false;
}
+ return Value();
}
- binary_cache_mutex_.unlock();
- throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none");
-}
-// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
-// otherwise.
-const Program& GetProgramFromCache(const Context &context, const Precision &precision,
- const std::string &routine_name) {
- #ifdef VERBOSE
- printf("[DEBUG] Retrieving program '%s' from cache\n", routine_name.c_str());
- #endif
- program_cache_mutex_.lock();
- for (auto &cached_program: program_cache_) {
- if (cached_program.MatchInCache(context(), precision, routine_name)) {
- program_cache_mutex_.unlock();
- return cached_program.program;
- }
+ if (in_cache) {
+ *in_cache = true;
}
- program_cache_mutex_.unlock();
- throw LogicError("GetProgramFromCache: Expected program in cache, but found none");
+ return it->second;
}
-// Queries the cache to see whether or not the compiled kernel is already there
-bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
- const std::string &routine_name) {
- binary_cache_mutex_.lock();
- for (auto &cached_binary: binary_cache_) {
- if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
- binary_cache_mutex_.unlock();
- return true;
- }
+template <typename Key, typename Value>
+void Cache<Key, Value>::Store(Key &&key, Value &&value) {
+ std::lock_guard<std::mutex> lock(cache_mutex_);
+
+#if __cplusplus >= 201402L
+ // emplace() into a map
+ auto r = cache_.emplace(std::move(key), std::move(value));
+ if (!r.second) {
+ throw LogicError("Cache::Store: object already in cache");
}
- binary_cache_mutex_.unlock();
- return false;
+#else
+ // emplace_back() into a vector
+ cache_.emplace_back(std::move(key), std::move(value));
+#endif
}
-// Queries the cache to see whether or not the compiled kernel is already there
-bool ProgramIsInCache(const Context &context, const Precision &precision,
- const std::string &routine_name) {
- program_cache_mutex_.lock();
- for (auto &cached_program: program_cache_) {
- if (cached_program.MatchInCache(context(), precision, routine_name)) {
- program_cache_mutex_.unlock();
- return true;
- }
- }
- program_cache_mutex_.unlock();
- return false;
+template <typename Key, typename Value>
+void Cache<Key, Value>::Invalidate() {
+ std::lock_guard<std::mutex> lock(cache_mutex_);
+
+ cache_.clear();
}
+template <typename Key, typename Value>
+Cache<Key, Value> &Cache<Key, Value>::Instance() {
+ return instance_;
+}
+
+template <typename Key, typename Value>
+Cache<Key, Value> Cache<Key, Value>::instance_;
+
// =================================================================================================
-// Clears the cache of stored binaries and programs
-void CacheClearAll() {
- binary_cache_mutex_.lock();
- binary_cache_.clear();
- binary_cache_mutex_.unlock();
- program_cache_mutex_.lock();
- program_cache_.clear();
- program_cache_mutex_.unlock();
-}
+template class Cache<BinaryKey, std::string>;
+template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;
+
+// =================================================================================================
+
+template class Cache<ProgramKey, Program>;
+template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
+
+// =================================================================================================
+
+template class Cache<DatabaseKey, Database>;
+template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const;
// =================================================================================================
} // namespace clblast
diff --git a/src/cache.hpp b/src/cache.hpp
index 9ecb0f1e..c3675f07 100644
--- a/src/cache.hpp
+++ b/src/cache.hpp
@@ -15,81 +15,89 @@
#define CLBLAST_CACHE_H_
#include <string>
-#include <vector>
#include <mutex>
+#include <map>
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
-// The cache of compiled OpenCL binaries, along with some meta-data
-struct BinaryCache {
- std::string binary;
- std::string device_name;
- Precision precision;
- std::string routine_name_;
-
- // Finds out whether the properties match
- bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
- const std::string &ref_routine) {
- return (device_name == ref_device &&
- precision == ref_precision &&
- routine_name_ == ref_routine);
- }
-};
-
-// The actual cache, implemented as a vector of the above data-type, and its mutex
-static std::vector<BinaryCache> binary_cache_;
-static std::mutex binary_cache_mutex_;
+// The generic thread-safe cache. We assume that the Key may be a heavyweight struct that is not
+// normally used by the caller, while the Value is either lightweight or ref-counted.
+// Hence, searching by non-Key is supported (if there is a corresponding operator<()), and
+// on Store() the Key instance is moved from the caller (because it will likely be constructed
+// as temporary at the time of Store()).
+template <typename Key, typename Value>
+class Cache {
+public:
+ // Cached object is returned by-value to avoid racing with Invalidate().
+ // Due to lack of std::optional<>, in case of a cache miss we return a default-constructed
+ // Value and set the flag to false.
+ template <typename U>
+ Value Get(const U &key, bool *in_cache) const;
+
+ // We do not return references to just stored object to avoid racing with Invalidate().
+ // Caller is expected to store a temporary.
+ void Store(Key &&key, Value &&value);
+ void Invalidate();
+
+ static Cache<Key, Value> &Instance();
+
+private:
+#if __cplusplus >= 201402L
+ // The std::less<void> allows to search in cache by an object comparable with Key, without
+ // constructing a temporary Key
+ // (see http://en.cppreference.com/w/cpp/utility/functional/less_void,
+ // http://www.open-std.org/JTC1/SC22/WG21/docs/papers/2013/n3657.htm,
+ // http://stackoverflow.com/questions/10536788/avoiding-key-construction-for-stdmapfind)
+ std::map<Key, Value, std::less<void>> cache_;
+#else
+ std::vector<std::pair<Key, Value>> cache_;
+#endif
+ mutable std::mutex cache_mutex_;
+
+ static Cache<Key, Value> instance_;
+}; // class Cache
// =================================================================================================
-// The cache of compiled OpenCL programs, along with some meta-data
-struct ProgramCache {
- Program program;
- cl_context context;
- Precision precision;
- std::string routine_name_;
-
- // Finds out whether the properties match
- bool MatchInCache(const cl_context ref_context, const Precision &ref_precision,
- const std::string &ref_routine) {
- return (context == ref_context &&
- precision == ref_precision &&
- routine_name_ == ref_routine);
- }
-};
-
-// The actual cache, implemented as a vector of the above data-type, and its mutex
-static std::vector<ProgramCache> program_cache_;
-static std::mutex program_cache_mutex_;
+// The key struct for the cache of compiled OpenCL binaries
+// Order of fields: precision, routine_name, device_name (smaller fields first)
+typedef std::tuple<Precision, std::string, std::string> BinaryKey;
+typedef std::tuple<const Precision &, const std::string &, const std::string &> BinaryKeyRef;
+
+typedef Cache<BinaryKey, std::string> BinaryCache;
+
+extern template class Cache<BinaryKey, std::string>;
+extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;
+
// =================================================================================================
-// Stores the compiled binary or program in the cache
-void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
- const Precision &precision, const std::string &routine_name);
-void StoreProgramToCache(const Program &program, const Context &context,
- const Precision &precision, const std::string &routine_name);
-
-// Queries the cache and retrieves a matching binary or program. Assumes that the match is
-// available, throws otherwise.
-const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
- const std::string &routine_name);
-const Program& GetProgramFromCache(const Context &context, const Precision &precision,
- const std::string &routine_name);
-
-// Queries the cache to see whether or not the compiled kernel is already there
-bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
- const std::string &routine_name);
-bool ProgramIsInCache(const Context &context, const Precision &precision,
- const std::string &routine_name);
+// The key struct for the cache of compiled OpenCL programs (context-dependent)
+// Order of fields: context, precision, routine_name (smaller fields first)
+typedef std::tuple<cl_context, Precision, std::string> ProgramKey;
+typedef std::tuple<const cl_context &, const Precision &, const std::string &> ProgramKeyRef;
+
+typedef Cache<ProgramKey, Program> ProgramCache;
+
+extern template class Cache<ProgramKey, Program>;
+extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
// =================================================================================================
-// Clears the cache of stored binaries
-void CacheClearAll();
+class Database;
+
+// The key struct for the cache of database maps.
+// Order of fields: precision, device_name, routines (smaller fields first)
+typedef std::tuple<Precision, std::string, std::vector<std::string>> DatabaseKey;
+typedef std::tuple<const Precision &, const std::string &, const std::vector<std::string> &> DatabaseKeyRef;
+
+typedef Cache<DatabaseKey, Database> DatabaseCache;
+
+extern template class Cache<DatabaseKey, Database>;
+extern template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const;
// =================================================================================================
} // namespace clblast
diff --git a/src/clblast.cpp b/src/clblast.cpp
index ef1cedf9..20ce1ba4 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -15,8 +15,8 @@
#include <string>
-#include "clblast.h"
#include "cache.hpp"
+#include "clblast.h"
// BLAS level-1 includes
#include "routines/level1/xswap.hpp"
@@ -2184,11 +2184,77 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
// Clears the cache of stored binaries
StatusCode ClearCache() {
try {
- CacheClearAll();
+ ProgramCache::Instance().Invalidate();
+ BinaryCache::Instance().Invalidate();
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
}
+template <typename Real, typename Complex>
+void FillCacheForPrecision(Queue &queue) {
+ try {
+
+ // Runs all the level 1 set-up functions
+ Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr);
+ Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr);
+ Xscal<Real>(queue, nullptr); Xscal<Complex>(queue, nullptr);
+ Xcopy<Real>(queue, nullptr); Xcopy<Complex>(queue, nullptr);
+ Xaxpy<Real>(queue, nullptr); Xaxpy<Complex>(queue, nullptr);
+ Xdot<Real>(queue, nullptr);
+ Xdotu<Complex>(queue, nullptr);
+ Xdotc<Complex>(queue, nullptr);
+ Xnrm2<Real>(queue, nullptr); Xnrm2<Complex>(queue, nullptr);
+ Xasum<Real>(queue, nullptr); Xasum<Complex>(queue, nullptr);
+ Xsum<Real>(queue, nullptr); Xsum<Complex>(queue, nullptr);
+ Xamax<Real>(queue, nullptr); Xamax<Complex>(queue, nullptr);
+ Xmax<Real>(queue, nullptr); Xmax<Complex>(queue, nullptr);
+ Xmin<Real>(queue, nullptr); Xmin<Complex>(queue, nullptr);
+
+ // Runs all the level 2 set-up functions
+ Xgemv<Real>(queue, nullptr); Xgemv<Complex>(queue, nullptr);
+ Xgbmv<Real>(queue, nullptr); Xgbmv<Complex>(queue, nullptr);
+ Xhemv<Complex>(queue, nullptr);
+ Xhbmv<Complex>(queue, nullptr);
+ Xhpmv<Complex>(queue, nullptr);
+ Xsymv<Real>(queue, nullptr);
+ Xsbmv<Real>(queue, nullptr);
+ Xspmv<Real>(queue, nullptr);
+ Xtrmv<Real>(queue, nullptr); Xtrmv<Complex>(queue, nullptr);
+ Xtbmv<Real>(queue, nullptr); Xtbmv<Complex>(queue, nullptr);
+ Xtpmv<Real>(queue, nullptr); Xtpmv<Complex>(queue, nullptr);
+ Xger<Real>(queue, nullptr);
+ Xgeru<Complex>(queue, nullptr);
+ Xgerc<Complex>(queue, nullptr);
+ Xher<Complex,Real>(queue, nullptr);
+ Xhpr<Complex,Real>(queue, nullptr);
+ Xher2<Complex>(queue, nullptr);
+ Xhpr2<Complex>(queue, nullptr);
+ Xsyr<Real>(queue, nullptr);
+ Xspr<Real>(queue, nullptr);
+ Xsyr2<Real>(queue, nullptr);
+ Xspr2<Real>(queue, nullptr);
+
+ // Runs all the level 3 set-up functions
+ Xgemm<Real>(queue, nullptr); Xgemm<Complex>(queue, nullptr);
+ Xsymm<Real>(queue, nullptr); Xsymm<Complex>(queue, nullptr);
+ Xhemm<Complex>(queue, nullptr);
+ Xsyrk<Real>(queue, nullptr); Xsyrk<Complex>(queue, nullptr);
+ Xherk<Complex,Real>(queue, nullptr);
+ Xsyr2k<Real>(queue, nullptr); Xsyr2k<Complex>(queue, nullptr);
+ Xher2k<Complex,Real>(queue, nullptr);
+ Xtrmm<Real>(queue, nullptr); Xtrmm<Complex>(queue, nullptr);
+
+ // Runs all the non-BLAS set-up functions
+ Xomatcopy<Real>(queue, nullptr); Xomatcopy<Complex>(queue, nullptr);
+
+ } catch(const RuntimeErrorCode &e) {
+ if (e.status() != StatusCode::kNoDoublePrecision &&
+ e.status() != StatusCode::kNoHalfPrecision) {
+ throw;
+ }
+ }
+}
+
// Fills the cache with all binaries for a specific device
// TODO: Add half-precision FP16 set-up calls
StatusCode FillCache(const cl_device_id device) {
@@ -2199,58 +2265,8 @@ StatusCode FillCache(const cl_device_id device) {
auto context = Context(device_cpp);
auto queue = Queue(context, device_cpp);
- // Runs all the level 1 set-up functions
- Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr);
- Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr);
- Xscal<float>(queue, nullptr); Xscal<double>(queue, nullptr); Xscal<float2>(queue, nullptr); Xscal<double2>(queue, nullptr);
- Xcopy<float>(queue, nullptr); Xcopy<double>(queue, nullptr); Xcopy<float2>(queue, nullptr); Xcopy<double2>(queue, nullptr);
- Xaxpy<float>(queue, nullptr); Xaxpy<double>(queue, nullptr); Xaxpy<float2>(queue, nullptr); Xaxpy<double2>(queue, nullptr);
- Xdot<float>(queue, nullptr); Xdot<double>(queue, nullptr);
- Xdotu<float2>(queue, nullptr); Xdotu<double2>(queue, nullptr);
- Xdotc<float2>(queue, nullptr); Xdotc<double2>(queue, nullptr);
- Xnrm2<float>(queue, nullptr); Xnrm2<double>(queue, nullptr); Xnrm2<float2>(queue, nullptr); Xnrm2<double2>(queue, nullptr);
- Xasum<float>(queue, nullptr); Xasum<double>(queue, nullptr); Xasum<float2>(queue, nullptr); Xasum<double2>(queue, nullptr);
- Xsum<float>(queue, nullptr); Xsum<double>(queue, nullptr); Xsum<float2>(queue, nullptr); Xsum<double2>(queue, nullptr);
- Xamax<float>(queue, nullptr); Xamax<double>(queue, nullptr); Xamax<float2>(queue, nullptr); Xamax<double2>(queue, nullptr);
- Xmax<float>(queue, nullptr); Xmax<double>(queue, nullptr); Xmax<float2>(queue, nullptr); Xmax<double2>(queue, nullptr);
- Xmin<float>(queue, nullptr); Xmin<double>(queue, nullptr); Xmin<float2>(queue, nullptr); Xmin<double2>(queue, nullptr);
-
- // Runs all the level 2 set-up functions
- Xgemv<float>(queue, nullptr); Xgemv<double>(queue, nullptr); Xgemv<float2>(queue, nullptr); Xgemv<double2>(queue, nullptr);
- Xgbmv<float>(queue, nullptr); Xgbmv<double>(queue, nullptr); Xgbmv<float2>(queue, nullptr); Xgbmv<double2>(queue, nullptr);
- Xhemv<float2>(queue, nullptr); Xhemv<double2>(queue, nullptr);
- Xhbmv<float2>(queue, nullptr); Xhbmv<double2>(queue, nullptr);
- Xhpmv<float2>(queue, nullptr); Xhpmv<double2>(queue, nullptr);
- Xsymv<float>(queue, nullptr); Xsymv<double>(queue, nullptr);
- Xsbmv<float>(queue, nullptr); Xsbmv<double>(queue, nullptr);
- Xspmv<float>(queue, nullptr); Xspmv<double>(queue, nullptr);
- Xtrmv<float>(queue, nullptr); Xtrmv<double>(queue, nullptr); Xtrmv<float2>(queue, nullptr); Xtrmv<double2>(queue, nullptr);
- Xtbmv<float>(queue, nullptr); Xtbmv<double>(queue, nullptr); Xtbmv<float2>(queue, nullptr); Xtbmv<double2>(queue, nullptr);
- Xtpmv<float>(queue, nullptr); Xtpmv<double>(queue, nullptr); Xtpmv<float2>(queue, nullptr); Xtpmv<double2>(queue, nullptr);
- Xger<float>(queue, nullptr); Xger<double>(queue, nullptr);
- Xgeru<float2>(queue, nullptr); Xgeru<double2>(queue, nullptr);
- Xgerc<float2>(queue, nullptr); Xgerc<double2>(queue, nullptr);
- Xher<float2,float>(queue, nullptr); Xher<double2,double>(queue, nullptr);
- Xhpr<float2,float>(queue, nullptr); Xhpr<double2,double>(queue, nullptr);
- Xher2<float2>(queue, nullptr); Xher2<double2>(queue, nullptr);
- Xhpr2<float2>(queue, nullptr); Xhpr2<double2>(queue, nullptr);
- Xsyr<float>(queue, nullptr); Xsyr<double>(queue, nullptr);
- Xspr<float>(queue, nullptr); Xspr<double>(queue, nullptr);
- Xsyr2<float>(queue, nullptr); Xsyr2<double>(queue, nullptr);
- Xspr2<float>(queue, nullptr); Xspr2<double>(queue, nullptr);
-
- // Runs all the level 3 set-up functions
- Xgemm<float>(queue, nullptr); Xgemm<double>(queue, nullptr); Xgemm<float2>(queue, nullptr); Xgemm<double2>(queue, nullptr);
- Xsymm<float>(queue, nullptr); Xsymm<double>(queue, nullptr); Xsymm<float2>(queue, nullptr); Xsymm<double2>(queue, nullptr);
- Xhemm<float2>(queue, nullptr); Xhemm<double2>(queue, nullptr);
- Xsyrk<float>(queue, nullptr); Xsyrk<double>(queue, nullptr); Xsyrk<float2>(queue, nullptr); Xsyrk<double2>(queue, nullptr);
- Xherk<float2,float>(queue, nullptr); Xherk<double2,double>(queue, nullptr);
- Xsyr2k<float>(queue, nullptr); Xsyr2k<double>(queue, nullptr); Xsyr2k<float2>(queue, nullptr); Xsyr2k<double2>(queue, nullptr);
- Xher2k<float2,float>(queue, nullptr); Xher2k<double2,double>(queue, nullptr);
- Xtrmm<float>(queue, nullptr); Xtrmm<double>(queue, nullptr); Xtrmm<float2>(queue, nullptr); Xtrmm<double2>(queue, nullptr);
-
- // Runs all the level 3 set-up functions
- Xomatcopy<float>(queue, nullptr); Xomatcopy<double>(queue, nullptr); Xomatcopy<float2>(queue, nullptr); Xomatcopy<double2>(queue, nullptr);
+ FillCacheForPrecision<float, float2>(queue);
+ FillCacheForPrecision<double, double2>(queue);
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index 59e4cd16..e4f2b3ed 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -13,9 +13,9 @@
#include <string>
+#include "utilities/utilities.hpp"
#include "clblast_c.h"
#include "clblast.h"
-#include "utilities/utilities.hpp"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 0383f53a..41af28da 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -333,7 +333,10 @@ class Context {
// Regular constructor with memory management
explicit Context(const Device &device):
- context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) {
+ context_(new cl_context, [](cl_context* c) {
+ if (*c) { CheckErrorDtor(clReleaseContext(*c)); }
+ delete c;
+ }) {
auto status = CL_SUCCESS;
const cl_device_id dev = device();
*context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
@@ -355,33 +358,37 @@ using ContextPointer = cl_context*;
// Enumeration of build statuses of the run-time compilation process
enum class BuildStatus { kSuccess, kError, kInvalid };
-// C++11 version of 'cl_program'. Additionally holds the program's source code.
+// C++11 version of 'cl_program'.
class Program {
public:
- // Note that there is no constructor based on the regular OpenCL data-type because of extra state
+ Program() = default;
// Source-based constructor with memory management
- explicit Program(const Context &context, std::string source):
- program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
- length_(source.length()),
- source_(std::move(source)),
- source_ptr_(&source_[0]) {
+ explicit Program(const Context &context, const std::string &source):
+ program_(new cl_program, [](cl_program* p) {
+ if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
+ delete p;
+ }) {
+ const char *source_ptr = &source[0];
+ size_t length = source.length();
auto status = CL_SUCCESS;
- *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
+ *program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
CLError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
- explicit Program(const Device &device, const Context &context, const std::string& binary):
- program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
- length_(binary.length()),
- source_(binary),
- source_ptr_(&source_[0]) {
+ explicit Program(const Device &device, const Context &context, const std::string &binary):
+ program_(new cl_program, [](cl_program* p) {
+ if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
+ delete p;
+ }) {
+ const char *binary_ptr = &binary[0];
+ size_t length = binary.length();
auto status1 = CL_SUCCESS;
auto status2 = CL_SUCCESS;
const cl_device_id dev = device();
- *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
- reinterpret_cast<const unsigned char**>(&source_ptr_),
+ *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
+ reinterpret_cast<const unsigned char**>(&binary_ptr),
&status1, &status2);
CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
CLError::Check(status2, "clCreateProgramWithBinary");
@@ -421,9 +428,6 @@ class Program {
const cl_program& operator()() const { return *program_; }
private:
std::shared_ptr<cl_program> program_;
- size_t length_;
- std::string source_; // Note: the source can also be a binary or IR
- const char* source_ptr_;
};
// =================================================================================================
@@ -440,8 +444,10 @@ class Queue {
// Regular constructor with memory management
explicit Queue(const Context &context, const Device &device):
- queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s));
- delete s; }) {
+ queue_(new cl_command_queue, [](cl_command_queue* s) {
+ if (*s) { CheckErrorDtor(clReleaseCommandQueue(*s)); }
+ delete s;
+ }) {
auto status = CL_SUCCESS;
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
CLError::Check(status, "clCreateCommandQueue");
@@ -665,7 +671,10 @@ class Kernel {
// Regular constructor with memory management
explicit Kernel(const Program &program, const std::string &name):
- kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) {
+ kernel_(new cl_kernel, [](cl_kernel* k) {
+ if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
+ delete k;
+ }) {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
CLError::Check(status, "clCreateKernel");
diff --git a/src/database/database.cpp b/src/database/database.cpp
index c000b0b7..aff6490d 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -67,12 +67,11 @@ const std::unordered_map<std::string, std::string> Database::kVendorNames{
// Constructor, computing device properties and populating the parameter-vector from the database.
// This takes an optional overlay database in case of custom tuning or custom kernels.
-Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
+Database::Database(const Device &device, const std::vector<std::string> &kernels,
const Precision precision, const std::vector<const DatabaseEntry*> &overlay):
- parameters_{} {
+ parameters_(std::make_shared<Parameters>()) {
// Finds information of the current device
- auto device = queue.GetDevice();
auto device_type = device.Type();
auto device_vendor = device.Vendor();
auto device_name = device.Name();
@@ -91,7 +90,7 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
for (auto &db: { database, overlay}) {
search_result = Search(kernel, device_type, device_vendor, device_name, precision, db);
if (search_result) {
- parameters_.insert(search_result->begin(), search_result->end());
+ parameters_->insert(search_result->begin(), search_result->end());
break;
}
}
@@ -105,7 +104,7 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
// Returns a list of OpenCL pre-processor defines in string form
std::string Database::GetDefines() const {
std::string defines{};
- for (auto &parameter: parameters_) {
+ for (auto &parameter: *parameters_) {
defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n";
}
return defines;
diff --git a/src/database/database.hpp b/src/database/database.hpp
index 7c05a20b..87c12293 100644
--- a/src/database/database.hpp
+++ b/src/database/database.hpp
@@ -72,12 +72,14 @@ class Database {
// The database consists of separate database entries, stored together in a vector
static const std::vector<const DatabaseEntry*> database;
+ Database() = default;
+
// The constructor with a user-provided database overlay (potentially an empty vector)
- explicit Database(const Queue &queue, const std::vector<std::string> &routines,
+ explicit Database(const Device &device, const std::vector<std::string> &routines,
const Precision precision, const std::vector<const DatabaseEntry*> &overlay);
// Accessor of values by key
- size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
+ size_t operator[](const std::string key) const { return parameters_->find(key)->second; }
// Obtain a list of OpenCL pre-processor defines based on the parameters
std::string GetDefines() const;
@@ -90,7 +92,7 @@ class Database {
const std::vector<const DatabaseEntry*> &db) const;
// Found parameters suitable for this device/kernel
- Parameters parameters_;
+ std::shared_ptr<Parameters> parameters_;
};
// =================================================================================================
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
index 1bc63691..f0431933 100644
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@@ -44,6 +44,7 @@ const Database::DatabaseEntry CopySingle = {
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
+ { "Turks", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
}
},
@@ -55,10 +56,12 @@ const Database::DatabaseEntry CopySingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
}
},
{ // Intel GPUs
@@ -83,6 +86,7 @@ const Database::DatabaseEntry CopySingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
@@ -117,13 +121,16 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Turks", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
@@ -150,6 +157,7 @@ const Database::DatabaseEntry CopyComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 1080", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
@@ -164,7 +172,7 @@ const Database::DatabaseEntry CopyComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
}
@@ -193,10 +201,12 @@ const Database::DatabaseEntry CopyDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@@ -209,6 +219,7 @@ const Database::DatabaseEntry CopyDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
@@ -225,7 +236,7 @@ const Database::DatabaseEntry CopyDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
}
},
}
@@ -254,10 +265,12 @@ const Database::DatabaseEntry CopyComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@@ -270,6 +283,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",4} } },
+ { "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
index 310d3a78..3378709c 100644
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@@ -44,7 +44,8 @@ const Database::DatabaseEntry PadSingle = {
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ { "Turks", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@@ -55,8 +56,10 @@ const Database::DatabaseEntry PadSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
@@ -83,6 +86,7 @@ const Database::DatabaseEntry PadSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 1080", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
@@ -94,7 +98,7 @@ const Database::DatabaseEntry PadSingle = {
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
}
},
{ // Default
@@ -117,7 +121,8 @@ const Database::DatabaseEntry PadComplexSingle = {
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Turks", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
+ { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // ARM GPUs
@@ -128,10 +133,12 @@ const Database::DatabaseEntry PadComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
},
{ // Intel GPUs
@@ -156,6 +163,7 @@ const Database::DatabaseEntry PadComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 1080", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@@ -172,7 +180,7 @@ const Database::DatabaseEntry PadComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
}
@@ -201,8 +209,10 @@ const Database::DatabaseEntry PadDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
@@ -217,6 +227,7 @@ const Database::DatabaseEntry PadDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 1080", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@@ -262,10 +273,12 @@ const Database::DatabaseEntry PadComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
@@ -278,6 +291,7 @@ const Database::DatabaseEntry PadComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
+ { "GeForce GTX 1080", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
index 8ef09e85..212723c7 100644
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@@ -44,6 +44,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Turks", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@@ -55,8 +56,10 @@ const Database::DatabaseEntry PadtransposeSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
}
@@ -83,6 +86,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 1080", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -117,6 +121,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Turks", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@@ -128,10 +133,12 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
}
},
{ // Intel GPUs
@@ -156,6 +163,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 1080", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@@ -201,8 +209,10 @@ const Database::DatabaseEntry PadtransposeDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
@@ -217,6 +227,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 1080", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@@ -262,10 +273,12 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
{ // Intel accelerators
@@ -278,6 +291,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 1080", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
index 23fecb49..f33f2a04 100644
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@@ -44,7 +44,8 @@ const Database::DatabaseEntry TransposeSingle = {
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "Turks", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
{ // ARM GPUs
@@ -55,8 +56,10 @@ const Database::DatabaseEntry TransposeSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
@@ -83,6 +86,7 @@ const Database::DatabaseEntry TransposeSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+ { "GeForce GTX 1080", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@@ -117,6 +121,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tonga", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "Turks", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
@@ -128,8 +133,10 @@ const Database::DatabaseEntry TransposeComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
@@ -150,6 +157,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 1070", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "GeForce GTX 1080", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
@@ -166,7 +174,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
}
@@ -195,10 +203,12 @@ const Database::DatabaseEntry TransposeDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // Intel accelerators
@@ -211,6 +221,7 @@ const Database::DatabaseEntry TransposeDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "GeForce GTX 1080", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@@ -256,16 +267,19 @@ const Database::DatabaseEntry TransposeComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX 1080", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
index 52845e96..e4e3c621 100644
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@@ -44,6 +44,7 @@ const Database::DatabaseEntry XaxpySingle = {
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
+ { "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
}
},
@@ -55,10 +56,12 @@ const Database::DatabaseEntry XaxpySingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
+ { "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
}
},
{ // Intel GPUs
@@ -83,6 +86,7 @@ const Database::DatabaseEntry XaxpySingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
+ { "GeForce GTX 1080", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@@ -94,7 +98,7 @@ const Database::DatabaseEntry XaxpySingle = {
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
- { "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
+ { "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Default
@@ -117,6 +121,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
+ { "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@@ -128,8 +133,10 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",1024}, {"WPT",1} } },
}
@@ -156,6 +163,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
+ { "GeForce GTX 1080", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
@@ -201,10 +209,12 @@ const Database::DatabaseEntry XaxpyDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",8}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
- { "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
+ { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel accelerators
@@ -217,6 +227,7 @@ const Database::DatabaseEntry XaxpyDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
+ { "GeForce GTX 1080", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@@ -262,8 +273,10 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",8}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
}
@@ -278,6 +291,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
+ { "GeForce GTX 1080", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@@ -294,7 +308,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
}
},
}
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
index 8b07c539..30d98e5d 100644
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@@ -43,13 +43,16 @@ const Database::DatabaseEntry XdotSingle = {
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",32} } },
- { "default", { {"WGS1",128}, {"WGS2",32} } },
+ { "Turks", { {"WGS1",128}, {"WGS2",64} } },
+ { "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
- { "default", { {"WGS1",1024}, {"WGS2",32} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WGS2",32} } },
+ { "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel GPUs
@@ -67,6 +70,7 @@ const Database::DatabaseEntry XdotSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",1024} } },
+ { "GeForce GTX 1080", { {"WGS1",512}, {"WGS2",64} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
@@ -76,12 +80,12 @@ const Database::DatabaseEntry XdotSingle = {
{ "GeForce GTX TITAN Black", { {"WGS1",512}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
- { "default", { {"WGS1",256}, {"WGS2",256} } },
+ { "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"WGS1",256}, {"WGS2",32} } },
+ { "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
}
@@ -98,13 +102,16 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",64} } },
- { "default", { {"WGS1",256}, {"WGS2",64} } },
+ { "Turks", { {"WGS1",128}, {"WGS2",32} } },
+ { "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",64} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
- { "default", { {"WGS1",1024}, {"WGS2",32} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",32} } },
+ { "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
{ // Intel GPUs
@@ -122,6 +129,7 @@ const Database::DatabaseEntry XdotComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",32} } },
+ { "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
@@ -136,7 +144,7 @@ const Database::DatabaseEntry XdotComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"WGS1",256}, {"WGS2",64} } },
+ { "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
}
@@ -158,14 +166,17 @@ const Database::DatabaseEntry XdotDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WGS2",128} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } },
- { "default", { {"WGS1",512}, {"WGS2",64} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",32} } },
+ { "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",512} } },
+ { "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
@@ -175,7 +186,7 @@ const Database::DatabaseEntry XdotDouble = {
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
- { "default", { {"WGS1",128}, {"WGS2",64} } },
+ { "default", { {"WGS1",128}, {"WGS2",128} } },
}
},
{ // Default
@@ -202,14 +213,17 @@ const Database::DatabaseEntry XdotComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",128} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
- { "default", { {"WGS1",1024}, {"WGS2",32} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",32}, {"WGS2",32} } },
+ { "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",64} } },
+ { "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
@@ -224,7 +238,7 @@ const Database::DatabaseEntry XdotComplexDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"WGS1",256}, {"WGS2",64} } },
+ { "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
}
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
index 66ac8a9f..d9414f8b 100644
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@@ -43,7 +43,8 @@ const Database::DatabaseEntry XgemmSingle = {
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
+ { "Turks", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // ARM GPUs
@@ -54,10 +55,12 @@ const Database::DatabaseEntry XgemmSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // Intel GPUs
@@ -82,6 +85,7 @@ const Database::DatabaseEntry XgemmSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
+ { "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
@@ -93,7 +97,7 @@ const Database::DatabaseEntry XgemmSingle = {
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
}
},
{ // Default
@@ -116,7 +120,8 @@ const Database::DatabaseEntry XgemmComplexSingle = {
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
+ { "Turks", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // ARM GPUs
@@ -127,10 +132,12 @@ const Database::DatabaseEntry XgemmComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
{ // Intel GPUs
@@ -155,6 +162,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+ { "GeForce GTX 1080", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
@@ -171,7 +179,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
@@ -200,10 +208,12 @@ const Database::DatabaseEntry XgemmDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
- { "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
{ // Intel accelerators
@@ -216,6 +226,7 @@ const Database::DatabaseEntry XgemmDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
+ { "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
@@ -227,12 +238,12 @@ const Database::DatabaseEntry XgemmDouble = {
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
}
@@ -261,10 +272,12 @@ const Database::DatabaseEntry XgemmComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel accelerators
@@ -277,6 +290,7 @@ const Database::DatabaseEntry XgemmComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 1070", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
+ { "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -287,12 +301,12 @@ const Database::DatabaseEntry XgemmComplexDouble = {
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
}
},
}
diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp
index 4413cf1b..c0cd2c04 100644
--- a/src/database/kernels/xgemm_direct.hpp
+++ b/src/database/kernels/xgemm_direct.hpp
@@ -39,9 +39,17 @@ const Database::DatabaseEntry XgemmDirectSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "Tonga", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+ { "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",8}, {"WGD",64} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",64} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
@@ -51,6 +59,7 @@ const Database::DatabaseEntry XgemmDirectSingle = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 1080", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
@@ -72,9 +81,17 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ { "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
}
},
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
@@ -84,6 +101,7 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 1080", { {"KWID",8}, {"MDIMAD",8}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
@@ -91,7 +109,7 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } },
}
},
}
@@ -108,16 +126,24 @@ const Database::DatabaseEntry XgemmDirectDouble = {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
- { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
}
},
}
@@ -134,8 +160,16 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } },
+ }
+ },
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
@@ -143,7 +177,7 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index 5f25f210..52b17d94 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -44,13 +44,16 @@ const Database::DatabaseEntry XgemvSingle = {
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",128}, {"WPT1",2} } },
+ { "Turks", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
@@ -76,6 +79,7 @@ const Database::DatabaseEntry XgemvSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WPT1",1} } },
+ { "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } },
@@ -110,14 +114,17 @@ const Database::DatabaseEntry XgemvComplexSingle = {
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",32}, {"WPT1",1} } },
+ { "Turks", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
- { "default", { {"WGS1",64}, {"WPT1",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } },
+ { "default", { {"WGS1",64}, {"WPT1",2} } },
}
},
{ // Intel GPUs
@@ -142,6 +149,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } },
+ { "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } },
@@ -177,8 +185,10 @@ const Database::DatabaseEntry XgemvDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
@@ -192,6 +202,7 @@ const Database::DatabaseEntry XgemvDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } },
+ { "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } },
@@ -208,7 +219,7 @@ const Database::DatabaseEntry XgemvDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"WGS1",128}, {"WPT1",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
}
@@ -231,8 +242,10 @@ const Database::DatabaseEntry XgemvComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",32}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp
index 994a220c..2dd400bc 100644
--- a/src/database/kernels/xgemv_fast.hpp
+++ b/src/database/kernels/xgemv_fast.hpp
@@ -44,14 +44,17 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
+ { "Turks", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
- { "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",2}, {"WGS2",16}, {"WPT2",4} } },
+ { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
}
},
{ // Intel GPUs
@@ -76,6 +79,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+ { "GeForce GTX 1080", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
@@ -110,13 +114,16 @@ const Database::DatabaseEntry XgemvFastComplexSingle = {
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
+ { "Turks", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",4}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
}
},
@@ -173,8 +180,10 @@ const Database::DatabaseEntry XgemvFastDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
}
},
@@ -188,6 +197,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+ { "GeForce GTX 1080", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
@@ -227,9 +237,11 @@ const Database::DatabaseEntry XgemvFastComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",4}, {"WGS2",32}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
- { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",1}, {"WGS2",16}, {"WPT2",2} } },
+ { "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
}
},
{ // Intel accelerators
diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp
index da8bcfeb..36a435b5 100644
--- a/src/database/kernels/xgemv_fast_rot.hpp
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@@ -39,13 +39,16 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "Tonga", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
+ { "Turks", { {"VW3",8}, {"WGS3",128}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
- { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
+ { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel GPUs
@@ -60,6 +63,7 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 1080", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
@@ -68,7 +72,7 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
+ { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
}
@@ -82,13 +86,16 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
- { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
+ { "Turks", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } },
+ { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
- { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
+ { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel GPUs
@@ -122,12 +129,15 @@ const Database::DatabaseEntry XgemvFastRotDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
- { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
+ { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 1080", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
@@ -155,13 +165,15 @@ const Database::DatabaseEntry XgemvFastRotComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
- { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
+ { "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
+ { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
}
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
index 5e2be6a9..f99b7632 100644
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@@ -44,6 +44,7 @@ const Database::DatabaseEntry XgerSingle = {
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
+ { "Turks", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
}
},
@@ -55,7 +56,9 @@ const Database::DatabaseEntry XgerSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",128}, {"WGS2",8}, {"WPT",4} } },
}
@@ -75,6 +78,7 @@ const Database::DatabaseEntry XgerSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
+ { "GeForce GTX 1080", { {"WGS1",16}, {"WGS2",4}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
@@ -105,7 +109,8 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
- { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+ { "Turks", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
+ { "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
}
},
{ // ARM GPUs
@@ -116,9 +121,11 @@ const Database::DatabaseEntry XgerComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
- { "default", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
+ { "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",4} } },
}
},
{ // Intel GPUs
@@ -136,6 +143,7 @@ const Database::DatabaseEntry XgerComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",16}, {"WGS2",64}, {"WPT",2} } },
+ { "GeForce GTX 1080", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
@@ -143,7 +151,7 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
- { "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
+ { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Default
@@ -177,15 +185,18 @@ const Database::DatabaseEntry XgerDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
- { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
+ { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
+ { "GeForce GTX 1080", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
@@ -227,7 +238,9 @@ const Database::DatabaseEntry XgerComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
+ { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",512}, {"WGS2",2}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
}
@@ -236,6 +249,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",8}, {"WGS2",128}, {"WPT",1} } },
+ { "GeForce GTX 1080", { {"WGS1",8}, {"WGS2",4}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index c052e94f..0ce4f367 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -31,9 +31,7 @@ R"(
// Enable support for double-precision
#if PRECISION == 64 || PRECISION == 6464
- #if __OPENCL_VERSION__ <= CL_VERSION_1_1
- #pragma OPENCL EXTENSION cl_khr_fp64: enable
- #endif
+ #pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
// Half-precision
diff --git a/src/routine.cpp b/src/routine.cpp
index acafb0d2..4fe04a60 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -32,11 +32,34 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
event_(event),
context_(queue_.GetContext()),
device_(queue_.GetDevice()),
- device_name_(device_.Name()),
- db_(queue_, routines, precision_, userDatabase) {
+ device_name_(device_.Name()) {
+
+ InitDatabase(routines, userDatabase);
+ InitProgram(source);
+}
+
+void Routine::InitDatabase(const std::vector<std::string> &routines,
+ const std::vector<const Database::DatabaseEntry*> &userDatabase) {
+
+ // Queries the cache to see whether or not the kernel parameter database is already there
+ bool has_db;
+ db_ = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision_, device_name_, routines },
+ &has_db);
+ if (has_db) { return; }
+
+ // Builds the parameter database for this device and routine set and stores it in the cache
+ db_ = Database(device_, routines, precision_, userDatabase);
+ DatabaseCache::Instance().Store(DatabaseKey{ precision_, device_name_, routines },
+ Database{ db_ });
+}
+
+void Routine::InitProgram(std::initializer_list<const char *> source) {
// Queries the cache to see whether or not the program (context-specific) is already there
- if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }
+ bool has_program;
+ program_ = ProgramCache::Instance().Get(ProgramKeyRef{ context_(), precision_, routine_name_ },
+ &has_program);
+ if (has_program) { return; }
// Sets the build options from an environmental variable (if set)
auto options = std::vector<std::string>();
@@ -47,29 +70,29 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
- if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
- auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
- auto program = Program(device_, context_, binary);
- program.Build(device_, options);
- StoreProgramToCache(program, context_, precision_, routine_name_);
+ bool has_binary;
+ auto binary = BinaryCache::Instance().Get(BinaryKeyRef{ precision_, routine_name_, device_name_ },
+ &has_binary);
+ if (has_binary) {
+ program_ = Program(device_, context_, binary);
+ program_.Build(device_, options);
+ ProgramCache::Instance().Store(ProgramKey{ context_(), precision_, routine_name_ },
+ Program{ program_ });
+ return;
}
// Otherwise, the kernel will be compiled and program will be built. Both the binary and the
// program will be added to the cache.
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
- const auto extensions = device_.Capabilities();
- if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
- if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
- throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
- }
+ if ((precision_ == Precision::kDouble && !PrecisionSupported<double>(device_)) ||
+ (precision_ == Precision::kComplexDouble && !PrecisionSupported<double2>(device_))) {
+ throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
}
// As above, but for cl_khr_fp16 (half precision)
- if (precision_ == Precision::kHalf) {
- if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
- throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
- }
+ if (precision_ == Precision::kHalf && !PrecisionSupported<half>(device_)) {
+ throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
// Collects the parameters for this device in the form of defines, and adds the precision
@@ -114,21 +137,23 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
#endif
// Compiles the kernel
- auto program = Program(context_, source_string);
+ program_ = Program(context_, source_string);
try {
- program.Build(device_, options);
+ program_.Build(device_, options);
} catch (const CLError &e) {
if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
fprintf(stdout, "OpenCL compiler error/warning: %s\n",
- program.GetBuildInfo(device_).c_str());
+ program_.GetBuildInfo(device_).c_str());
}
throw;
}
// Store the compiled binary and program in the cache
- const auto binary = program.GetIR();
- StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
- StoreProgramToCache(program, context_, precision_, routine_name_);
+ BinaryCache::Instance().Store(BinaryKey{ precision_, routine_name_, device_name_ },
+ program_.GetIR());
+
+ ProgramCache::Instance().Store(ProgramKey{ context_(), precision_, routine_name_ },
+ Program{ program_ });
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
diff --git a/src/routine.hpp b/src/routine.hpp
index 2d8b2415..f366e4d9 100644
--- a/src/routine.hpp
+++ b/src/routine.hpp
@@ -35,11 +35,22 @@ class Routine {
// Base class constructor. The user database is an optional extra database to override the
// built-in database.
// All heavy preparation work is done inside this constructor.
+ // NOTE: the caller must provide the same userDatabase for each combination of device, precision
+ // and routine list, otherwise the caching logic will break.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<const Database::DatabaseEntry*> &userDatabase,
std::initializer_list<const char *> source);
+ private:
+
+ // Initializes program_, fetching cached program or building one
+ void InitProgram(std::initializer_list<const char *> source);
+
+ // Initializes db_, fetching cached database or building one
+ void InitDatabase(const std::vector<std::string> &routines,
+ const std::vector<const Database::DatabaseEntry*> &userDatabase);
+
protected:
// Non-static variable for the precision
@@ -57,8 +68,11 @@ class Routine {
// OpenCL device properties
const std::string device_name_;
+ // Compiled program (either retrieved from cache or compiled in slow path)
+ Program program_;
+
// Connection to the database for all the device-specific parameters
- const Database db_;
+ Database db_;
};
// =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 8046c0be..bdea0086 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -19,8 +19,8 @@
#include <string>
#include <vector>
-#include "clblast.h"
#include "clpp11.hpp"
+#include "clblast.h"
#include "database/database.hpp"
namespace clblast {
diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp
index e9efa1a7..40a66517 100644
--- a/src/routines/level1/xamax.cpp
+++ b/src/routines/level1/xamax.cpp
@@ -43,9 +43,8 @@ void Xamax<T>::DoAmax(const size_t n,
TestVectorIndex(1, imax_buffer, imax_offset);
// Retrieves the Xamax kernels from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xamax");
- auto kernel2 = Kernel(program, "XamaxEpilogue");
+ auto kernel1 = Kernel(program_, "Xamax");
+ auto kernel2 = Kernel(program_, "XamaxEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp
index a242a5fa..b93b271c 100644
--- a/src/routines/level1/xasum.cpp
+++ b/src/routines/level1/xasum.cpp
@@ -43,9 +43,8 @@ void Xasum<T>::DoAsum(const size_t n,
TestVectorScalar(1, asum_buffer, asum_offset);
// Retrieves the Xasum kernels from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xasum");
- auto kernel2 = Kernel(program, "XasumEpilogue");
+ auto kernel1 = Kernel(program_, "Xasum");
+ auto kernel2 = Kernel(program_, "XasumEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp
index 5436c5b7..39f61ef4 100644
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@@ -52,8 +52,7 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
// Retrieves the Xaxpy kernel from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp
index d86200c0..62889764 100644
--- a/src/routines/level1/xcopy.cpp
+++ b/src/routines/level1/xcopy.cpp
@@ -52,8 +52,7 @@ void Xcopy<T>::DoCopy(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";
// Retrieves the Xcopy kernel from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp
index 9d718913..9f9c0590 100644
--- a/src/routines/level1/xdot.cpp
+++ b/src/routines/level1/xdot.cpp
@@ -46,9 +46,8 @@ void Xdot<T>::DoDot(const size_t n,
TestVectorScalar(1, dot_buffer, dot_offset);
// Retrieves the Xdot kernels from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xdot");
- auto kernel2 = Kernel(program, "XdotEpilogue");
+ auto kernel1 = Kernel(program_, "Xdot");
+ auto kernel2 = Kernel(program_, "XdotEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp
index 373820a4..aa341aff 100644
--- a/src/routines/level1/xnrm2.cpp
+++ b/src/routines/level1/xnrm2.cpp
@@ -43,9 +43,8 @@ void Xnrm2<T>::DoNrm2(const size_t n,
TestVectorScalar(1, nrm2_buffer, nrm2_offset);
// Retrieves the Xnrm2 kernels from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xnrm2");
- auto kernel2 = Kernel(program, "Xnrm2Epilogue");
+ auto kernel1 = Kernel(program_, "Xnrm2");
+ auto kernel2 = Kernel(program_, "Xnrm2Epilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp
index 0521b1e5..9bc096e5 100644
--- a/src/routines/level1/xscal.cpp
+++ b/src/routines/level1/xscal.cpp
@@ -49,8 +49,7 @@ void Xscal<T>::DoScal(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";
// Retrieves the Xscal kernel from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp
index c9b97dc9..f046575f 100644
--- a/src/routines/level1/xswap.cpp
+++ b/src/routines/level1/xswap.cpp
@@ -52,8 +52,7 @@ void Xswap<T>::DoSwap(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";
// Retrieves the Xswap kernel from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp
index 52e66de6..7d2e5f60 100644
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@@ -123,8 +123,7 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
}
// Retrieves the Xgemv kernel from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_real));
diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp
index d16ebd11..9ec156a1 100644
--- a/src/routines/level2/xger.cpp
+++ b/src/routines/level2/xger.cpp
@@ -53,8 +53,7 @@ void Xger<T>::DoGer(const Layout layout,
TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, "Xger");
+ auto kernel = Kernel(program_, "Xger");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(a_one));
diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp
index 6c334e63..ba12a3ef 100644
--- a/src/routines/level2/xher.cpp
+++ b/src/routines/level2/xher.cpp
@@ -67,8 +67,7 @@ void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const auto matching_alpha = GetAlpha(alpha);
// Retrieves the kernel from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, "Xher");
+ auto kernel = Kernel(program_, "Xher");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp
index 11e2c871..a420e693 100644
--- a/src/routines/level2/xher2.cpp
+++ b/src/routines/level2/xher2.cpp
@@ -54,8 +54,7 @@ void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, "Xher2");
+ auto kernel = Kernel(program_, "Xher2");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index b0e4c5ae..d5d009ff 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -37,9 +37,6 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); };
- // Retrieves the program from the cache
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV");
-
// Translates CLBlast arguments to 0/1 integers for the OpenCL kernel
const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1;
const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) ||
@@ -52,7 +49,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
// Retrieves the kernel from the compiled binary
const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward";
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
@@ -94,9 +91,6 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
TestMatrixA(n, n, a_buffer, a_offset, a_ld);
TestVectorX(n, b_buffer, b_offset, b_inc);
- // Retrieves the program from the cache
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV");
-
// Creates a copy of B to avoid overwriting input while computing output
// TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels
const auto x_offset = b_offset;
@@ -108,7 +102,7 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
// Fills the output buffer with zeros
auto eventWaitList = std::vector<Event>();
auto fill_vector_event = Event();
- FillVector(queue_, device_, program, db_, fill_vector_event.pointer(), eventWaitList,
+ FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList,
n, x_inc, x_offset, x_buffer, ConstantZero<T>());
fill_vector_event.WaitForCompletion();
diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp
index dc3f32f0..67e626a1 100644
--- a/src/routines/level2/xtrsv.hpp
+++ b/src/routines/level2/xtrsv.hpp
@@ -27,11 +27,11 @@ class Xtrsv: public Xgemv<T> {
public:
// Uses the generic matrix-vector routine
- using Xgemv<T>::routine_name_;
using Xgemv<T>::queue_;
using Xgemv<T>::context_;
using Xgemv<T>::device_;
using Xgemv<T>::db_;
+ using Xgemv<T>::program_;
using Xgemv<T>::DoGemv;
// Constructor
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 0015b629..7bd388c1 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -150,9 +150,6 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled;
const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled;
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
a_do_transpose == false && a_conjugate == false;
@@ -178,7 +175,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
a_one_i, a_two_i, a_one_i, 0, a_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, a_do_transpose, a_conjugate);
eventWaitList.push_back(eventProcessA);
}
@@ -189,7 +186,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
b_one, b_two, b_ld, b_offset, b_buffer,
b_one_i, b_two_i, b_one_i, 0, b_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, b_do_transpose, b_conjugate);
eventWaitList.push_back(eventProcessB);
}
@@ -200,13 +197,13 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
c_one, c_two, c_ld, c_offset, c_buffer,
c_one_i, c_two_i, c_one_i, 0, c_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, c_do_transpose, false);
eventWaitList.push_back(eventProcessC);
}
// Retrieves the Xgemm kernel from the compiled binary
- auto kernel = Kernel(program, "Xgemm");
+ auto kernel = Kernel(program_, "Xgemm");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_ceiled));
@@ -236,7 +233,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
c_one_i, c_two_i, c_one_i, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
false, c_do_transpose, false);
}
}
@@ -255,13 +252,10 @@ void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate) {
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
// Retrieves the proper XgemmDirect kernel from the compiled binary
const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
(b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
- auto kernel = Kernel(program, name);
+ auto kernel = Kernel(program_, name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m));
diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp
index e5b1502a..8629f3de 100644
--- a/src/routines/level3/xhemm.cpp
+++ b/src/routines/level3/xhemm.cpp
@@ -58,8 +58,7 @@ void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle trian
// Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
// routine afterwards
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the arguments for the hermitian-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp
index 2385706e..7c011915 100644
--- a/src/routines/level3/xhemm.hpp
+++ b/src/routines/level3/xhemm.hpp
@@ -30,6 +30,7 @@ class Xhemm: public Xgemm<T> {
using Xgemm<T>::queue_;
using Xgemm<T>::context_;
using Xgemm<T>::device_;
+ using Xgemm<T>::program_;
using Xgemm<T>::db_;
using Xgemm<T>::DoGemm;
diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp
index ee3bb8b8..2aed2781 100644
--- a/src/routines/level3/xher2k.cpp
+++ b/src/routines/level3/xher2k.cpp
@@ -81,9 +81,6 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
// Determines whether or not temporary matrices are needed
auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
@@ -116,7 +113,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessA1);
}
@@ -125,7 +122,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessA2);
}
@@ -134,7 +131,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessB1);
}
@@ -143,7 +140,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessB2);
}
@@ -154,12 +151,12 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
@@ -201,7 +198,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
false, c_rotated, false, upper, lower, true);
}
diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp
index ae8e9324..d982859e 100644
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@@ -79,9 +79,6 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && a_conjugate == false;
@@ -109,7 +106,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, a_rotated, a_conjugate);
eventWaitList.push_back(eventProcessA);
}
@@ -118,7 +115,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, a_rotated, b_conjugate);
eventWaitList.push_back(eventProcessB);
}
@@ -129,12 +126,12 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
@@ -163,7 +160,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
false, c_rotated, false, upper, lower, true);
}
diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp
index d7f771d1..969edfc8 100644
--- a/src/routines/level3/xsymm.cpp
+++ b/src/routines/level3/xsymm.cpp
@@ -30,12 +30,12 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
@@ -58,8 +58,7 @@ void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle trian
// Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
// routine afterwards
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the arguments for the symmetric-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp
index ee965364..7a584560 100644
--- a/src/routines/level3/xsymm.hpp
+++ b/src/routines/level3/xsymm.hpp
@@ -32,6 +32,7 @@ class Xsymm: public Xgemm<T> {
using Xgemm<T>::queue_;
using Xgemm<T>::context_;
using Xgemm<T>::device_;
+ using Xgemm<T>::program_;
using Xgemm<T>::db_;
using Xgemm<T>::DoGemm;
diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp
index cb0e0461..fdef43dc 100644
--- a/src/routines/level3/xsyr2k.cpp
+++ b/src/routines/level3/xsyr2k.cpp
@@ -77,9 +77,6 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
// Determines whether or not temporary matrices are needed
auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false;
@@ -103,7 +100,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, ab_rotated, false);
eventWaitList.push_back(eventProcessA);
}
@@ -112,7 +109,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, ab_rotated, false);
eventWaitList.push_back(eventProcessB);
}
@@ -123,12 +120,12 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
@@ -168,7 +165,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
false, c_rotated, false, upper, lower, false);
}
diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp
index bd6c4b25..9588c28c 100644
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -74,9 +74,6 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false;
@@ -97,7 +94,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, a_rotated, false);
eventWaitList.push_back(eventProcessA);
}
@@ -108,12 +105,12 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
@@ -142,7 +139,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
+ ConstantOne<T>(), program_,
false, c_rotated, false, upper, lower, false);
}
diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp
index ed810e72..02c295ac 100644
--- a/src/routines/level3/xtrmm.cpp
+++ b/src/routines/level3/xtrmm.cpp
@@ -70,8 +70,7 @@ void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle trian
// Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
// routine afterwards
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program_, kernel_name);
// Sets the arguments for the triangular-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp
index 967bf132..e77b7214 100644
--- a/src/routines/level3/xtrmm.hpp
+++ b/src/routines/level3/xtrmm.hpp
@@ -31,6 +31,7 @@ class Xtrmm: public Xgemm<T> {
using Xgemm<T>::queue_;
using Xgemm<T>::context_;
using Xgemm<T>::device_;
+ using Xgemm<T>::program_;
using Xgemm<T>::db_;
using Xgemm<T>::DoGemm;
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index 8061b508..3a910261 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -79,9 +79,8 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
// Fills the output buffer with zeros
auto eventWaitList = std::vector<Event>();
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSM");
auto fill_matrix_event = Event();
- FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), eventWaitList,
+ FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList,
x_one, x_ld, x_offset, x_buffer, ConstantZero<T>());
fill_matrix_event.WaitForCompletion();
diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp
index 288e9d11..b9d5432a 100644
--- a/src/routines/level3/xtrsm.hpp
+++ b/src/routines/level3/xtrsm.hpp
@@ -26,11 +26,11 @@ class Xtrsm: public Xgemm<T> {
public:
// Uses methods and variables the Xgemm routine
- using Xgemm<T>::routine_name_;
using Xgemm<T>::queue_;
using Xgemm<T>::context_;
using Xgemm<T>::device_;
using Xgemm<T>::db_;
+ using Xgemm<T>::program_;
using Xgemm<T>::DoGemm;
// Constructor
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index ffee9b7c..696e694a 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -69,18 +69,15 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
const auto name_postfix = (is_upper) ? "Upper" : "Lower";
- // Retrieves the program from the cache
- auto event_wait_list = std::vector<Event>();
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "INVERT");
-
// Fills the output buffer with zeros
+ auto event_wait_list = std::vector<Event>();
auto fill_matrix_event = Event();
- FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), event_wait_list,
+ FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list,
num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
event_wait_list.push_back(fill_matrix_event);
// Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
- auto kernel = Kernel(program, "InvertDiagonalBlock");
+ auto kernel = Kernel(program_, "InvertDiagonalBlock");
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, src());
kernel.SetArgument(2, static_cast<int>(offset));
@@ -110,7 +107,7 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
// Part 1
- auto kernel1 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
+ auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, src());
kernel1.SetArgument(2, static_cast<int>(offset));
@@ -125,7 +122,7 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
// Part 2
const bool is_last_kernel = (current_size * 2 >= block_size);
- auto kernel2 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix);
+ auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix);
kernel2.SetArgument(0, static_cast<int>(n));
kernel2.SetArgument(1, dest());
kernel2.SetArgument(2, static_cast<int>(current_size));
diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp
index 875ca7d2..4ae8c056 100644
--- a/src/routines/levelx/xomatcopy.cpp
+++ b/src/routines/levelx/xomatcopy.cpp
@@ -65,14 +65,11 @@ void Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose,
TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
auto emptyEventList = std::vector<Event>();
PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
b_one, b_two, b_ld, b_offset, b_buffer,
- alpha, program, false, transpose, conjugate);
+ alpha, program_, false, transpose, conjugate);
}
// =================================================================================================
diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp
index f3c7b9a3..0d0033b6 100644
--- a/src/utilities/clblast_exceptions.hpp
+++ b/src/utilities/clblast_exceptions.hpp
@@ -16,8 +16,8 @@
#ifndef CLBLAST_EXCEPTIONS_H_
#define CLBLAST_EXCEPTIONS_H_
-#include "clblast.h"
#include "clpp11.hpp"
+#include "clblast.h"
namespace clblast {
// =================================================================================================
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 3e408bb7..757f1b5e 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -21,9 +21,9 @@
#include <functional>
#include <complex>
+#include "clpp11.hpp"
#include "clblast.h"
#include "clblast_half.h"
-#include "clpp11.hpp"
#include "utilities/clblast_exceptions.hpp"
#include "utilities/msvc.hpp"
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index dc0f842e..c0ed5ba4 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -248,8 +248,29 @@ template <typename T, typename U>
void Tester<T,U>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
const Arguments<U> &args) {
+ // Either an OpenCL or CLBlast internal error occurred, fail the test immediately
+ // NOTE: the OpenCL error codes grow downwards without any declared lower bound, hence the magic
+ // number. The lowest error code is currently around -70; -500 is chosen to be on the safe side.
+ if (clblast_status != StatusCode::kSuccess &&
+ (clblast_status > static_cast<StatusCode>(-500) /* matches OpenCL errors (see above) */ ||
+ clblast_status < StatusCode::kNotImplemented) /* matches CLBlast internal errors */) {
+ PrintTestResult(kErrorStatus);
+ ReportError({StatusCode::kSuccess, clblast_status, kStatusError, args});
+ if (verbose_) {
+ fprintf(stdout, "\n");
+ PrintErrorLog({{StatusCode::kSuccess, clblast_status, kStatusError, args}});
+ fprintf(stdout, " ");
+ }
+ }
+
+ // Routine is not implemented
+ else if (clblast_status == StatusCode::kNotImplemented) {
+ PrintTestResult(kSkippedCompilation);
+ ReportSkipped();
+ }
+
// Cannot compare error codes against a library other than clBLAS
- if (compare_cblas_) {
+ else if (compare_cblas_) {
PrintTestResult(kUnsupportedReference);
ReportSkipped();
}
@@ -267,13 +288,6 @@ void Tester<T,U>::TestErrorCodes(const StatusCode clblas_status, const StatusCod
ReportSkipped();
}
- // Could not compile the CLBlast kernel properly
- else if (clblast_status == StatusCode::kOpenCLBuildProgramFailure ||
- clblast_status == StatusCode::kNotImplemented) {
- PrintTestResult(kSkippedCompilation);
- ReportSkipped();
- }
-
// Error occurred
else {
PrintTestResult(kErrorStatus);
@@ -388,7 +402,9 @@ void Tester<T,U>::PrintErrorLog(const std::vector<ErrorLogEntry> &error_log) {
fprintf(stdout, " Error rate %.1lf%%: ", entry.error_percentage);
}
else {
- fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect);
+ fprintf(stdout, " Status code %d (expected %d): ",
+ static_cast<int>(entry.status_found),
+ static_cast<int>(entry.status_expect));
}
fprintf(stdout, "%s\n", GetOptionsString(entry.args).c_str());
}
diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp
index d8462cef..113f03ef 100644
--- a/test/correctness/tester.hpp
+++ b/test/correctness/tester.hpp
@@ -22,14 +22,14 @@
#include <vector>
#include <memory>
+#include "utilities/utilities.hpp"
+
// The libraries
#ifdef CLBLAST_REF_CLBLAS
#include <clBLAS.h>
#endif
#include "clblast.h"
-#include "utilities/utilities.hpp"
-
namespace clblast {
// =================================================================================================
diff --git a/test/performance/client.hpp b/test/performance/client.hpp
index 4554c67f..4b3e17c7 100644
--- a/test/performance/client.hpp
+++ b/test/performance/client.hpp
@@ -25,14 +25,14 @@
#include <vector>
#include <utility>
+#include "utilities/utilities.hpp"
+
// The libraries to test
#ifdef CLBLAST_REF_CLBLAS
#include <clBLAS.h>
#endif
#include "clblast.h"
-#include "utilities/utilities.hpp"
-
namespace clblast {
// =================================================================================================