summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/clpp11.hpp48
-rw-r--r--src/database/database.cpp53
-rw-r--r--src/database/database.hpp62
-rw-r--r--src/database/kernel_selection.hpp131
-rw-r--r--src/database/kernels/copy.hpp24
-rw-r--r--src/database/kernels/pad.hpp24
-rw-r--r--src/database/kernels/padtranspose.hpp16
-rw-r--r--src/database/kernels/transpose.hpp18
-rw-r--r--src/database/kernels/xaxpy.hpp20
-rw-r--r--src/database/kernels/xdot.hpp18
-rw-r--r--src/database/kernels/xgemm.hpp24
-rw-r--r--src/database/kernels/xgemm_direct.hpp138
-rw-r--r--src/database/kernels/xgemv.hpp20
-rw-r--r--src/database/kernels/xgemv_fast.hpp21
-rw-r--r--src/database/kernels/xgemv_fast_rot.hpp20
-rw-r--r--src/database/kernels/xger.hpp22
-rw-r--r--src/kernels/common.opencl2
-rw-r--r--src/kernels/level3/xgemm_direct_part1.opencl273
-rw-r--r--src/kernels/level3/xgemm_direct_part2.opencl314
-rw-r--r--src/kernels/level3/xgemm_direct_part3.opencl214
-rw-r--r--src/msvc.hpp39
-rw-r--r--src/routine.cpp12
-rw-r--r--src/routine.hpp2
-rw-r--r--src/routines/level3/xgemm.cpp108
-rw-r--r--src/routines/level3/xgemm.hpp23
-rw-r--r--src/tuning/kernels/copy_fast.cpp1
-rw-r--r--src/tuning/kernels/copy_pad.cpp1
-rw-r--r--src/tuning/kernels/transpose_fast.cpp1
-rw-r--r--src/tuning/kernels/transpose_pad.cpp1
-rw-r--r--src/tuning/kernels/xaxpy.cpp1
-rw-r--r--src/tuning/kernels/xdot.cpp1
-rw-r--r--src/tuning/kernels/xgemm.cpp7
-rw-r--r--src/tuning/kernels/xgemm_direct.cpp196
-rw-r--r--src/tuning/kernels/xgemv.cpp1
-rw-r--r--src/tuning/kernels/xger.cpp1
-rw-r--r--src/tuning/tuning.hpp18
-rw-r--r--src/utilities.cpp20
-rw-r--r--src/utilities.hpp6
38 files changed, 1713 insertions, 188 deletions
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index d57223dd..aaa76cb4 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -12,8 +12,8 @@
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
-// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore
-// contains the following header copyright notice:
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
//
// =================================================================================================
//
@@ -97,14 +97,12 @@ class Event {
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
float GetElapsedTime() const {
WaitForCompletion();
- auto bytes = size_t{0};
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
- auto time_start = size_t{0};
+ const auto bytes = sizeof(cl_ulong);
+ auto time_start = cl_ulong{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
- auto time_end = size_t{0};
+ auto time_end = cl_ulong{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
- return (time_end - time_start) * 1.0e-6f;
+ return static_cast<float>(time_end - time_start) * 1.0e-6f;
}
// Accessor to the private data-member
@@ -152,6 +150,17 @@ class Platform {
cl_platform_id platform_;
};
+// Retrieves a vector with all platforms
+inline std::vector<Platform> GetAllPlatforms() {
+ auto num_platforms = cl_uint{0};
+ CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
+ auto all_platforms = std::vector<Platform>();
+ for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
+ all_platforms.push_back(Platform(platform_id));
+ }
+ return all_platforms;
+}
+
// =================================================================================================
// C++11 version of 'cl_device_id'
@@ -201,8 +210,8 @@ class Device {
std::vector<size_t> MaxWorkItemSizes() const {
return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
}
- cl_ulong LocalMemSize() const {
- return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
+ unsigned long LocalMemSize() const {
+ return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
}
std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
size_t CoreClock() const {
@@ -238,9 +247,11 @@ class Device {
// Query for a specific type of device or brand
bool IsCPU() const { return Type() == "CPU"; }
bool IsGPU() const { return Type() == "GPU"; }
- bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+ bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
+ Vendor() == "AuthenticAMD";; }
bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
- bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
+ bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
+ Vendor() == "GenuineIntel"; }
bool IsARM() const { return Vendor() == "ARM"; }
// Accessor to the private data-member
@@ -606,8 +617,7 @@ class Buffer {
// Retrieves the actual allocated size in bytes
size_t GetSize() const {
- auto bytes = size_t{0};
- CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
+ const auto bytes = sizeof(size_t);
auto result = size_t{0};
CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
return result;
@@ -658,17 +668,16 @@ class Kernel {
}
// Retrieves the amount of local memory used per work-group for this kernel
- cl_ulong LocalMemUsage(const Device &device) const {
- auto bytes = size_t{0};
+ unsigned long LocalMemUsage(const Device &device) const {
+ const auto bytes = sizeof(cl_ulong);
auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
- CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
auto result = cl_ulong{0};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
- return result;
+ return static_cast<unsigned long>(result);
}
// Retrieves the name of the kernel
- std::string GetFunctionName() {
+ std::string GetFunctionName() const {
auto bytes = size_t{0};
CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
auto result = std::string{};
@@ -689,6 +698,7 @@ class Kernel {
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
const std::vector<Event> &waitForEvents) {
+
// Builds a plain version of the events waiting list
auto waitForEventsPlain = std::vector<cl_event>();
for (auto &waitEvent : waitForEvents) {
diff --git a/src/database/database.cpp b/src/database/database.cpp
index 34c44a29..2340a89c 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -21,27 +21,42 @@
#include "database/kernels/xgemv_fast_rot.hpp"
#include "database/kernels/xger.hpp"
#include "database/kernels/xgemm.hpp"
+#include "database/kernels/xgemm_direct.hpp"
#include "database/kernels/copy.hpp"
#include "database/kernels/pad.hpp"
#include "database/kernels/transpose.hpp"
#include "database/kernels/padtranspose.hpp"
+#include "database/kernel_selection.hpp"
namespace clblast {
// =================================================================================================
// Initializes the database
-const std::vector<Database::DatabaseEntry> Database::database = {
- XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
- XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
- XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
- XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble,
- XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble,
- XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
- XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
- CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
- PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
- TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
- PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
+const std::vector<const Database::DatabaseEntry*> Database::database = {
+ &database::XaxpyHalf, &database::XaxpySingle, &database::XaxpyDouble, &database::XaxpyComplexSingle, &database::XaxpyComplexDouble,
+ &database::XdotHalf, &database::XdotSingle, &database::XdotDouble, &database::XdotComplexSingle, &database::XdotComplexDouble,
+ &database::XgemvHalf, &database::XgemvSingle, &database::XgemvDouble, &database::XgemvComplexSingle, &database::XgemvComplexDouble,
+ &database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble,
+ &database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble,
+ &database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble,
+ &database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble,
+ &database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble,
+ &database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble,
+ &database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
+ &database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
+ &database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
+ &database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
+};
+
+// The OpenCL device vendors
+const std::string Database::kDeviceVendorAll = "default";
+
+// Alternative names for some OpenCL vendors
+const std::unordered_map<std::string, std::string> Database::kVendorNames{
+ { "Intel(R) Corporation", "Intel" },
+ { "GenuineIntel", "Intel" },
+ { "Advanced Micro Devices, Inc.", "AMD" },
+ { "NVIDIA Corporation", "NVIDIA" },
};
// =================================================================================================
@@ -49,7 +64,7 @@ const std::vector<Database::DatabaseEntry> Database::database = {
// Constructor, computing device properties and populating the parameter-vector from the database.
// This takes an optional overlay database in case of custom tuning or custom kernels.
Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
- const Precision precision, const std::vector<DatabaseEntry> &overlay):
+ const Precision precision, const std::vector<const DatabaseEntry*> &overlay):
parameters_{} {
// Finds information of the current device
@@ -69,8 +84,8 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
for (auto &kernel: kernels) {
auto search_result = ParametersPtr{};
- for (auto db: { &overlay, &database }) {
- search_result = Search(kernel, device_type, device_vendor, device_name, precision, *db);
+ for (auto &db: { database, overlay}) {
+ search_result = Search(kernel, device_type, device_vendor, device_name, precision, db);
if (search_result) {
parameters_.insert(search_result->begin(), search_result->end());
break;
@@ -100,17 +115,17 @@ Database::ParametersPtr Database::Search(const std::string &this_kernel,
const std::string &this_vendor,
const std::string &this_device,
const Precision this_precision,
- const std::vector<DatabaseEntry> &this_database) const {
+ const std::vector<const DatabaseEntry*> &this_database) const {
// Selects the right kernel
for (auto &db: this_database) {
- if (db.kernel == this_kernel && db.precision == this_precision) {
+ if (db->kernel == this_kernel && db->precision == this_precision) {
// Searches for the right vendor and device type, or selects the default if unavailable. This
// assumes that the default vendor / device type is last in the database.
- for (auto &vendor: db.vendors) {
+ for (auto &vendor: db->vendors) {
if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
- (vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
+ (vendor.type == this_type || vendor.type == database::kDeviceTypeAll)) {
// Searches for the right device. If the current device is unavailable, selects the vendor
// default parameters. This assumes the default is last in the database.
diff --git a/src/database/database.hpp b/src/database/database.hpp
index a6ab49c5..8a3e7040 100644
--- a/src/database/database.hpp
+++ b/src/database/database.hpp
@@ -26,6 +26,19 @@
namespace clblast {
// =================================================================================================
+// A special namespace to hold all the global constant variables (including the database entries)
+namespace database {
+
+ // The OpenCL device types
+ const std::string kDeviceTypeCPU = "CPU";
+ const std::string kDeviceTypeGPU = "GPU";
+ const std::string kDeviceTypeAccelerator = "accelerator";
+ const std::string kDeviceTypeAll = "default";
+
+} // namespace database
+
+// =================================================================================================
+
// See comment at top of file for a description of the class
class Database {
public:
@@ -36,54 +49,32 @@ class Database {
// Structures for content inside the database
struct DatabaseDevice {
- const std::string name;
- const Parameters parameters;
+ std::string name;
+ Parameters parameters;
};
struct DatabaseVendor {
- const std::string type;
- const std::string name;
- const std::vector<DatabaseDevice> devices;
+ std::string type;
+ std::string name;
+ std::vector<DatabaseDevice> devices;
};
struct DatabaseEntry {
- const std::string kernel;
- const Precision precision;
- const std::vector<DatabaseVendor> vendors;
+ std::string kernel;
+ Precision precision;
+ std::vector<DatabaseVendor> vendors;
};
- // The OpenCL device types
- static constexpr auto kDeviceTypeCPU = "CPU";
- static constexpr auto kDeviceTypeGPU = "GPU";
- static constexpr auto kDeviceTypeAccelerator = "accelerator";
- static constexpr auto kDeviceTypeAll = "default";
-
// The OpenCL device vendors
- static constexpr auto kDeviceVendorAll = "default";
+ static const std::string kDeviceVendorAll;
// Alternative names for some OpenCL vendors
- const std::unordered_map<std::string,std::string> kVendorNames {
- {"Intel(R) Corporation", "Intel"},
- {"GenuineIntel", "Intel"},
- {"Advanced Micro Devices, Inc.", "AMD"},
- {"NVIDIA Corporation", "NVIDIA"},
- };
+ static const std::unordered_map<std::string, std::string> kVendorNames;
// The database consists of separate database entries, stored together in a vector
- static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
- static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
- static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
- static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble;
- static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble;
- static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
- static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
- static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
- static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
- static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
- static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
- static const std::vector<DatabaseEntry> database;
+ static const std::vector<const DatabaseEntry*> database;
// The constructor with a user-provided database overlay (potentially an empty vector)
explicit Database(const Queue &queue, const std::vector<std::string> &routines,
- const Precision precision, const std::vector<DatabaseEntry> &overlay);
+ const Precision precision, const std::vector<const DatabaseEntry*> &overlay);
// Accessor of values by key
size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
@@ -95,7 +86,8 @@ class Database {
// Search method for a specified database, returning pointer (possibly a nullptr)
ParametersPtr Search(const std::string &this_kernel, const std::string &this_type,
const std::string &this_vendor, const std::string &this_device,
- const Precision this_precision, const std::vector<DatabaseEntry> &db) const;
+ const Precision this_precision,
+ const std::vector<const DatabaseEntry*> &db) const;
// Found parameters suitable for this device/kernel
Parameters parameters_;
diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp
new file mode 100644
index 00000000..7e5e7821
--- /dev/null
+++ b/src/database/kernel_selection.hpp
@@ -0,0 +1,131 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel
+// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the
+// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use
+// some common default values.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionHalf = {
+ "KernelSelection", Precision::kHalf, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionSingle = {
+ "KernelSelection", Precision::kSingle, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionComplexSingle = {
+ "KernelSelection", Precision::kComplexSingle, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionDouble = {
+ "KernelSelection", Precision::kDouble, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionComplexDouble = {
+ "KernelSelection", Precision::kComplexDouble, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
index a6b7dfe8..16aa6b3f 100644
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::CopyHalf = {
+const Database::DatabaseEntry CopyHalf = {
"Copy", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::CopyHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::CopySingle = {
+const Database::DatabaseEntry CopySingle = {
"Copy", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::CopySingle = {
{ "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
@@ -84,7 +86,7 @@ const Database::DatabaseEntry Database::CopySingle = {
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::CopySingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::CopyComplexSingle = {
+const Database::DatabaseEntry CopyComplexSingle = {
"Copy", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -128,6 +130,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
@@ -147,7 +150,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
@@ -165,7 +168,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::CopyDouble = {
+const Database::DatabaseEntry CopyDouble = {
"Copy", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -205,13 +208,13 @@ const Database::DatabaseEntry Database::CopyDouble = {
{ "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Default
@@ -224,7 +227,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::CopyComplexDouble = {
+const Database::DatabaseEntry CopyComplexDouble = {
"Copy", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -264,7 +267,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@@ -282,4 +285,5 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
index 3cfabaf4..6c5e0c2f 100644
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::PadHalf = {
+const Database::DatabaseEntry PadHalf = {
"Pad", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::PadHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::PadSingle = {
+const Database::DatabaseEntry PadSingle = {
"Pad", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
@@ -84,7 +86,7 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
- { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::PadSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::PadComplexSingle = {
+const Database::DatabaseEntry PadComplexSingle = {
"Pad", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -134,10 +136,11 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // Intel accelerators
@@ -154,13 +157,13 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
@@ -173,7 +176,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::PadDouble = {
+const Database::DatabaseEntry PadDouble = {
"Pad", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -232,7 +235,7 @@ const Database::DatabaseEntry Database::PadDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::PadComplexDouble = {
+const Database::DatabaseEntry PadComplexDouble = {
"Pad", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -272,7 +275,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -290,4 +293,5 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
index 88bd4ea7..4003ec6d 100644
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::PadtransposeHalf = {
+const Database::DatabaseEntry PadtransposeHalf = {
"Padtranspose", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::PadtransposeHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::PadtransposeSingle = {
+const Database::DatabaseEntry PadtransposeSingle = {
"Padtranspose", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
+const Database::DatabaseEntry PadtransposeComplexSingle = {
"Padtranspose", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -134,6 +136,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -173,7 +176,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::PadtransposeDouble = {
+const Database::DatabaseEntry PadtransposeDouble = {
"Padtranspose", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -232,7 +235,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
+const Database::DatabaseEntry PadtransposeComplexDouble = {
"Padtranspose", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -272,7 +275,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
@@ -290,4 +293,5 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
index 0e1b608e..c5ea50c2 100644
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::TransposeHalf = {
+const Database::DatabaseEntry TransposeHalf = {
"Transpose", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::TransposeHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::TransposeSingle = {
+const Database::DatabaseEntry TransposeSingle = {
"Transpose", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::TransposeComplexSingle = {
+const Database::DatabaseEntry TransposeComplexSingle = {
"Transpose", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -134,6 +136,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
@@ -159,7 +162,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
}
@@ -167,7 +170,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::TransposeDouble = {
+const Database::DatabaseEntry TransposeDouble = {
"Transpose", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -207,7 +210,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@@ -226,7 +229,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::TransposeComplexDouble = {
+const Database::DatabaseEntry TransposeComplexDouble = {
"Transpose", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -278,4 +281,5 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
index 9c1bcd99..60471bef 100644
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::XaxpyHalf = {
+const Database::DatabaseEntry XaxpyHalf = {
"Xaxpy", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XaxpyHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::XaxpySingle = {
+const Database::DatabaseEntry XaxpySingle = {
"Xaxpy", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } },
{ "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
@@ -84,7 +86,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XaxpyComplexSingle = {
+const Database::DatabaseEntry XaxpyComplexSingle = {
"Xaxpy", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -134,6 +136,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"VW",4}, {"WGS",64}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
@@ -173,7 +176,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XaxpyDouble = {
+const Database::DatabaseEntry XaxpyDouble = {
"Xaxpy", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -213,7 +216,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
@@ -232,7 +235,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::XaxpyComplexDouble = {
+const Database::DatabaseEntry XaxpyComplexDouble = {
"Xaxpy", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -272,7 +275,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
+ { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
@@ -290,4 +293,5 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
index 987a990d..686b2839 100644
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::XdotHalf = {
+const Database::DatabaseEntry XdotHalf = {
"Xdot", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XdotHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::XdotSingle = {
+const Database::DatabaseEntry XdotSingle = {
"Xdot", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -55,6 +56,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",512}, {"WGS2",128} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } },
{ "Iris Pro", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
@@ -68,6 +70,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 750", { {"WGS1",128}, {"WGS2",32} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
@@ -84,7 +87,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XdotComplexSingle = {
+const Database::DatabaseEntry XdotComplexSingle = {
"Xdot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -106,6 +109,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",512}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "Iris Pro", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
@@ -119,6 +123,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",32} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
@@ -135,7 +140,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XdotDouble = {
+const Database::DatabaseEntry XdotDouble = {
"Xdot", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -160,6 +165,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",256} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
@@ -176,7 +182,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::XdotComplexDouble = {
+const Database::DatabaseEntry XdotComplexDouble = {
"Xdot", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -201,6 +207,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",32} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
@@ -216,4 +223,5 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
index d19c55b5..8303fa83 100644
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemmHalf = {
+const Database::DatabaseEntry XgemmHalf = {
"Xgemm", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
@@ -26,7 +27,7 @@ const Database::DatabaseEntry Database::XgemmHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemmSingle = {
+const Database::DatabaseEntry XgemmSingle = {
"Xgemm", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -57,9 +58,10 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "Intel(R) HD Graphics 530", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
- { "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
+ { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
@@ -77,7 +79,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 750", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
+ { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
@@ -96,7 +98,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemmComplexSingle = {
+const Database::DatabaseEntry XgemmComplexSingle = {
"Xgemm", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -127,6 +129,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
@@ -147,7 +150,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 750", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
@@ -166,7 +169,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemmDouble = {
+const Database::DatabaseEntry XgemmDouble = {
"Xgemm", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -206,7 +209,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+ { "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -225,7 +228,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemmComplexDouble = {
+const Database::DatabaseEntry XgemmComplexDouble = {
"Xgemm", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -265,7 +268,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -282,4 +285,5 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp
new file mode 100644
index 00000000..89499cc6
--- /dev/null
+++ b/src/database/kernels/xgemm_direct.hpp
@@ -0,0 +1,138 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectHalf = {
+ "XgemmDirect", Precision::kHalf, {
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectSingle = {
+ "XgemmDirect", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectComplexSingle = {
+ "XgemmDirect", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectDouble = {
+ "XgemmDirect", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectComplexDouble = {
+ "XgemmDirect", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index e5e8845e..90355b96 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvHalf = {
+const Database::DatabaseEntry XgemvHalf = {
"Xgemv", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgemvHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvSingle = {
+const Database::DatabaseEntry XgemvSingle = {
"Xgemv", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -57,6 +58,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Iris", { {"WGS1",64}, {"WPT1",2} } },
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2} } },
@@ -77,7 +79,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WPT1",1} } },
- { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } },
@@ -96,7 +98,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvComplexSingle = {
+const Database::DatabaseEntry XgemvComplexSingle = {
"Xgemv", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -120,6 +122,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Iris", { {"WGS1",256}, {"WPT1",1} } },
{ "Iris Pro", { {"WGS1",64}, {"WPT1",1} } },
@@ -140,7 +143,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } },
- { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
@@ -155,7 +158,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvDouble = {
+const Database::DatabaseEntry XgemvDouble = {
"Xgemv", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -188,7 +191,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
{ "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WPT1",1} } },
- { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } },
@@ -207,7 +210,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvComplexDouble = {
+const Database::DatabaseEntry XgemvComplexDouble = {
"Xgemv", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -249,4 +252,5 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp
index 52af628c..8e6254ac 100644
--- a/src/database/kernels/xgemv_fast.hpp
+++ b/src/database/kernels/xgemv_fast.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvFastHalf = {
+const Database::DatabaseEntry XgemvFastHalf = {
"XgemvFast", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgemvFastHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvFastSingle = {
+const Database::DatabaseEntry XgemvFastSingle = {
"XgemvFast", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -57,10 +58,11 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
{ "Intel(R) HD Graphics 530", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
- { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
+ { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
}
},
{ // Intel accelerators
@@ -77,7 +79,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
{ "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 750", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
- { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+ { "GeForce GTX 750 Ti", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@@ -96,7 +98,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
+const Database::DatabaseEntry XgemvFastComplexSingle = {
"XgemvFast", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -120,6 +122,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
@@ -139,7 +142,6 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
- { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@@ -153,7 +155,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvFastDouble = {
+const Database::DatabaseEntry XgemvFastDouble = {
"XgemvFast", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -186,7 +188,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 750", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
- { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+ { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
@@ -205,7 +207,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
+const Database::DatabaseEntry XgemvFastComplexDouble = {
"XgemvFast", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -247,4 +249,5 @@ const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp
index 328094e1..8fe45e01 100644
--- a/src/database/kernels/xgemv_fast_rot.hpp
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvFastRotHalf = {
+const Database::DatabaseEntry XgemvFastRotHalf = {
"XgemvFastRot", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
@@ -26,7 +27,7 @@ const Database::DatabaseEntry Database::XgemvFastRotHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvFastRotSingle = {
+const Database::DatabaseEntry XgemvFastRotSingle = {
"XgemvFastRot", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -44,6 +45,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
@@ -51,8 +53,9 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
- { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+ { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Default
@@ -65,7 +68,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
+const Database::DatabaseEntry XgemvFastRotComplexSingle = {
"XgemvFastRot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -83,6 +86,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } },
@@ -98,7 +102,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvFastRotDouble = {
+const Database::DatabaseEntry XgemvFastRotDouble = {
"XgemvFastRot", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -114,8 +118,9 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
- { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+ { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Default
@@ -128,7 +133,7 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
+const Database::DatabaseEntry XgemvFastRotComplexDouble = {
"XgemvFastRot", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -151,4 +156,5 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
index 3e9c25c1..f2fc2a9a 100644
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@@ -12,9 +12,10 @@
// =================================================================================================
namespace clblast {
+namespace database {
// =================================================================================================
-const Database::DatabaseEntry Database::XgerHalf = {
+const Database::DatabaseEntry XgerHalf = {
"Xger", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgerHalf = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgerSingle = {
+const Database::DatabaseEntry XgerSingle = {
"Xger", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -63,6 +64,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
{ "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
@@ -76,8 +78,9 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
- { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
+ { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Default
@@ -90,7 +93,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgerComplexSingle = {
+const Database::DatabaseEntry XgerComplexSingle = {
"Xger", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -120,6 +123,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
@@ -133,6 +137,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
@@ -147,7 +152,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgerDouble = {
+const Database::DatabaseEntry XgerDouble = {
"Xger", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -180,8 +185,9 @@ const Database::DatabaseEntry Database::XgerDouble = {
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
- { "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+ { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default
@@ -194,7 +200,7 @@ const Database::DatabaseEntry Database::XgerDouble = {
// =================================================================================================
-const Database::DatabaseEntry Database::XgerComplexDouble = {
+const Database::DatabaseEntry XgerComplexDouble = {
"Xger", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@@ -227,6 +233,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
{ "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
}
@@ -240,4 +247,5 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
};
// =================================================================================================
+} // namespace database
} // namespace clblast
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 223501fd..b0817242 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -204,7 +204,7 @@ R"(
#if PRECISION == 3232 || PRECISION == 6464
#define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y
#else
- #define COMPLEX_CONJUGATE(value) value = value
+ #define COMPLEX_CONJUGATE(value)
#endif
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl
new file mode 100644
index 00000000..a8bd450e
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part1.opencl
@@ -0,0 +1,273 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any
+// pre and and post-processing kernels.
+//
+// This kernel is seperated into three files. This is part 1 out of 3.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library. Note that all parameters here have a
+// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel.
+#ifndef WGD
+ #define WGD 8 // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64)
+#endif
+#ifndef MDIMCD
+ #define MDIMCD 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef NDIMCD
+ #define NDIMCD 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef MDIMAD
+ #define MDIMAD 8 // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
+#endif
+#ifndef NDIMBD
+ #define NDIMBD 8 // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
+#endif
+#ifndef KWID
+ #define KWID 1 // Unroll factor of the WGD loop (smaller or equal than WGD)
+#endif
+#ifndef VWMD
+ #define VWMD 1 // Vector width of matrices A and C
+#endif
+#ifndef VWND
+ #define VWND 1 // Vector width of matrix B
+#endif
+#ifndef PADA
+ #define PADA 1 // Local memory padding for matrix A
+#endif
+#ifndef PADB
+ #define PADB 1 // Local memory padding for matrix B
+#endif
+
+// Helper parameters based on the above tuning parameters
+#define MWID (WGD/MDIMCD) // Work per work-item (M-dimension)
+#define NWID (WGD/NDIMCD) // Work per work-item (N-dimension)
+#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
+#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
+#define MWAD (WGD/MDIMAD) // Amount of loads-per-thread for matrix A (M-dimension)
+#define KWAD (WGD/KDIMAD) // Amount of loads-per-thread for matrix A (K-dimension)
+#define KWBD (WGD/KDIMBD) // Amount of loads-per-thread for matrix B (K-dimension)
+#define NWBD (WGD/NDIMBD) // Amount of loads-per-thread for matrix B (N-dimension)
+
+// =================================================================================================
+
+// Data-widths in dimension M
+#if VWMD == 1
+ typedef real realMD;
+#elif VWMD == 2
+ typedef real2 realMD;
+#elif VWMD == 4
+ typedef real4 realMD;
+#elif VWMD == 8
+ typedef real8 realMD;
+#elif VWMD == 16
+ typedef real16 realMD;
+#endif
+
+// Data-widths in dimension N
+#if VWND == 1
+ typedef real realND;
+#elif VWND == 2
+ typedef real2 realND;
+#elif VWND == 4
+ typedef real4 realND;
+#elif VWND == 8
+ typedef real8 realND;
+#elif VWND == 16
+ typedef real16 realND;
+#endif
+
+// =================================================================================================
+
+// Initializes the accumulation registers to zero
+inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ SetToZero(cpm[ni][mi]);
+ }
+ }
+}
+
+// =================================================================================================
+
+// Performs the actual computation: Cpm += Apm * Bpm
+inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ MultiplyAdd(cpm[ni][mi], apm[mi], bpm[ni]);
+ }
+ }
+}
+
+// =================================================================================================
+
+// Loads global off-chip memory into thread-private register files. This function is specific for
+// loading the A input matrix.
+inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
+ const int a_ld, const int a_offset, const int idm, const int idk,
+ const int a_transpose, const int a_conjugate) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
+ apm[mi] = agms[a_index + a_offset];
+ if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
+ const int b_ld, const int b_offset, const int idn, const int idk,
+ const int b_transpose, const int b_conjugate) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
+ bpm[ni] = bgms[b_index + b_offset];
+ if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
+ }
+}
+
+// Loads global off-chip memory into thread-private register files. This function is specific for
+// loading the A input matrix. This is the same as above but now includes a bounds check.
+inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
+ const int a_ld, const int a_offset, const int idm, const int idk,
+ const int a_transpose, const int a_conjugate,
+ const int kSizeM) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ if (idm + mi < kSizeM) {
+ const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
+ apm[mi] = agms[a_index + a_offset];
+ if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
+ }
+ else {
+ SetToZero(apm[mi]);
+ }
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
+ const int b_ld, const int b_offset, const int idn, const int idk,
+ const int b_transpose, const int b_conjugate,
+ const int kSizeN) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ if (idn + ni < kSizeN) {
+ const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
+ bpm[ni] = bgms[b_index + b_offset];
+ if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
+ }
+ else {
+ SetToZero(bpm[ni]);
+ }
+ }
+}
+
+// =================================================================================================
+
+// Caches on-chip local memory into per-thread private memory (registers). This function is specific
+// for caching the A input matrix.
+inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
+ const int a_transpose) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ const int mg = mi + get_local_id(0)*MWID;
+ const int index = (a_transpose) ? mg*(WGD + PADA) + kg : kg*(WGD + PADA) + mg;
+ apm[mi] = alm[index];
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
+ const int b_transpose) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ const int ng = ni + get_local_id(1)*NWID;
+ const int index = (b_transpose) ? ng*(WGD + PADB) + kg : kg*(WGD + PADB) + ng;
+ bpm[ni] = blm[index];
+ }
+}
+
+// =================================================================================================
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
+ const int idm, const int idn,
+ const real alpha, const real beta,
+ const int c_ld, const int c_offset, const int c_transpose) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+
+ // Determines the destination index
+ int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi);
+
+ // The final multiplication with alpha (in case beta == 0)
+ real result;
+ if (IsZero(beta)) {
+ Multiply(result, alpha, cpm[ni][mi]);
+ }
+ // The final multiplication with alpha and the addition with beta*C
+ else {
+ AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]);
+ }
+ cgm[c_index + c_offset] = result;
+ }
+ }
+}
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
+ const int idm, const int idn, const int kSizeM, const int kSizeN,
+ const real alpha, const real beta,
+ const int c_ld, const int c_offset, const int c_transpose) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ if ((idm + mi) < kSizeM && (idn + ni) < kSizeN) {
+
+ // Determines the destination index
+ int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi);
+
+ // The final multiplication with alpha (in case beta == 0)
+ real result;
+ if (IsZero(beta)) {
+ Multiply(result, alpha, cpm[ni][mi]);
+ }
+ // The final multiplication with alpha and the addition with beta*C
+ else {
+ AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]);
+ }
+ cgm[c_index + c_offset] = result;
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
new file mode 100644
index 00000000..d77cbf65
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -0,0 +1,314 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix.
+inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate) {
+ #if MDIMCD == MDIMAD
+ const int la0 = get_local_id(0);
+ const int la1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int la0 = tid % MDIMAD;
+ const int la1 = tid / MDIMAD;
+ #endif
+ #pragma unroll
+ for (int mia=0; mia<MWAD/VWMD; ++mia) {
+ #pragma unroll
+ for (int kia=0; kia<KWAD; ++kia) {
+
+ // Computes the indices for the global memory
+ int mg = mia + la0*(MWAD/VWMD);
+ int kg = kia + la1*KWAD;
+ int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD);
+ int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
+ #if VWMD == 1
+ alm[kg*(WGD + PADA) + mg] = avec;
+ #elif VWMD == 2
+ alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
+ alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
+ #elif VWMD == 4
+ alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
+ alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
+ alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.z;
+ alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.w;
+ #elif VWMD == 8
+ alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
+ alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
+ alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
+ alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
+ alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
+ alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
+ alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
+ alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
+ #elif VWMD == 16
+ alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
+ alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
+ alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
+ alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
+ alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
+ alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
+ alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
+ alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
+ alm[kg*(WGD + PADA) + mg*VWMD + 8] = avec.s8;
+ alm[kg*(WGD + PADA) + mg*VWMD + 9] = avec.s9;
+ alm[kg*(WGD + PADA) + mg*VWMD + 10] = avec.sA;
+ alm[kg*(WGD + PADA) + mg*VWMD + 11] = avec.sB;
+ alm[kg*(WGD + PADA) + mg*VWMD + 12] = avec.sC;
+ alm[kg*(WGD + PADA) + mg*VWMD + 13] = avec.sD;
+ alm[kg*(WGD + PADA) + mg*VWMD + 14] = avec.sE;
+ alm[kg*(WGD + PADA) + mg*VWMD + 15] = avec.sF;
+ #endif
+ if (a_conjugate) {
+ for (int vm=0; vm<VWMD; ++vm) {
+ COMPLEX_CONJUGATE(alm[kg*(WGD + PADA) + mg*VWMD + vm]);
+ }
+ }
+ }
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate) {
+ #if MDIMCD == NDIMBD
+ const int lb0 = get_local_id(0);
+ const int lb1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int lb0 = tid % NDIMBD;
+ const int lb1 = tid / NDIMBD;
+ #endif
+ #pragma unroll
+ for (int kib=0; kib<KWBD; ++kib) {
+ #pragma unroll
+ for (int nib=0; nib<NWBD/VWND; ++nib) {
+
+ // Computes the indices for the global memory
+ int ng = nib + lb0*(NWBD/VWND);
+ int kg = kib + lb1*KWBD;
+ int idn = (b_transpose) ? ng + kwg/VWND : ng + GetGroupID1()*(WGD/VWND);
+ int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
+ #if VWND == 1
+ blm[kg*(WGD + PADB) + ng] = bvec;
+ #elif VWND == 2
+ blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
+ blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
+ #elif VWND == 4
+ blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
+ blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
+ blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.z;
+ blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.w;
+ #elif VWND == 8
+ blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
+ blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
+ blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
+ blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
+ blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
+ blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
+ blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
+ blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
+ #elif VWND == 16
+ blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
+ blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
+ blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
+ blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
+ blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
+ blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
+ blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
+ blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
+ blm[kg*(WGD + PADB) + ng*VWND + 8] = bvec.s8;
+ blm[kg*(WGD + PADB) + ng*VWND + 9] = bvec.s9;
+ blm[kg*(WGD + PADB) + ng*VWND + 10] = bvec.sA;
+ blm[kg*(WGD + PADB) + ng*VWND + 11] = bvec.sB;
+ blm[kg*(WGD + PADB) + ng*VWND + 12] = bvec.sC;
+ blm[kg*(WGD + PADB) + ng*VWND + 13] = bvec.sD;
+ blm[kg*(WGD + PADB) + ng*VWND + 14] = bvec.sE;
+ blm[kg*(WGD + PADB) + ng*VWND + 15] = bvec.sF;
+ #endif
+ if (b_conjugate) {
+ for (int vn=0; vn<VWND; ++vn) {
+ COMPLEX_CONJUGATE(blm[kg*(WGD + PADB) + ng*VWND + vn]);
+ }
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. In contrast to the functions above, this function performs doesn't
+// use the vector data-types.
+inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate) {
+ #if MDIMCD == MDIMAD
+ const int la0 = get_local_id(0);
+ const int la1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int la0 = tid % MDIMAD;
+ const int la1 = tid / MDIMAD;
+ #endif
+ #pragma unroll
+ for (int mia=0; mia<MWAD; ++mia) {
+ #pragma unroll
+ for (int kia=0; kia<KWAD; ++kia) {
+
+ // Computes the indices for the global memory
+ int mg = mia + la0*MWAD;
+ int kg = kia + la1*KWAD;
+ int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
+ int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ real result = agms[idk*a_ld + idm + a_offset];
+ if (a_conjugate) { COMPLEX_CONJUGATE(result); }
+ alm[kg*(WGD + PADA) + mg] = result;
+ }
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate) {
+ #if MDIMCD == NDIMBD
+ const int lb0 = get_local_id(0);
+ const int lb1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int lb0 = tid % NDIMBD;
+ const int lb1 = tid / NDIMBD;
+ #endif
+ #pragma unroll
+ for (int kib=0; kib<KWBD; ++kib) {
+ #pragma unroll
+ for (int nib=0; nib<NWBD; ++nib) {
+
+ // Computes the indices for the global memory
+ int ng = nib + lb0*NWBD;
+ int kg = kib + lb1*KWBD;
+ int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
+ int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ real result = bgms[idk*b_ld + idn + b_offset];
+ if (b_conjugate) { COMPLEX_CONJUGATE(result); }
+ blm[kg*(WGD + PADB) + ng] = result;
+ }
+ }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. In contrast to the functions above, this function performs bounds
+// checks and doesn't use the vector data-types.
+inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate,
+ const int kSizeM, const int kSizeK) {
+ #if MDIMCD == MDIMAD
+ const int la0 = get_local_id(0);
+ const int la1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int la0 = tid % MDIMAD;
+ const int la1 = tid / MDIMAD;
+ #endif
+ #pragma unroll
+ for (int mia=0; mia<MWAD; ++mia) {
+ #pragma unroll
+ for (int kia=0; kia<KWAD; ++kia) {
+
+ // Computes the indices for the global memory
+ int mg = mia + la0*MWAD;
+ int kg = kia + la1*KWAD;
+ int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
+ int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ int condition = (a_transpose) ? idm < kSizeK : idm < kSizeM;
+ if (condition) {
+ real result = agms[idk*a_ld + idm + a_offset];
+ if (a_conjugate) { COMPLEX_CONJUGATE(result); }
+ alm[kg*(WGD + PADA) + mg] = result;
+ }
+ else {
+ SetToZero(alm[kg*(WGD + PADA) + mg]);
+ }
+ }
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate,
+ const int kSizeN, const int kSizeK) {
+ #if MDIMCD == NDIMBD
+ const int lb0 = get_local_id(0);
+ const int lb1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int lb0 = tid % NDIMBD;
+ const int lb1 = tid / NDIMBD;
+ #endif
+ #pragma unroll
+ for (int kib=0; kib<KWBD; ++kib) {
+ #pragma unroll
+ for (int nib=0; nib<NWBD; ++nib) {
+
+ // Computes the indices for the global memory
+ int ng = nib + lb0*NWBD;
+ int kg = kib + lb1*KWBD;
+ int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
+ int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ int condition = (b_transpose) ? idn < kSizeK : idn < kSizeN;
+ if (condition) {
+ real result = bgms[idk*b_ld + idn + b_offset];
+ if (b_conjugate) { COMPLEX_CONJUGATE(result); }
+ blm[kg*(WGD + PADB) + ng] = result;
+ }
+ else {
+ SetToZero(blm[kg*(WGD + PADB) + ng]);
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
new file mode 100644
index 00000000..a9350e00
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -0,0 +1,214 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Main body of the kernel. This is the direct version without pre/post processing and restrictions.
+inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ __local real* alm, __local real* blm,
+ const int a_transpose, const int b_transpose, const int c_transpose,
+ const int a_conjugate, const int b_conjugate) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Extra pointers to scalar versions of global memory
+ const __global real* restrict agms = (const __global real* restrict) agm;
+ const __global real* restrict bgms = (const __global real* restrict) bgm;
+
+ // Allocates workitem-private memory (registers)
+ real apm[MWID];
+ real bpm[NWID];
+ real cpm[NWID][MWID];
+
+ // Initializes the accumulation registers
+ InitAccRegistersDirect(cpm);
+
+ // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section
+ // processes only the main parts: output blocks of WGD by WGD.
+ const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD;
+ const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD;
+ if ((idm < (kSizeM/WGD)*WGD) && (idn < (kSizeN/WGD)*WGD)) {
+
+ // Loops over all complete workgroup tiles (K-dimension)
+ int kwg = 0;
+ for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
+
+ // Loads data: off-chip --> local (matrix A and B)
+ if (a_ld % VWMD == 0) {
+ GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
+ }
+ else {
+ GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
+ }
+ if (b_ld % VWND == 0) {
+ GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
+ }
+ else {
+ GlobalToLocalScalarB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loops over all workitem tiles, unrolled by a factor KWID
+ for (int pwi=0; pwi<WGD; pwi+=KWID) {
+ #pragma unroll
+ for (int pit=0; pit<KWID; ++pit) {
+ int kg = pwi + pit;
+
+ // Loads data: local --> private (matrix A and B)
+ LocalToPrivateDirectA(alm, apm, kg, a_transpose);
+ LocalToPrivateDirectB(blm, bpm, kg, b_transpose);
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulateDirect(cpm, apm, bpm);
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ // Loop over the remaining part (incomplete tile in K-dimension)
+ for (; kwg < kSizeK; ++kwg) {
+
+ // Loads data: off-chip --> private (matrix A and B)
+ GlobalToPrivateDirectA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate);
+ GlobalToPrivateDirectB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate);
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulateDirect(cpm, apm, bpm);
+ }
+
+ // Stores a tile of results and performs the multiplication with alpha and beta
+ StoreResultsDirect(cgm, cpm, idm, idn, alpha, beta, c_ld, c_offset, c_transpose);
+ }
+
+ // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions)
+ else {
+
+ // Loops over all complete workgroup tiles (K-dimension)
+ int kwg = 0;
+ for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
+
+ // Loads data: off-chip --> local (matrix A and B)
+ GlobalToLocalCheckedA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate, kSizeM, kSizeK);
+ GlobalToLocalCheckedB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate, kSizeN, kSizeK);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loops over all workitem tiles, unrolled by a factor KWID
+ for (int pwi=0; pwi<WGD; pwi+=KWID) {
+ #pragma unroll
+ for (int pit=0; pit<KWID; ++pit) {
+ int kg = pwi + pit;
+
+ // Loads data: local --> private (matrix A and B)
+ LocalToPrivateDirectA(alm, apm, kg, a_transpose);
+ LocalToPrivateDirectB(blm, bpm, kg, b_transpose);
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulateDirect(cpm, apm, bpm);
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ // Loop over the remaining part (incomplete tile in K-dimension)
+ for (; kwg < kSizeK; ++kwg) {
+
+ // Loads data: off-chip --> private (matrix A and B)
+ GlobalToPrivateCheckedA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM);
+ GlobalToPrivateCheckedB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN);
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulateDirect(cpm, apm, bpm);
+ }
+
+ // Stores a tile of results and performs the multiplication with alpha and beta
+ StoreResultsChecked(cgm, cpm, idm, idn, kSizeM, kSizeN, alpha, beta, c_ld, c_offset, c_transpose);
+ }
+}
+
+// =================================================================================================
+
+// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ const int c_transpose, const int a_conjugate, const int b_conjugate) {
+ __local real alm[WGD * (WGD + PADA)];
+ __local real blm[WGD * (WGD + PADB)];
+ XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+ agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+ alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ const int c_transpose, const int a_conjugate, const int b_conjugate) {
+ __local real alm[WGD * (WGD + PADA)];
+ __local real blm[WGD * (WGD + PADB)];
+ XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+ agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+ alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ const int c_transpose, const int a_conjugate, const int b_conjugate) {
+ __local real alm[WGD * (WGD + PADA)];
+ __local real blm[WGD * (WGD + PADB)];
+ XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+ agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+ alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [transposed, transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ const int c_transpose, const int a_conjugate, const int b_conjugate) {
+ __local real alm[WGD * (WGD + PADA)];
+ __local real blm[WGD * (WGD + PADB)];
+ XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+ agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+ alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate);
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/msvc.hpp b/src/msvc.hpp
new file mode 100644
index 00000000..a45105df
--- /dev/null
+++ b/src/msvc.hpp
@@ -0,0 +1,39 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides macro's and definitions to make compilation work on Microsoft Visual Studio,
+// in particular for versions older than 2015 with limited C++11 support.
+// MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015)
+// MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013)
+// MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012)
+// MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010)
+// MSVC++ 9.0 _MSC_VER == 1500 (Visual Studio 2008)
+//
+// =================================================================================================
+
+#ifndef CLBLAST_MSVC_HPP_
+#define CLBLAST_MSVC_HPP_
+
+namespace clblast {
+// =================================================================================================
+#ifdef _MSC_VER
+
+// No support for constexpr prior to 2015. Note that this only works with constants, not with
+// constexpr functions (unused in this project).
+#if _MSC_VER < 1900
+#define constexpr const
+#endif
+
+// _MSC_VER
+#endif
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_MSVC_HPP_
+#endif
diff --git a/src/routine.cpp b/src/routine.cpp
index 189ae190..80764b74 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -14,6 +14,7 @@
#include <string>
#include <vector>
#include <chrono>
+#include <cstdlib>
#include "routine.hpp"
@@ -23,7 +24,7 @@ namespace clblast {
// Constructor: not much here, because no status codes can be returned
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
- const std::vector<Database::DatabaseEntry> &userDatabase):
+ const std::vector<const Database::DatabaseEntry*> &userDatabase):
precision_(precision),
routine_name_(name),
queue_(queue),
@@ -42,13 +43,19 @@ StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
+ // Sets the build options from an environmental variable (if set)
+ auto options = std::vector<std::string>();
+ const auto environment_variable = std::getenv("CLBLAST_BUILD_OPTIONS");
+ if (environment_variable != nullptr) {
+ options.push_back(std::string(environment_variable));
+ }
+
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
try {
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
- auto options = std::vector<std::string>();
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
@@ -115,7 +122,6 @@ StatusCode Routine::SetUp() {
// Compiles the kernel
try {
auto program = Program(context_, source_string);
- auto options = std::vector<std::string>();
const auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
diff --git a/src/routine.hpp b/src/routine.hpp
index f5c607af..8582a2b7 100644
--- a/src/routine.hpp
+++ b/src/routine.hpp
@@ -36,7 +36,7 @@ class Routine {
// built-in database.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
- const std::vector<Database::DatabaseEntry> &userDatabase = {});
+ const std::vector<const Database::DatabaseEntry*> &userDatabase = {});
// Set-up phase of the kernel
StatusCode SetUp();
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 0b8e768f..1602c69f 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -22,7 +22,9 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
+ Routine(queue, event, name,
+ {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+ PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
@@ -32,10 +34,16 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/convert_symmetric.opencl"
#include "../../kernels/level3/convert_triangular.opencl"
#include "../../kernels/level3/convert_hermitian.opencl"
+ #include "../../kernels/level3/xgemm_direct_part1.opencl"
+ #include "../../kernels/level3/xgemm_direct_part2.opencl"
+ #include "../../kernels/level3/xgemm_direct_part3.opencl"
+ ;
+ auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
+ source_string_ += source_string_part_2;
}
// =================================================================================================
@@ -98,6 +106,44 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
if (ErrorIn(status)) { return status; }
+ // Selects which version of GEMM to run
+ const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
+ if (do_gemm_direct) { // for small sizes (single kernel)
+ return GemmDirect(m, n, k, alpha,
+ a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
+ c_buffer, c_offset, c_ld,
+ a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate);
+ }
+ else { // for larger sizes (pre/post-processing plus a very fast kernel)
+ return GemmIndirect(m, n, k, alpha,
+ a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
+ c_buffer, c_offset, c_ld,
+ a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate,
+ a_one, a_two, a_want_rotated,
+ b_one, b_two, b_want_rotated,
+ c_one, c_two, c_want_rotated);
+ }
+}
+
+// =================================================================================================
+
+// The indirect version of GEMM. This uses the faster but non-general kernel. It has specific
+// requirements, but several pre and post-processing kernels take care of those. However, the
+// overhead of these extra kernels might not be ideal for certain devices/arguments.
+template <typename T>
+StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate,
+ const size_t a_one, const size_t a_two, const bool a_want_rotated,
+ const size_t b_one, const size_t b_two, const bool b_want_rotated,
+ const size_t c_one, const size_t c_two, const bool c_want_rotated) {
+ auto status = StatusCode::kSuccess;
+
// Calculates the ceiled versions of m, n, and k
const auto m_ceiled = Ceil(m, db_["MWG"]);
const auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -217,6 +263,66 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
+
+// =================================================================================================
+
+// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
+template <typename T>
+StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate) {
+
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Retrieves the proper XgemmDirect kernel from the compiled binary
+ try {
+ const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
+ (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
+ auto kernel = Kernel(program, name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(m));
+ kernel.SetArgument(1, static_cast<int>(n));
+ kernel.SetArgument(2, static_cast<int>(k));
+ kernel.SetArgument(3, GetRealArg(alpha));
+ kernel.SetArgument(4, GetRealArg(beta));
+ kernel.SetArgument(5, a_buffer());
+ kernel.SetArgument(6, static_cast<int>(a_offset));
+ kernel.SetArgument(7, static_cast<int>(a_ld));
+ kernel.SetArgument(8, b_buffer());
+ kernel.SetArgument(9, static_cast<int>(b_offset));
+ kernel.SetArgument(10, static_cast<int>(b_ld));
+ kernel.SetArgument(11, c_buffer());
+ kernel.SetArgument(12, static_cast<int>(c_offset));
+ kernel.SetArgument(13, static_cast<int>(c_ld));
+ kernel.SetArgument(14, static_cast<int>(c_do_transpose));
+ kernel.SetArgument(15, static_cast<int>(a_conjugate));
+ kernel.SetArgument(16, static_cast<int>(b_conjugate));
+
+ // Computes the global and local thread sizes
+ const auto m_ceiled = Ceil(m, db_["WGD"]);
+ const auto n_ceiled = Ceil(n, db_["WGD"]);
+ const auto global = std::vector<size_t>{
+ (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
+ (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
+ };
+ const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
+
+ // Launches the kernel
+ auto status = RunKernel(kernel, queue_, device_, global, local, event_);
+ if (ErrorIn(status)) { return status; }
+
+ // Successfully finished the computation
+ return StatusCode::kSuccess;
+ } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
// =================================================================================================
// Compiles the templated class
diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp
index bc51c7f5..46e12453 100644
--- a/src/routines/level3/xgemm.hpp
+++ b/src/routines/level3/xgemm.hpp
@@ -35,6 +35,29 @@ class Xgemm: public Routine {
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ // Indirect version of GEMM (with pre and post-processing kernels)
+ StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate,
+ const size_t a_one, const size_t a_two, const bool a_want_rotated,
+ const size_t b_one, const size_t b_two, const bool b_want_rotated,
+ const size_t c_one, const size_t c_two, const bool c_want_rotated);
+
+ // Direct version of GEMM (no pre and post-processing kernels)
+ StatusCode GemmDirect(const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate);
};
// =================================================================================================
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index 78ded56e..c57aab39 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -47,6 +47,7 @@ class TuneCopy {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 90f5ea82..9486ee8d 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -47,6 +47,7 @@ class TunePad {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 10fa80cb..2d9d5e49 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -47,6 +47,7 @@ class TuneTranspose {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index 507718eb..d364dabe 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -47,6 +47,7 @@ class TunePadTranspose {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 0033b3c6..403ee9e4 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -51,6 +51,7 @@ class TuneXaxpy {
static size_t DefaultN() { return 4096*1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp
index 1581e13f..f8416761 100644
--- a/src/tuning/kernels/xdot.cpp
+++ b/src/tuning/kernels/xdot.cpp
@@ -47,6 +47,7 @@ class TuneXdot {
static size_t DefaultN() { return 2*1024*1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 4cb7fd00..0eb1875b 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -52,6 +52,7 @@ class TuneXgemm {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1024; }
static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
@@ -126,10 +127,10 @@ class TuneXgemm {
// Sets the local memory size
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
auto LocalMemorySize = [args] (std::vector<size_t> v) {
- return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision));
+ return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision));
};
- tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM",
- "SB", "KWG", "NWG", "VWN"});
+ tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG",
+ "SB", "KWG", "NWG"});
}
// Sets the base thread configuration
diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp
new file mode 100644
index 00000000..204e0be4
--- /dev/null
+++ b/src/tuning/kernels/xgemm_direct.cpp
@@ -0,0 +1,196 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations:
+// - V==1: This tests some limited set of tuning parameters exhaustively.
+// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, int V>
+class TuneXgemmDirect {
+ public:
+
+ // The representative kernel and the source code
+ static std::string KernelFamily() { return (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; }
+ static std::string KernelName() { return "XgemmDirectTN"; }
+ static std::string GetSources() {
+ return
+ #include "../src/kernels/common.opencl"
+ #include "../src/kernels/level3/xgemm_direct_part1.opencl"
+ #include "../src/kernels/level3/xgemm_direct_part2.opencl"
+ #include "../src/kernels/level3/xgemm_direct_part3.opencl"
+ ;
+ }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction};
+ }
+
+ // Tests for valid arguments
+ static void TestValidArguments(const Arguments<T> &) { }
+
+ // Sets the default values for the arguments
+ static size_t DefaultM() { return 256; }
+ static size_t DefaultN() { return 256; }
+ static size_t DefaultK() { return 256; }
+ static double DefaultFraction() { return (V==1) ? 1.0 : 32.0; } // test all or sample randomly
+ static size_t DefaultNumRuns() { return 4; } // run every kernel this many times for averaging
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
+ static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
+ static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.k; }
+ static size_t GetSizeB(const Arguments<T> &args) { return args.n * args.k; }
+ static size_t GetSizeC(const Arguments<T> &args) { return args.m * args.n; }
+ static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
+
+ // Sets the tuning parameters and their possible values
+ static void SetParameters(cltune::Tuner &tuner, const size_t id) {
+ if (V==1) { // limited subset of tuning parameters - but explorable exhaustively
+ tuner.AddParameter(id, "WGD", {8, 16, 32});
+ tuner.AddParameter(id, "MDIMCD", {8, 16, 32});
+ tuner.AddParameter(id, "NDIMCD", {8, 16, 32});
+ tuner.AddParameter(id, "MDIMAD", {8, 16, 32});
+ tuner.AddParameter(id, "NDIMBD", {8, 16, 32});
+ tuner.AddParameter(id, "KWID", {2});
+ tuner.AddParameter(id, "VWMD", {1, 2, 4, 8});
+ tuner.AddParameter(id, "VWND", {1, 2, 4, 8});
+ tuner.AddParameter(id, "PADA", {1});
+ tuner.AddParameter(id, "PADB", {1});
+ } // a lot more tuning parameters - has to be sampled randomly, too much to test all
+ else {
+ tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128});
+ tuner.AddParameter(id, "MDIMCD", {8, 16, 32});
+ tuner.AddParameter(id, "NDIMCD", {8, 16, 32});
+ tuner.AddParameter(id, "MDIMAD", {8, 16, 32});
+ tuner.AddParameter(id, "NDIMBD", {8, 16, 32});
+ tuner.AddParameter(id, "KWID", {2, 8, 16});
+ tuner.AddParameter(id, "VWMD", {1, 2, 4, 8});
+ tuner.AddParameter(id, "VWND", {1, 2, 4, 8});
+ tuner.AddParameter(id, "PADA", {0, 1});
+ tuner.AddParameter(id, "PADB", {0, 1});
+ }
+ }
+
+ // Sets the constraints
+ static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+ auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
+ auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
+ auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
+ // Requirement for unrolling the WGD loop
+ tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"});
+ // Required for integer MWID and NWID
+ tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"});
+ tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"});
+ // Required for integer MWIAD and NWIBD
+ tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"});
+ tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"});
+ // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...)
+ tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"});
+ tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"});
+
+ // Extra constraints for variation 1 to limit the set of options significantly
+ if (V==1) {
+ auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
+ tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"});
+ tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"});
+ }
+ }
+
+ // Sets the local memory size
+ static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
+ auto LocalMemorySize = [args] (std::vector<size_t> v) {
+ return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision));
+ };
+ tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"});
+ }
+
+ // Sets the base thread configuration
+ static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
+ static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
+ static std::vector<size_t> LocalSize() { return {1, 1}; }
+ static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
+
+ // Transforms the thread configuration based on the parameters
+ using TransformVector = std::vector<std::vector<std::string>>;
+ static TransformVector MulLocal() { return {{"MDIMCD", "NDIMCD"}}; }
+ static TransformVector DivLocal() { return {}; }
+ static TransformVector MulGlobal() { return {{"MDIMCD", "NDIMCD"}}; }
+ static TransformVector DivGlobal() { return {{"WGD", "WGD"}}; }
+
+ // Sets the kernel's arguments
+ static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
+ std::vector<T> &, std::vector<T> &,
+ std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
+ std::vector<T> &) {
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(static_cast<int>(args.k));
+ tuner.AddArgumentScalar(GetRealArg(args.alpha));
+ tuner.AddArgumentScalar(GetRealArg(args.beta));
+ tuner.AddArgumentInput(a_mat);
+ tuner.AddArgumentScalar(0); // a_offset
+ tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld
+ tuner.AddArgumentInput(b_mat);
+ tuner.AddArgumentScalar(0); // b_offset
+ tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld
+ tuner.AddArgumentOutput(c_mat);
+ tuner.AddArgumentScalar(0); // c_offset
+ tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld
+ tuner.AddArgumentScalar(1); // c_do_transpose
+ tuner.AddArgumentScalar(0); // a_conjugate
+ tuner.AddArgumentScalar(0); // b_conjugate
+ }
+
+ // Describes how to compute the performance metrics
+ static size_t GetMetric(const Arguments<T> &args) {
+ return 2 * args.m * args.n * args.k;
+ }
+ static std::string PerformanceUnit() { return "GFLOPS"; }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Function to tune a specific variation V (not within the clblast namespace)
+template <int V>
+void StartVariation(int argc, char *argv[]) {
+ switch(clblast::GetPrecision(argc, argv)) {
+ case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemmDirect<half,V>, half>(argc, argv); break;
+ case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemmDirect<float,V>, float>(argc, argv); break;
+ case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemmDirect<double,V>, double>(argc, argv); break;
+ case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemmDirect<float2,V>, float2>(argc, argv); break;
+ case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemmDirect<double2,V>, double2>(argc, argv); break;
+ }
+}
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ StartVariation<1>(argc, argv);
+ StartVariation<2>(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index 7229602d..f332f52a 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -50,6 +50,7 @@ class TuneXgemv {
static size_t DefaultN() { return 2048; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index 1fb5c531..c3d0c7dd 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -47,6 +47,7 @@ class TuneXger {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.m; }
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 19df5f9a..afb092bc 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -30,6 +30,7 @@ namespace clblast {
// that it is automatically compiled for the various kernels (given as the 'C' template argument).
template <typename C, typename T>
void Tuner(int argc, char* argv[]) {
+ constexpr auto kSeed = 42; // fixed seed for reproducibility
// Sets the parameters and platform/device for which to tune (command-line options)
auto help = std::string{"* Options given/available:\n"};
@@ -45,6 +46,8 @@ void Tuner(int argc, char* argv[]) {
if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
}
+ const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, C::DefaultNumRuns());
+
fprintf(stdout, "%s\n", help.c_str());
// Tests validity of the given arguments
@@ -73,12 +76,12 @@ void Tuner(int argc, char* argv[]) {
auto b_mat = std::vector<T>(C::GetSizeB(args));
auto c_mat = std::vector<T>(C::GetSizeC(args));
auto temp = std::vector<T>(C::GetSizeTemp(args));
- PopulateVector(x_vec);
- PopulateVector(y_vec);
- PopulateVector(a_mat);
- PopulateVector(b_mat);
- PopulateVector(c_mat);
- PopulateVector(temp);
+ PopulateVector(x_vec, kSeed);
+ PopulateVector(y_vec, kSeed);
+ PopulateVector(a_mat, kSeed);
+ PopulateVector(b_mat, kSeed);
+ PopulateVector(c_mat, kSeed);
+ PopulateVector(temp, kSeed);
// Initializes the tuner for the chosen device
cltune::Tuner tuner(args.platform_id, args.device_id);
@@ -126,6 +129,7 @@ void Tuner(int argc, char* argv[]) {
C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
// Starts the tuning process
+ tuner.SetNumRuns(num_runs);
tuner.Tune();
// Prints the results to screen
@@ -134,7 +138,7 @@ void Tuner(int argc, char* argv[]) {
// Also prints the performance of the best-case in terms of GB/s or GFLOPS
if (time_ms != 0.0) {
- printf("[ -------> ] %.1lf ms", time_ms);
+ printf("[ -------> ] %.2lf ms", time_ms);
printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
}
diff --git a/src/utilities.cpp b/src/utilities.cpp
index 77bc72d7..86cc2d13 100644
--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -270,40 +270,40 @@ unsigned int GetRandomSeed() {
// Create a random number generator and populates a vector with samples from a random distribution
template <typename T>
-void PopulateVector(std::vector<T> &vector) {
+void PopulateVector(std::vector<T> &vector, const unsigned int seed) {
auto lower_limit = static_cast<T>(kTestDataLowerLimit);
auto upper_limit = static_cast<T>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<T> dist(lower_limit, upper_limit);
for (auto &element: vector) { element = dist(mt); }
}
-template void PopulateVector<float>(std::vector<float>&);
-template void PopulateVector<double>(std::vector<double>&);
+template void PopulateVector<float>(std::vector<float>&, const unsigned int);
+template void PopulateVector<double>(std::vector<double>&, const unsigned int);
// Specialized versions of the above for complex data-types
template <>
-void PopulateVector(std::vector<float2> &vector) {
+void PopulateVector(std::vector<float2> &vector, const unsigned int seed) {
auto lower_limit = static_cast<float>(kTestDataLowerLimit);
auto upper_limit = static_cast<float>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
}
template <>
-void PopulateVector(std::vector<double2> &vector) {
+void PopulateVector(std::vector<double2> &vector, const unsigned int seed) {
auto lower_limit = static_cast<double>(kTestDataLowerLimit);
auto upper_limit = static_cast<double>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<double> dist(lower_limit, upper_limit);
for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
}
// Specialized versions of the above for half-precision
template <>
-void PopulateVector(std::vector<half> &vector) {
+void PopulateVector(std::vector<half> &vector, const unsigned int seed) {
const auto lower_limit = static_cast<float>(kTestDataLowerLimit);
const auto upper_limit = static_cast<float>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
for (auto &element: vector) { element = FloatToHalf(dist(mt)); }
}
diff --git a/src/utilities.hpp b/src/utilities.hpp
index 75bd5a69..038a8a96 100644
--- a/src/utilities.hpp
+++ b/src/utilities.hpp
@@ -25,6 +25,8 @@
#include "clblast_half.h"
#include "clpp11.hpp"
+#include "msvc.hpp"
+
namespace clblast {
// =================================================================================================
@@ -206,7 +208,7 @@ bool CheckArgument(const int argc, char *argv[], std::string &help, const std::s
// =================================================================================================
// Helper function to check for errors in the status code
-constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
+inline bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
// =================================================================================================
@@ -219,7 +221,7 @@ constexpr auto kTestDataUpperLimit = 2.0;
// Populates a vector with random data
template <typename T>
-void PopulateVector(std::vector<T> &vector);
+void PopulateVector(std::vector<T> &vector, const unsigned int seed);
// =================================================================================================