From dc93523204ebe8562145997673f25f8e59f9d2f5 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 8 Feb 2017 21:14:38 +0100 Subject: Added tuning results for Titan X (Pascal version) --- README.md | 1 + src/database/kernels/copy.hpp | 10 +++++++--- src/database/kernels/pad.hpp | 4 ++++ src/database/kernels/padtranspose.hpp | 4 ++++ src/database/kernels/transpose.hpp | 4 ++++ src/database/kernels/xaxpy.hpp | 4 ++++ src/database/kernels/xdot.hpp | 6 +++++- src/database/kernels/xgemm.hpp | 10 +++++++--- src/database/kernels/xgemm_direct.hpp | 10 +++++++--- src/database/kernels/xgemv.hpp | 5 ++++- src/database/kernels/xgemv_fast.hpp | 2 ++ src/database/kernels/xgemv_fast_rot.hpp | 2 ++ src/database/kernels/xger.hpp | 8 ++++++-- 13 files changed, 57 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 35e79db8..67c7703c 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,7 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC - GeForce GTX TITAN - GeForce GTX TITAN Black - GeForce GTX TITAN X + - TITAN X (Pascal) - Tesla K20m - Tesla K40m * AMD GPUs: diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index f0431933..9bc613b9 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -96,6 +96,7 @@ const Database::DatabaseEntry CopySingle = { { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, { "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "TITAN X (Pascal)", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, 
{"COPY_WPT",1} } }, @@ -165,9 +166,10 @@ const Database::DatabaseEntry CopyComplexSingle = { { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "TITAN X (Pascal)", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // Default @@ -229,14 +231,15 @@ const Database::DatabaseEntry CopyDouble = { { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, } @@ -293,6 +296,7 @@ const Database::DatabaseEntry CopyComplexDouble = { { "GeForce GTX TITAN", { {"COPY_DIMX",16}, 
{"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "TITAN X (Pascal)", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index 3378709c..f22399eb 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -96,6 +96,7 @@ const Database::DatabaseEntry PadSingle = { { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN Black", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "TITAN X (Pascal)", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, @@ -173,6 +174,7 @@ const Database::DatabaseEntry PadComplexSingle = { { "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "TITAN X (Pascal)", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, 
{"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, @@ -237,6 +239,7 @@ const Database::DatabaseEntry PadDouble = { { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "TITAN X (Pascal)", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -301,6 +304,7 @@ const Database::DatabaseEntry PadComplexDouble = { { "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "TITAN X (Pascal)", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index 212723c7..ce40914c 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -96,6 +96,7 @@ const Database::DatabaseEntry PadtransposeSingle = { { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { 
"GeForce GTX TITAN Black", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, @@ -173,6 +174,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = { { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, @@ -237,6 +239,7 @@ const Database::DatabaseEntry PadtransposeDouble = { { "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, @@ -301,6 +304,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = { { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN Black", 
{ {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index f33f2a04..b80565b9 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -96,6 +96,7 @@ const Database::DatabaseEntry TransposeSingle = { { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "GeForce GTX TITAN Black", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, @@ -167,6 +168,7 @@ const Database::DatabaseEntry TransposeComplexSingle = { { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} 
} }, { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, @@ -231,6 +233,7 @@ const Database::DatabaseEntry TransposeDouble = { { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, @@ -289,6 +292,7 @@ const Database::DatabaseEntry TransposeComplexDouble = { { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index e4e3c621..5fefb5c3 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -96,6 +96,7 @@ const Database::DatabaseEntry XaxpySingle = { { "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX TITAN Black", { {"VW",4}, {"WGS",128}, {"WPT",4} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "TITAN X (Pascal)", { {"VW",1}, 
{"WGS",128}, {"WPT",1} } }, { "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, { "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, @@ -173,6 +174,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = { { "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "TITAN X (Pascal)", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, @@ -237,6 +239,7 @@ const Database::DatabaseEntry XaxpyDouble = { { "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX TITAN Black", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "TITAN X (Pascal)", { {"VW",2}, {"WGS",256}, {"WPT",4} } }, { "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "default", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, @@ -301,6 +304,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = { { "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, { "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",4} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + { "TITAN X (Pascal)", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index 30d98e5d..67360b76 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -79,8 +79,9 @@ const Database::DatabaseEntry XdotSingle = { { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX TITAN 
Black", { {"WGS1",512}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "TITAN X (Pascal)", { {"WGS1",256}, {"WGS2",512} } }, { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",256}, {"WGS2",64} } }, + { "default", { {"WGS1",256}, {"WGS2",512} } }, } }, { // Default @@ -138,6 +139,7 @@ const Database::DatabaseEntry XdotComplexSingle = { { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } }, { "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "TITAN X (Pascal)", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, { "default", { {"WGS1",512}, {"WGS2",64} } }, } @@ -185,6 +187,7 @@ const Database::DatabaseEntry XdotDouble = { { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "TITAN X (Pascal)", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, { "default", { {"WGS1",128}, {"WGS2",128} } }, } @@ -232,6 +235,7 @@ const Database::DatabaseEntry XdotComplexDouble = { { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } }, + { "TITAN X (Pascal)", { {"WGS1",128}, {"WGS2",512} } }, { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } }, { "default", { {"WGS1",128}, {"WGS2",64} } }, } diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index d9414f8b..5f62672b 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -95,9 +95,10 @@ const Database::DatabaseEntry XgemmSingle = { { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN Black", { 
{"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, + { "TITAN X (Pascal)", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, } }, { // Default @@ -172,9 +173,10 @@ const Database::DatabaseEntry XgemmComplexSingle = { { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, 
{"VWN",4} } }, + { "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Default @@ -236,6 +238,7 @@ const Database::DatabaseEntry XgemmDouble = { { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, 
{"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, @@ -299,9 +302,10 @@ const Database::DatabaseEntry XgemmComplexDouble = { { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, 
{"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, } }, { // Default diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp index c0cd2c04..bec0164f 100644 --- a/src/database/kernels/xgemm_direct.hpp +++ b/src/database/kernels/xgemm_direct.hpp @@ -62,6 +62,7 @@ const Database::DatabaseEntry XgemmDirectSingle = { { "GeForce GTX 1080", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, + { "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, } }, @@ -104,12 +105,13 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = { { "GeForce GTX 1080", { {"KWID",8}, {"MDIMAD",8}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, { "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } }, { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "TITAN X (Pascal)", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, 
{"VWND",2}, {"WGD",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, } }, } @@ -138,12 +140,13 @@ const Database::DatabaseEntry XgemmDirectDouble = { { "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, { "GeForce GTX TITAN Black", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, - { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "TITAN X (Pascal)", { {"KWID",8}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } }, } }, } @@ -172,6 +175,7 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = { { "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "GeForce GTX 
750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } }, + { "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 52b17d94..3bb31dc2 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -89,9 +89,10 @@ const Database::DatabaseEntry XgemvSingle = { { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX TITAN Black", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } }, + { "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } }, { "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } }, { "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } }, - { "default", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Default @@ -157,6 +158,7 @@ const Database::DatabaseEntry XgemvComplexSingle = { { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } }, + { "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } }, { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, @@ -212,6 +214,7 @@ const Database::DatabaseEntry XgemvDouble = { { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } }, + { "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } }, { "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } }, { "Tesla 
K40m", { {"WGS1",256}, {"WPT1",1} } }, { "default", { {"WGS1",128}, {"WPT1",1} } }, diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index 2dd400bc..b9a2eba2 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -89,6 +89,7 @@ const Database::DatabaseEntry XgemvFastSingle = { { "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "TITAN X (Pascal)", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, @@ -207,6 +208,7 @@ const Database::DatabaseEntry XgemvFastDouble = { { "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "TITAN X (Pascal)", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, { "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index 36a435b5..ee3cebdc 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -67,6 +67,7 @@ const Database::DatabaseEntry XgemvFastRotSingle = { { "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, { "GeForce GTX TITAN Black", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, + { "TITAN X (Pascal)", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, @@ -141,6 +142,7 @@ const Database::DatabaseEntry 
XgemvFastRotDouble = { { "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, { "GeForce GTX TITAN Black", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, + { "TITAN X (Pascal)", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } }, { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, } }, diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index f99b7632..ade9dcbf 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -86,7 +86,8 @@ const Database::DatabaseEntry XgerSingle = { { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, + { "TITAN X (Pascal)", { {"WGS1",128}, {"WGS2",4}, {"WPT",1} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, } }, { // Default @@ -151,7 +152,8 @@ const Database::DatabaseEntry XgerComplexSingle = { { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, - { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, + { "TITAN X (Pascal)", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } }, } }, { // Default @@ -204,6 +206,7 @@ const Database::DatabaseEntry XgerDouble = { { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "TITAN X (Pascal)", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } }, { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, } }, @@ -257,6 +260,7 @@ const Database::DatabaseEntry XgerComplexDouble = { { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, 
{"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, + { "TITAN X (Pascal)", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, { "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, } }, -- cgit v1.2.3 From 36b942a6982578af33ca26a5306ebd7012f2329b Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 11 Feb 2017 14:05:38 +0100 Subject: Added an option to remove items from the caches, optionally by a subset of 2 specific key-values only --- src/cache.cpp | 32 ++++++++++++++++++++++++++++++++ src/cache.hpp | 5 ++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/cache.cpp b/src/cache.cpp index c5cc6a4d..c7b4792e 100644 --- a/src/cache.cpp +++ b/src/cache.cpp @@ -64,6 +64,37 @@ void Cache::Store(Key &&key, Value &&value) { #endif } +template +void Cache::Remove(const Key &key) { + std::lock_guard lock(cache_mutex_); +#if __cplusplus >= 201402L + cache_.erase(key); +#else + auto it = cache_.begin(); + while (it != cache_.end()) { + if ((*it).first == key) { + it = cache_.erase(it); + } + else ++it; + } +#endif +} + +template +template +void Cache::RemoveBySubset(const Key key) { + std::lock_guard lock(cache_mutex_); + auto it = cache_.begin(); + while (it != cache_.end()) { + const auto current_key = (*it).first; + if ((std::get(key) == std::get(current_key)) && + (std::get(key) == std::get(current_key))) { + it = cache_.erase(it); + } + else ++it; + } +} + template void Cache::Invalidate() { std::lock_guard lock(cache_mutex_); @@ -88,6 +119,7 @@ template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const; template class Cache; template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const; +template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey); // by precision and routine name // ================================================================================================= diff --git a/src/cache.hpp b/src/cache.hpp 
index c3675f07..f3685b03 100644 --- a/src/cache.hpp +++ b/src/cache.hpp @@ -42,6 +42,10 @@ public: void Store(Key &&key, Value &&value); void Invalidate(); + // Removes all entries with a given key + void Remove(const Key &key); + template void RemoveBySubset(const Key key); // currently only supports 2 indices + static Cache &Instance(); private: @@ -72,7 +76,6 @@ typedef Cache BinaryCache; extern template class Cache; extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const; - // ================================================================================================= // The key struct for the cache of compiled OpenCL programs (context-dependent) -- cgit v1.2.3 From faa842b927ede6df1763607e3732151162875d73 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 12 Feb 2017 11:58:20 +0100 Subject: Made RemoveBySubset from the cache work with references to keys --- src/cache.cpp | 4 ++-- src/cache.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cache.cpp b/src/cache.cpp index c7b4792e..4b74b0a1 100644 --- a/src/cache.cpp +++ b/src/cache.cpp @@ -82,7 +82,7 @@ void Cache::Remove(const Key &key) { template template -void Cache::RemoveBySubset(const Key key) { +void Cache::RemoveBySubset(const Key &key) { std::lock_guard lock(cache_mutex_); auto it = cache_.begin(); while (it != cache_.end()) { @@ -119,7 +119,7 @@ template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const; template class Cache; template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const; -template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey); // by precision and routine name +template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name // ================================================================================================= diff --git a/src/cache.hpp b/src/cache.hpp index f3685b03..f7ca3dc8 100644 --- a/src/cache.hpp +++ b/src/cache.hpp @@ -44,7 +44,7 
@@ public: // Removes all entries with a given key void Remove(const Key &key); - template void RemoveBySubset(const Key key); // currently only supports 2 indices + template void RemoveBySubset(const Key &key); // currently supports 2 indices static Cache &Instance(); -- cgit v1.2.3 From 345a5feb9a18641ceffd7ce5e0cb9387686cf32c Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 12 Feb 2017 12:02:39 +0100 Subject: Split the database into several smaller cached per-kernel databases (in preparation of per-kernel database overrides) --- src/cache.hpp | 6 +++--- src/database/database.cpp | 23 ++++++++++------------- src/database/database.hpp | 30 ++++++++++++++++++++++++++++-- src/routine.cpp | 36 +++++++++++++++++++++--------------- src/routine.hpp | 8 ++++---- src/routines/common.hpp | 2 +- 6 files changed, 67 insertions(+), 38 deletions(-) diff --git a/src/cache.hpp b/src/cache.hpp index f7ca3dc8..694de839 100644 --- a/src/cache.hpp +++ b/src/cache.hpp @@ -93,9 +93,9 @@ extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const; class Database; // The key struct for the cache of database maps. -// Order of fields: precision, device_name, routines (smaller fields first) -typedef std::tuple> DatabaseKey; -typedef std::tuple &> DatabaseKeyRef; +// Order of fields: precision, device_name, kernel_name (smaller fields first) +typedef std::tuple DatabaseKey; +typedef std::tuple DatabaseKeyRef; typedef Cache DatabaseCache; diff --git a/src/database/database.cpp b/src/database/database.cpp index c1cb9d56..8019d558 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -63,7 +63,7 @@ const std::unordered_map Database::kVendorNames{ // Constructor, computing device properties and populating the parameter-vector from the database. // This takes an optional overlay database in case of custom tuning or custom kernels. 
-Database::Database(const Device &device, const std::vector &kernels, +Database::Database(const Device &device, const std::string &kernel_name, const Precision precision, const std::vector &overlay): parameters_(std::make_shared()) { @@ -79,20 +79,17 @@ Database::Database(const Device &device, const std::vector &kernels } } - // Iterates over all kernels to include, and retrieves the parameters for each of them - for (auto &kernel: kernels) { - auto search_result = ParametersPtr{}; - - for (auto &db: { database, overlay}) { - search_result = Search(kernel, device_type, device_vendor, device_name, precision, db); - if (search_result) { - parameters_->insert(search_result->begin(), search_result->end()); - break; - } + // Searches potentially multiple databases + auto search_result = ParametersPtr{}; + for (auto &db: { overlay, database}) { + search_result = Search(kernel_name, device_type, device_vendor, device_name, precision, db); + if (search_result) { + parameters_->insert(search_result->begin(), search_result->end()); + break; } - - if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); } } + + if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); } } // ================================================================================================= diff --git a/src/database/database.hpp b/src/database/database.hpp index 87c12293..b6760ec3 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -75,11 +75,12 @@ class Database { Database() = default; // The constructor with a user-provided database overlay (potentially an empty vector) - explicit Database(const Device &device, const std::vector &routines, + explicit Database(const Device &device, const std::string &kernel_name, const Precision precision, const std::vector &overlay); // Accessor of values by key - size_t operator[](const std::string key) const { return parameters_->find(key)->second; } + size_t operator[](const std::string &key) const { 
return parameters_->find(key)->second; } + bool exists(const std::string &key) const { return (parameters_->count(key) == 1); } // Obtain a list of OpenCL pre-processor defines based on the parameters std::string GetDefines() const; @@ -95,6 +96,31 @@ class Database { std::shared_ptr parameters_; }; +// ================================================================================================= + +// Multiple databases together in a map +class Databases { + public: + + explicit Databases(const std::vector &kernel_names): kernel_names_(kernel_names) { } + + // Database accessor + Database& operator()(const std::string &kernel_name) { return databases_[kernel_name]; } + + // Retrieves a parameter from the database + size_t operator[](const std::string &key) const { + for (const auto &kernel_name : kernel_names_) { + const auto &kernel_db = databases_.find(kernel_name)->second; + if (kernel_db.exists(key)) { return kernel_db[key]; } + } + throw RuntimeErrorCode(StatusCode::kDatabaseError); + } + + private: + const std::vector kernel_names_; + std::unordered_map databases_; +}; + // ================================================================================================= } // namespace clblast diff --git a/src/routine.cpp b/src/routine.cpp index 4fe04a60..854c7046 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -23,34 +23,37 @@ namespace clblast { // The constructor does all heavy work, errors are returned as exceptions Routine::Routine(Queue &queue, EventPointer event, const std::string &name, - const std::vector &routines, const Precision precision, + const std::vector &kernel_names, const Precision precision, const std::vector &userDatabase, std::initializer_list source): precision_(precision), routine_name_(name), + kernel_names_(kernel_names), queue_(queue), event_(event), context_(queue_.GetContext()), device_(queue_.GetDevice()), - device_name_(device_.Name()) { + device_name_(device_.Name()), + db_(kernel_names) { - InitDatabase(routines, 
userDatabase); + InitDatabase(userDatabase); InitProgram(source); } -void Routine::InitDatabase(const std::vector &routines, - const std::vector &userDatabase) { +void Routine::InitDatabase(const std::vector &userDatabase) { + for (const auto &kernel_name : kernel_names_) { - // Queries the cache to see whether or not the kernel parameter database is already there - bool has_db; - db_ = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision_, device_name_, routines }, - &has_db); - if (has_db) { return; } + // Queries the cache to see whether or not the kernel parameter database is already there + bool has_db; + db_(kernel_name) = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision_, device_name_, kernel_name }, + &has_db); + if (has_db) { continue; } - // Builds the parameter database for this device and routine set and stores it in the cache - db_ = Database(device_, routines, precision_, userDatabase); - DatabaseCache::Instance().Store(DatabaseKey{ precision_, device_name_, routines }, - Database{ db_ }); + // Builds the parameter database for this device and routine set and stores it in the cache + db_(kernel_name) = Database(device_, kernel_name, precision_, userDatabase); + DatabaseCache::Instance().Store(DatabaseKey{ precision_, device_name_, kernel_name }, + Database{ db_(kernel_name) }); + } } void Routine::InitProgram(std::initializer_list source) { @@ -96,7 +99,10 @@ void Routine::InitProgram(std::initializer_list source) { } // Collects the parameters for this device in the form of defines, and adds the precision - auto source_string = db_.GetDefines(); + auto source_string = std::string{""}; + for (const auto &kernel_name : kernel_names_) { + source_string += db_(kernel_name).GetDefines(); + } source_string += "#define PRECISION "+ToString(static_cast(precision_))+"\n"; // Adds the name of the routine as a define diff --git a/src/routine.hpp b/src/routine.hpp index f366e4d9..ba8b9f60 100644 --- a/src/routine.hpp +++ b/src/routine.hpp @@ -48,16 
+48,16 @@ class Routine { void InitProgram(std::initializer_list source); // Initializes db_, fetching cached database or building one - void InitDatabase(const std::vector &routines, - const std::vector &userDatabase); + void InitDatabase(const std::vector &userDatabase); protected: // Non-static variable for the precision const Precision precision_; - // The routine's name + // The routine's name and the corresponding kernels const std::string routine_name_; + const std::vector kernel_names_; // The OpenCL objects, accessible only from derived classes Queue queue_; @@ -72,7 +72,7 @@ class Routine { Program program_; // Connection to the database for all the device-specific parameters - Database db_; + Databases db_; }; // ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 7c211c0d..d268d58b 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -37,7 +37,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device, // to write to symmetric and triangular matrices through optional arguments. 
template void PadCopyTransposeMatrix(Queue &queue, const Device &device, - const Database &db, + const Databases &db, EventPointer event, const std::vector &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, -- cgit v1.2.3 From 00eb55a2d449e98816e77576f166e89682efbfd6 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 13 Feb 2017 20:48:32 +0100 Subject: Fixed a small bug in GEMV: unused kernel in parameter list --- src/routines/level2/xgemv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 9e9c2db4..aae66798 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -22,7 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template Xgemv::Xgemv(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue(), {}, { + Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue(), {}, { #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" }) { -- cgit v1.2.3 From cdb3bb7166bc75842ff95e14915bff881297fc62 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 13 Feb 2017 20:53:06 +0100 Subject: Added first version of the OverrideParameters function --- include/clblast.h | 8 ++++++++ scripts/generator/generator.py | 4 ++-- src/clblast.cpp | 32 ++++++++++++++++++++++++++++++++ src/routine.cpp | 25 +++++++++++++++++++++++++ src/routine.hpp | 10 ++++++++++ 5 files changed, 77 insertions(+), 2 deletions(-) diff --git a/include/clblast.h b/include/clblast.h index 7b2021d8..e7b53d65 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -17,6 +17,8 @@ #define CLBLAST_CLBLAST_H_ #include // For size_t +#include // For OverrideParameters function +#include // For OverrideParameters function // Includes the normal OpenCL C 
header #if defined(__APPLE__) || defined(__MACOSX) @@ -617,6 +619,12 @@ StatusCode PUBLIC_API FillCache(const cl_device_id device); // ================================================================================================= +StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map &parameters); + +// ================================================================================================= + } // namespace clblast // CLBLAST_CLBLAST_H_ diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 6591cbf7..aaf1b121 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,8 +41,8 @@ FILES = [ "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32] -FOOTER_LINES = [17, 95, 19, 18, 6, 6, 9, 2] +HEADER_LINES = [119, 73, 118, 22, 29, 41, 65, 32] +FOOTER_LINES = [23, 128, 19, 18, 6, 6, 9, 2] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." 
diff --git a/src/clblast.cpp b/src/clblast.cpp index 35f3f552..885b849e 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -2253,5 +2253,37 @@ StatusCode FillCache(const cl_device_id device) { return StatusCode::kSuccess; } +// ================================================================================================= + +StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map &parameters) { + try { + + // Retrieves the device name + const auto device_cpp = Device(device); + const auto device_name = device_cpp.Name(); + + // Clears the existing program & binary cache for routines with the target kernel + const auto routine_names = Routine::routines_by_kernel.at(kernel_name); + for (const auto &routine_name : routine_names) { + ProgramCache::Instance().RemoveBySubset<1, 2>(ProgramKey{nullptr, precision, routine_name}); + BinaryCache::Instance().Remove(BinaryKey{precision, routine_name, device_name}); + } + + // Creates a small custom database based on the provided parameters + const auto database_device = Database::DatabaseDevice{"default", parameters}; + const auto database_vendor = Database::DatabaseVendor{database::kDeviceTypeAll, "default", {database_device}}; + const auto database_entry = Database::DatabaseEntry{kernel_name, precision, {database_vendor}}; + const auto database = Database(device_cpp, kernel_name, precision, {&database_entry}); + + // Removes the old database entry and stores the new one in the cache + DatabaseCache::Instance().Remove(DatabaseKey{ precision, device_name, kernel_name }); + DatabaseCache::Instance().Store(DatabaseKey{ precision, device_name, kernel_name }, Database(database)); + + } catch (...) 
{ return DispatchException(); } + return StatusCode::kSuccess; +} + // ================================================================================================= } // namespace clblast diff --git a/src/routine.cpp b/src/routine.cpp index 854c7046..3cd045c8 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -21,6 +21,31 @@ namespace clblast { // ================================================================================================= +// For each kernel this map contains a list of routines it is used in +const std::vector Routine::routines_axpy = {"AXPY", "COPY", "SCAL", "SWAP"}; +const std::vector Routine::routines_dot = {"AMAX", "ASUM", "DOT", "DOTC", "DOTU", "MAX", "MIN", "NRM2", "SUM"}; +const std::vector Routine::routines_ger = {"GER", "GERC", "GERU", "HER", "HER2", "HPR", "HPR2", "SPR", "SPR2", "SYR", "SYR2"}; +const std::vector Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TMBV", "TPMV", "TRMV"}; +const std::vector Routine::routines_gemm = {"GEMM", "HEMM", "SYMM", "TRMM"}; +const std::vector Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM"}; +const std::unordered_map> Routine::routines_by_kernel = { + {"Xaxpy", routines_axpy}, + {"Xdot", routines_dot}, + {"Xgemv", routines_gemv}, + {"XgemvFast", routines_gemv}, + {"XgemvFastRot", routines_gemv}, + {"Xgemv", {}}, + {"Xger", routines_ger}, + {"Copy", routines_gemm_syrk}, + {"Pad", routines_gemm_syrk}, + {"Transpose", routines_gemm_syrk}, + {"Padtranspose", routines_gemm_syrk}, + {"Xgemm", routines_gemm_syrk}, + {"XgemmDirect", routines_gemm}, + {"KernelSelection", routines_gemm}, +}; +// ================================================================================================= + // The constructor does all heavy work, errors are returned as exceptions Routine::Routine(Queue &queue, EventPointer event, const std::string &name, const std::vector &kernel_names, const Precision precision, diff --git 
a/src/routine.hpp b/src/routine.hpp index ba8b9f60..622a1c0d 100644 --- a/src/routine.hpp +++ b/src/routine.hpp @@ -18,6 +18,7 @@ #include #include +#include #include "utilities/utilities.hpp" #include "cache.hpp" @@ -42,6 +43,15 @@ class Routine { const std::vector &userDatabase, std::initializer_list source); + // List of kernel-routine look-ups + static const std::vector routines_axpy; + static const std::vector routines_dot; + static const std::vector routines_ger; + static const std::vector routines_gemv; + static const std::vector routines_gemm; + static const std::vector routines_gemm_syrk; + static const std::unordered_map> routines_by_kernel; + private: // Initializes program_, fetching cached program or building one -- cgit v1.2.3 From bdc57221bd0279bcdb4f024df54f08a2fe1bb8d4 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 14 Feb 2017 21:09:00 +0100 Subject: Added simple tests for the OverrideParameters function --- CMakeLists.txt | 12 +++ test/correctness/misc/override_parameters.cpp | 135 ++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 test/correctness/misc/override_parameters.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ab9c4f0..6edc70da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -414,6 +414,18 @@ if(TESTS) add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE}) endforeach() + # Miscellaneous tests + set(MISC_TESTS override_parameters) + foreach(MISC_TEST ${MISC_TESTS}) + add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON} + test/correctness/misc/${MISC_TEST}.cpp) + target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_include_directories(clblast_test_${MISC_TEST} PUBLIC + $ + ${clblast_SOURCE_DIR} ${REF_INCLUDES}) + add_test(clblast_test_${MISC_TEST} clblast_test_${MISC_TEST}) + endforeach() + # Adds 'alltests' target: runs all tests set(ALLTESTS ) set(ALLTESTSDEPENDS ) diff --git a/test/correctness/misc/override_parameters.cpp 
b/test/correctness/misc/override_parameters.cpp new file mode 100644 index 00000000..54d18cfa --- /dev/null +++ b/test/correctness/misc/override_parameters.cpp @@ -0,0 +1,135 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the tests for the OverrideParameters function +// +// ================================================================================================= + +#include "utilities/utilities.hpp" +#include "test/routines/level3/xgemm.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +template +size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::string &routine_name) { + auto arguments = RetrieveCommandLineArguments(argc, argv); + auto errors = size_t{0}; + auto passed = size_t{0}; + auto example_routine = TestXgemm(); + constexpr auto kSeed = 42; // fixed seed for reproducibility + + // Determines the test settings + const auto kernel_name = std::string{"Xgemm"}; + const auto precision = PrecisionValue(); + const auto valid_settings = std::vector>{ + { {"KWG",16}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} }, + { {"KWG",32}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",32}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} }, + }; + const auto invalid_settings = std::vector>{ + { {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0} }, + }; + + // Retrieves the arguments + auto 
help = std::string{"Options given/available:\n"}; + const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); + const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); + auto args = Arguments{}; + args.m = GetArgument(arguments, help, kArgM, size_t{512}); + args.n = GetArgument(arguments, help, kArgN, size_t{512}); + args.k = GetArgument(arguments, help, kArgK, size_t{512}); + args.a_ld = GetArgument(arguments, help, kArgALeadDim, args.k); + args.b_ld = GetArgument(arguments, help, kArgBLeadDim, args.n); + args.c_ld = GetArgument(arguments, help, kArgCLeadDim, args.n); + args.a_offset = GetArgument(arguments, help, kArgAOffset, size_t{0}); + args.b_offset = GetArgument(arguments, help, kArgBOffset, size_t{0}); + args.c_offset = GetArgument(arguments, help, kArgCOffset, size_t{0}); + args.layout = GetArgument(arguments, help, kArgLayout, Layout::kRowMajor); + args.a_transpose = GetArgument(arguments, help, kArgATransp, Transpose::kNo); + args.b_transpose = GetArgument(arguments, help, kArgBTransp, Transpose::kNo); + args.alpha = GetArgument(arguments, help, kArgAlpha, GetScalar()); + args.beta = GetArgument(arguments, help, kArgBeta, GetScalar()); + + // Prints the help message (command-line arguments) + if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); } + + // Initializes OpenCL + const auto platform = Platform(platform_id); + const auto device = Device(platform, device_id); + const auto context = Context(device); + auto queue = Queue(context, device); + + // Populate host matrices with some example data + auto host_a = std::vector(args.m * args.k); + auto host_b = std::vector(args.n * args.k); + auto host_c = std::vector(args.m * args.n); + PopulateVector(host_a, kSeed); + PopulateVector(host_b, kSeed); + PopulateVector(host_c, kSeed); + + // Copy the matrices to the device + auto device_a = Buffer(context, 
host_a.size()); + auto device_b = Buffer(context, host_b.size()); + auto device_c = Buffer(context, host_c.size()); + device_a.Write(queue, host_a.size(), host_a); + device_b.Write(queue, host_b.size(), host_b); + device_c.Write(queue, host_c.size(), host_c); + auto dummy = Buffer(context, 1); + auto buffers = Buffers{dummy, dummy, device_a, device_b, device_c, dummy, dummy}; + + // Loops over the valid combinations: run before and run afterwards + fprintf(stdout, "* Testing OverrideParameters for '%s'\n", routine_name.c_str()); + for (const auto &override_setting : valid_settings) { + const auto status_before = example_routine.RunRoutine(args, buffers, queue); + if (status_before != StatusCode::kSuccess) { errors++; continue; } + + // Overrides the parameters + const auto status = OverrideParameters(device(), kernel_name, precision, override_setting); + if (status != StatusCode::kSuccess) { errors++; continue; } // error shouldn't occur + + const auto status_after = example_routine.RunRoutine(args, buffers, queue); + if (status_after != StatusCode::kSuccess) { errors++; continue; } + passed++; + } + + // Loops over the invalid combinations: run before and run afterwards + for (const auto &override_setting : invalid_settings) { + const auto status_before = example_routine.RunRoutine(args, buffers, queue); + if (status_before != StatusCode::kSuccess) { errors++; continue; } + + // Overrides the parameters + const auto status = OverrideParameters(device(), kernel_name, precision, override_setting); + if (status == StatusCode::kSuccess) { errors++; continue; } // error should occur + + const auto status_after = example_routine.RunRoutine(args, buffers, queue); + if (status_after != StatusCode::kSuccess) { errors++; continue; } + passed++; + } + + // Prints and returns the statistics + fprintf(stdout, " %zu test(s) passed\n", passed); + fprintf(stdout, " %zu test(s) failed\n", errors); + fprintf(stdout, "\n"); + return errors; +} + +// 
================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunOverrideTests(argc, argv, false, "SGEMM"); + errors += clblast::RunOverrideTests(argc, argv, true, "DGEMM"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= -- cgit v1.2.3 From 08bfb75a9d72b6b373d8f18e8be83fe4ea31015b Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 16 Feb 2017 21:12:50 +0100 Subject: Added input-sanity checks for the OverrideParameters function --- include/clblast.h | 2 ++ scripts/generator/generator.py | 4 ++-- src/clblast.cpp | 10 ++++++++++ src/database/database.cpp | 9 +++++++++ src/database/database.hpp | 3 +++ 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/clblast.h b/include/clblast.h index e7b53d65..1350cb10 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -97,6 +97,8 @@ enum class StatusCode { kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small // Custom additional status codes for CLBlast + kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel + kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index aaf1b121..f43464b9 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,8 +41,8 @@ FILES = [ "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [119, 73, 118, 
22, 29, 41, 65, 32] -FOOTER_LINES = [23, 128, 19, 18, 6, 6, 9, 2] +HEADER_LINES = [121, 73, 118, 22, 29, 41, 65, 32] +FOOTER_LINES = [23, 138, 19, 18, 6, 6, 9, 2] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." diff --git a/src/clblast.cpp b/src/clblast.cpp index 885b849e..871a4804 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -2264,6 +2264,16 @@ StatusCode OverrideParameters(const cl_device_id device, const std::string &kern const auto device_cpp = Device(device); const auto device_name = device_cpp.Name(); + // Retrieves the current database values to verify whether the new ones are complete + auto in_cache = false; + const auto current_database = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision, device_name, kernel_name }, &in_cache); + if (!in_cache) { return StatusCode::kInvalidOverrideKernel; } + for (const auto &current_param : current_database.GetParameterNames()) { + if (parameters.find(current_param) == parameters.end()) { + return StatusCode::kMissingOverrideParameter; + } + } + // Clears the existing program & binary cache for routines with the target kernel const auto routine_names = Routine::routines_by_kernel.at(kernel_name); for (const auto &routine_name : routine_names) { diff --git a/src/database/database.cpp b/src/database/database.cpp index 8019d558..02d0b139 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -103,6 +103,15 @@ std::string Database::GetDefines() const { return defines; } +// Retrieves the names of all the parameters +std::vector Database::GetParameterNames() const { + auto parameter_names = std::vector(); + for (auto &parameter: *parameters_) { + parameter_names.push_back(parameter.first); + } + return parameter_names; +} + // ================================================================================================= // Searches a particular database for the right kernel and precision diff --git a/src/database/database.hpp b/src/database/database.hpp 
index b6760ec3..b34e0d8a 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -85,6 +85,9 @@ class Database { // Obtain a list of OpenCL pre-processor defines based on the parameters std::string GetDefines() const; + // Retrieves the names of all the parameters + std::vector GetParameterNames() const; + private: // Search method for a specified database, returning pointer (possibly a nullptr) ParametersPtr Search(const std::string &this_kernel, const std::string &this_type, -- cgit v1.2.3 From cda449a5c39041b2a0e6893ee254e145447b78ca Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 16 Feb 2017 21:14:48 +0100 Subject: Added a C interface to the OverrideParameters function; added some in-line comments to the API --- include/clblast.h | 3 +++ include/clblast_c.h | 16 ++++++++++++++++ scripts/generator/generator.py | 4 ++-- src/clblast.cpp | 1 + src/clblast_c.cpp | 21 +++++++++++++++++++++ 5 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/clblast.h b/include/clblast.h index 1350cb10..d9637d15 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -621,6 +621,9 @@ StatusCode PUBLIC_API FillCache(const cl_device_id device); // ================================================================================================= +// Overrides tuning parameters for a specific device-precision-routine combination. The next time +// (and all further times) the target routine is called it will re-compile and use the new +// parameters. 
StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name, const Precision precision, const std::unordered_map &parameters); diff --git a/include/clblast_c.h b/include/clblast_c.h index 72f50d83..cd657f3b 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -96,6 +96,8 @@ typedef enum CLBlastStatusCode_ { CLBlastInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small // Custom additional status codes for CLBlast + CLBlastInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel + CLBlastMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel CLBlastInvalidLocalMemUsage = -2046, // Not enough local memory available on this device CLBlastNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device CLBlastNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device @@ -117,6 +119,11 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, CLBlastDiagonalUnit = 132 } CLBlastDiagonal; typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; +// Precision enum (values in bits) +typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32, + CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232, + CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision; + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= @@ -1338,6 +1345,15 @@ CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device); // ================================================================================================= +// Overrides tuning parameters for a specific device-precision-routine combination.
The next time +// (and all further times) the target routine is called it will re-compile and use the new +// parameters. +CLBlastStatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const char* kernel_name, + const CLBlastPrecision precision, const size_t num_parameters, + const char** parameters_names, const size_t* parameters_values); + +// ================================================================================================= + #ifdef __cplusplus } // extern "C" #endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index f43464b9..9bc48502 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,8 +41,8 @@ FILES = [ "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [121, 73, 118, 22, 29, 41, 65, 32] -FOOTER_LINES = [23, 138, 19, 18, 6, 6, 9, 2] +HEADER_LINES = [121, 73, 125, 23, 29, 41, 65, 32] +FOOTER_LINES = [26, 139, 28, 38, 6, 6, 9, 2] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." 
diff --git a/src/clblast.cpp b/src/clblast.cpp index 871a4804..a8e4d084 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -2255,6 +2255,7 @@ StatusCode FillCache(const cl_device_id device) { // ================================================================================================= +// Overrides the tuning parameters for this device-precision-kernel combination StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name, const Precision precision, const std::unordered_map &parameters) { diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index e4f2b3ed..79b6a640 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -12,6 +12,7 @@ // ================================================================================================= #include +#include #include "utilities/utilities.hpp" #include "clblast_c.h" @@ -3484,3 +3485,23 @@ CLBlastStatusCode CLBlastFillCache(const cl_device_id device) { } // ================================================================================================= + +// Overrides the tuning parameters for this device-precision-kernel combination +CLBlastStatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const char* kernel_name, + const CLBlastPrecision precision, const size_t num_parameters, + const char** parameters_names, const size_t* parameters_values) { + try { + const auto kernel_name_cpp = std::string(kernel_name); + const auto precision_cpp = static_cast(precision); + auto parameters = std::unordered_map(); + for (auto i = size_t{0}; i < num_parameters; ++i) { + const auto parameter_name = std::string(parameters_names[i]); + const auto parameter_value = parameters_values[i]; + parameters[parameter_name] = parameter_value; + } + const auto status = clblast::OverrideParameters(device, kernel_name_cpp, precision_cpp, parameters); + return static_cast(status); + } catch (...)
{ return static_cast(clblast::DispatchExceptionForC()); } +} + +// ================================================================================================= -- cgit v1.2.3 From 3d10690c830a749d1e9c2c60bcd68a61590ef994 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 18 Feb 2017 10:32:32 +0100 Subject: Added missing documentation for the fill and clear cache functions --- doc/clblast.md | 35 +++++++++++++++++++++++++++++++++++ scripts/generator/generator.py | 16 +++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/doc/clblast.md b/doc/clblast.md index 37b99f3d..11560dce 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -2781,3 +2781,38 @@ Requirements for OMATCOPY: +ClearCache: Resets the cache of compiled binaries (auxiliary function) +------------- + +CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache can be cleared to free up system memory or it can be useful in case of debugging. + +C++ API: +``` +StatusCode ClearCache() +``` + +C API: +``` +CLBlastStatusCode CLBlastClearCache() +``` + + + +FillCache: Populates the cache of compiled binaries for a specific device (auxiliary function) +------------- + +CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache is automatically populated whenever a new binary is created. Thus, the first run of a specific kernel could take extra time. For debugging or performance evaluation purposes, it might be useful to populate the cache upfront. This function populates the cache for all kernels in CLBlast for all precisions, but for a specific device only. + +C++ API: +``` +StatusCode FillCache(const cl_device_id device) +``` + +C API: +``` +CLBlastStatusCode CLBlastFillCache(const cl_device_id device) +``` + +Arguments to FillCache: + +* `const cl_device_id device`: The OpenCL device to fill the cache for. 
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 9bc48502..38f18e8a 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -43,6 +43,8 @@ FILES = [ ] HEADER_LINES = [121, 73, 125, 23, 29, 41, 65, 32] FOOTER_LINES = [26, 139, 28, 38, 6, 6, 9, 2] +HEADER_LINES_DOC = 0 +FOOTER_LINES_DOC = 35 # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." @@ -233,11 +235,20 @@ def main(argv): f.write(cpp.performance_test(routine, level_string)) f.write(cpp.FOOTER) - # Outputs the API documentation + # API documentation filename = cl_args.clblast_root + "/doc/clblast.md" + + # Stores the header and the footer of the original documentation file + with open(filename) as f: + original = f.readlines() + file_header = original[:HEADER_LINES_DOC] + file_footer = original[-FOOTER_LINES_DOC:] + + # Outputs the API documentation with open(filename, "w") as f: # Outputs the header + f.write("".join(file_header)) doc_header = doc.header() f.write(doc_header) @@ -248,5 +259,8 @@ def main(argv): doc_routine = doc.generate(routine) f.write(doc_routine) + # Outputs the footer + f.write("".join(file_footer)) + if __name__ == '__main__': main(sys.argv[1:]) -- cgit v1.2.3 From d6538dfc25a14251e49da0f95007e03b2b3fe3be Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 18 Feb 2017 10:59:38 +0100 Subject: Fixed the naming of the C API of OverrideParameters and fixed the description --- include/clblast.h | 5 ++--- include/clblast_c.h | 11 +++++------ src/clblast_c.cpp | 6 +++--- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/clblast.h b/include/clblast.h index d9637d15..9fdd5df1 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -621,9 +621,8 @@ StatusCode PUBLIC_API FillCache(const cl_device_id device); // ================================================================================================= -// Overrides tuning parameters for a specific 
device-precision-routine combination. The next time -// (and all further times) the target routine is called it will re-compile and use the new -// parameters. +// Overrides tuning parameters for a specific device-precision-kernel combination. The next time +// the target routine is called it will re-compile and use the new parameters from then on. StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name, const Precision precision, const std::unordered_map &parameters); diff --git a/include/clblast_c.h b/include/clblast_c.h index cd657f3b..d4b0b004 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -1345,12 +1345,11 @@ CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device); // ================================================================================================= -// Overrides tuning parameters for a specific device-precision-routine combination. The next time -// (and all further times) the target routine is called it will re-compile and use the new -// parameters. -CLBlastStatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const char* kernel_name, - const CLBlastPrecision precision, const size_t num_parameters, - const char** parameters_names, const size_t* parameters_values); +// Overrides tuning parameters for a specific device-precision-kernel combination. The next time +// the target routine is called it will re-compile and use the new parameters from then on.
+CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, + const CLBlastPrecision precision, const size_t num_parameters, + const char** parameters_names, const size_t* parameters_values); // ================================================================================================= diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index 79b6a640..de431fa4 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -3487,9 +3487,9 @@ CLBlastStatusCode CLBlastFillCache(const cl_device_id device) { // ================================================================================================= // Overrides the tuning parameters for this device-precision-kernel combination -CLBlastStatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const char* kernel_name, - const CLBlastPrecision precision, const size_t num_parameters, - const char** parameters_names, const size_t* parameters_values) { +CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, + const CLBlastPrecision precision, const size_t num_parameters, + const char** parameters_names, const size_t* parameters_values) { try { const auto kernel_name_cpp = std::string(kernel_name); const auto precision_cpp = static_cast(precision); -- cgit v1.2.3 From fef11a208c46d51eefcde31e19654d8f26fad470 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 18 Feb 2017 11:02:57 +0100 Subject: Added documentation for the OverrideParameters function --- CHANGELOG | 1 + README.md | 4 +++- doc/clblast.md | 28 ++++++++++++++++++++++++++++ scripts/generator/generator.py | 4 ++-- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 20f17807..80551611 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,6 +5,7 @@ Development version (next release) - Fixed a bug when using offsets in the direct version of the GEMM kernels - Fixed a missing cl_khr_fp64 when running double-precision 
on Intel CPUs - Tests now also exit with an error code when OpenCL errors or compilation errors occur +- Added the OverrideParameters function to the API to be able to supply custom tuning parmeters - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) diff --git a/README.md b/README.md index 67c7703c..418634d5 100644 --- a/README.md +++ b/README.md @@ -156,7 +156,7 @@ Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https:/ Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. -The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl). +The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python (2.7 or 3.x) script in `scripts/database/database.py`. 
If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl). In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder): @@ -168,6 +168,8 @@ In summary, tuning the entire library for your device can be done as follows (st python ../scripts/database/database.py . .. make +Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). To do so, you can call the `OverrideParameters` function which will set new parameters for a specific kernel. At the first next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on. Until `OverrideParameters` is called again of course. See the [API documentation](doc/clblast.md#overrideparameters-override-tuning-parameters-auxiliar-function) for more details. + Compiling the correctness tests (optional) ------------- diff --git a/doc/clblast.md b/doc/clblast.md index 11560dce..79ff5eb2 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -2816,3 +2816,31 @@ CLBlastStatusCode CLBlastFillCache(const cl_device_id device) Arguments to FillCache: * `const cl_device_id device`: The OpenCL device to fill the cache for. + + + +OverrideParameters: Override tuning parameters (auxiliary function) +------------- + +This function overrides tuning parameters for a specific device-precision-kernel combination. The next time the target routine is called it will be re-compiled and use the new parameters. All further times (until `OverrideParameters` is called again) it will load the kernel from the cache and thus continue to use the new parameters. 
Note that the first time after calling `OverrideParameters` a performance drop can be observable due to the re-compilation of the kernel. + +C++ API: +``` +StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map &parameters) +``` + +C API: +``` +CLBlastStatusCode CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, + const CLBlastPrecision precision, const size_t num_parameters, + const char** parameters_names, const size_t* parameters_values) +``` + +Arguments to OverrideParameters (C++ version): + +* `const cl_device_id device`: The OpenCL device to set the new parameters for. +* `const std::string &kernel_name`: The target kernel name. This has to be one of the existing CLBlast kernels (Xaxpy, Xdot, Xgemv, XgemvFast, XgemvFastRot, Xgemv, Xger, Copy, Pad, Transpose, Padtranspose, Xgemm, or XgemmDirect). If this argument is incorrect, this function will return with the `clblast::kInvalidOverrideKernel` status-code. +* `const Precision precision`: The CLBlast precision enum to set the new parameters for. +* `const std::unordered_map &parameters`: An unordered map of strings to integers. This has to contain all the tuning parameters for a specific kernel as reported by the included tuners (e.g. `{ {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} }` for the `Copy` kernel). If this argument is incorrect, this function will return with the `clblast::kMissingOverrideParameter` status-code.
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 38f18e8a..01231a6d 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -42,9 +42,9 @@ FILES = [ "/src/clblast_netlib_c.cpp", ] HEADER_LINES = [121, 73, 125, 23, 29, 41, 65, 32] -FOOTER_LINES = [26, 139, 28, 38, 6, 6, 9, 2] +FOOTER_LINES = [25, 139, 27, 38, 6, 6, 9, 2] HEADER_LINES_DOC = 0 -FOOTER_LINES_DOC = 35 +FOOTER_LINES_DOC = 63 # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." -- cgit v1.2.3 From 2e0951c6dc995775610d500fde01ef64d650ff5e Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 18 Feb 2017 11:05:54 +0100 Subject: Fixed small typo in the documentation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 418634d5..0e11e719 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ In summary, tuning the entire library for your device can be done as follows (st python ../scripts/database/database.py . .. make -Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). To do so, you can call the `OverrideParameters` function which will set new parameters for a specific kernel. At the first next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on. Until `OverrideParameters` is called again of course. See the [API documentation](doc/clblast.md#overrideparameters-override-tuning-parameters-auxiliar-function) for more details. +Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). 
To do so, you can call the `OverrideParameters` function which will set new parameters for a specific kernel. At the first next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on. Until `OverrideParameters` is called again of course. See the [API documentation](doc/clblast.md#overrideparameters-override-tuning-parameters-auxiliary-function) for more details. Compiling the correctness tests (optional) -- cgit v1.2.3 From 7b2170818f11e0714c8b08aa1dd5b32bfef3f4b6 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 18 Feb 2017 11:22:07 +0100 Subject: Changed the override-parameters test such that it is compatible with more devices --- test/correctness/misc/override_parameters.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/correctness/misc/override_parameters.cpp b/test/correctness/misc/override_parameters.cpp index 54d18cfa..a4cecf0d 100644 --- a/test/correctness/misc/override_parameters.cpp +++ b/test/correctness/misc/override_parameters.cpp @@ -43,9 +43,9 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); auto args = Arguments{}; - args.m = GetArgument(arguments, help, kArgM, size_t{512}); - args.n = GetArgument(arguments, help, kArgN, size_t{512}); - args.k = GetArgument(arguments, help, kArgK, size_t{512}); + args.m = GetArgument(arguments, help, kArgM, size_t{256}); + args.n = GetArgument(arguments, help, kArgN, size_t{256}); + args.k = GetArgument(arguments, help, kArgK, size_t{256}); args.a_ld = GetArgument(arguments, help, kArgALeadDim, args.k); args.b_ld = GetArgument(arguments, help, kArgBLeadDim, args.n); args.c_ld = GetArgument(arguments, help, kArgCLeadDim, 
args.n); @@ -124,11 +124,15 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st // ================================================================================================= } // namespace clblast +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunOverrideTests(argc, argv, false, "SGEMM"); - errors += clblast::RunOverrideTests(argc, argv, true, "DGEMM"); + errors += clblast::RunOverrideTests(argc, argv, true, "CGEMM"); if (errors > 0) { return 1; } else { return 0; } } -- cgit v1.2.3 From 0643a29af51f9eb13e2b276d0a0e74590c699d3b Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 18 Feb 2017 13:59:10 +0100 Subject: Added tuning parameters for the AMD RX480 GPU (Ellesmere) --- README.md | 1 + src/database/kernels/copy.hpp | 22 ++++++++++++++++------ src/database/kernels/pad.hpp | 14 ++++++++++++-- src/database/kernels/padtranspose.hpp | 12 +++++++++++- src/database/kernels/transpose.hpp | 14 ++++++++++++-- src/database/kernels/xaxpy.hpp | 16 +++++++++++++--- src/database/kernels/xdot.hpp | 12 +++++++++++- src/database/kernels/xgemm.hpp | 16 +++++++++++++--- src/database/kernels/xgemm_direct.hpp | 17 +++++++++++++---- src/database/kernels/xgemv.hpp | 10 ++++++++++ src/database/kernels/xgemv_fast.hpp | 10 ++++++++++ src/database/kernels/xgemv_fast_rot.hpp | 10 ++++++++++ src/database/kernels/xger.hpp | 20 +++++++++++++++----- 13 files changed, 147 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 0e11e719..ceebc588 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,7 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC - Tesla K40m * AMD GPUs: - AMD Radeon R9 M370X Compute Engine + - Ellesmere - Hawaii - Oland - Pitcairn diff --git a/src/database/kernels/copy.hpp 
b/src/database/kernels/copy.hpp index 9bc613b9..b2aa736f 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry CopyHalf = { "Copy", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } }, @@ -26,7 +32,7 @@ const Database::DatabaseEntry CopyHalf = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, } }, } @@ -39,13 +45,14 @@ const Database::DatabaseEntry CopySingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "Ellesmere", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",8} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Tonga", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Turks", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, } }, { // ARM GPUs @@ -104,7 +111,7 @@ const Database::DatabaseEntry CopySingle = { }, 
{ // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, } }, } @@ -117,13 +124,14 @@ const Database::DatabaseEntry CopyComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Ellesmere", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Turks", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // Intel CPUs @@ -174,7 +182,7 @@ const Database::DatabaseEntry CopyComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, } @@ -187,12 +195,13 @@ const Database::DatabaseEntry CopyDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Ellesmere", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, 
{"COPY_WPT",8} } }, { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, } }, { // ARM GPUs @@ -252,6 +261,7 @@ const Database::DatabaseEntry CopyComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Ellesmere", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index f22399eb..e4005527 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry PadHalf = { "Pad", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, @@ -39,6 +45,7 @@ const Database::DatabaseEntry PadSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Ellesmere", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, 
{"PAD_WPTY",2} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, @@ -117,6 +124,7 @@ const Database::DatabaseEntry PadComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Ellesmere", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, @@ -182,7 +190,7 @@ const Database::DatabaseEntry PadComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -195,12 +203,13 @@ const Database::DatabaseEntry PadDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Ellesmere", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tonga", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, 
{"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -260,6 +269,7 @@ const Database::DatabaseEntry PadComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Ellesmere", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index ce40914c..92aa4f7b 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry PadtransposeHalf = { "Padtranspose", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, @@ -39,6 +45,7 @@ const Database::DatabaseEntry PadtransposeSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Ellesmere", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, @@ -117,6 +124,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, 
{"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, @@ -195,6 +203,7 @@ const Database::DatabaseEntry PadtransposeDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, @@ -247,7 +256,7 @@ const Database::DatabaseEntry PadtransposeDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, } @@ -260,6 +269,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index b80565b9..0ee2d83b 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry TransposeHalf = { "Transpose", Precision::kHalf, { + { // AMD GPUs + 
kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, @@ -26,7 +32,7 @@ const Database::DatabaseEntry TransposeHalf = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, } }, } @@ -39,13 +45,14 @@ const Database::DatabaseEntry TransposeSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + { "Ellesmere", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Turks", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, { // ARM GPUs @@ -117,6 +124,7 @@ const Database::DatabaseEntry TransposeComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Hawaii", { {"TRA_DIM",16}, 
{"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, @@ -189,6 +197,7 @@ const Database::DatabaseEntry TransposeDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, @@ -254,6 +263,7 @@ const Database::DatabaseEntry TransposeComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Ellesmere", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 5fefb5c3..4b747f0a 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XaxpyHalf = { "Xaxpy", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"VW",4}, {"WGS",128}, {"WPT",4} } }, + { "default", { {"VW",4}, {"WGS",128}, {"WPT",4} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -26,7 +32,7 @@ const Database::DatabaseEntry XaxpyHalf = { }, { 
// Default kDeviceTypeAll, "default", { - { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",256}, {"WPT",4} } }, } }, } @@ -39,13 +45,14 @@ const Database::DatabaseEntry XaxpySingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Ellesmere", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } }, { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, { "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, - { "default", { {"VW",2}, {"WGS",64}, {"WPT",2} } }, + { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, } }, { // ARM GPUs @@ -117,6 +124,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } }, + { "Ellesmere", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, { "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -195,6 +203,7 @@ const Database::DatabaseEntry XaxpyDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "Ellesmere", { {"VW",2}, {"WGS",64}, {"WPT",4} } }, { "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, { "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, @@ -247,7 +256,7 @@ const Database::DatabaseEntry XaxpyDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, + { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, } }, } @@ -260,6 +269,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X 
Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Ellesmere", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index 67360b76..ea154b6e 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XdotHalf = { "Xdot", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"WGS1",256}, {"WGS2",64} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, @@ -39,12 +45,13 @@ const Database::DatabaseEntry XdotSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, + { "Ellesmere", { {"WGS1",128}, {"WGS2",32} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, { "Tonga", { {"WGS1",64}, {"WGS2",32} } }, { "Turks", { {"WGS1",128}, {"WGS2",64} } }, - { "default", { {"WGS1",128}, {"WGS2",64} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -99,6 +106,7 @@ const Database::DatabaseEntry XdotComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Ellesmere", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",128}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, @@ -159,6 +167,7 @@ const Database::DatabaseEntry XdotDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, + { "Ellesmere", { {"WGS1",128}, {"WGS2",64} 
} }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, @@ -207,6 +216,7 @@ const Database::DatabaseEntry XdotComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Ellesmere", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index 5f62672b..751f403b 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgemmHalf = { "Xgemm", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, @@ -38,6 +44,7 @@ const Database::DatabaseEntry XgemmSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, + { "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } 
}, { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -116,6 +123,7 @@ const Database::DatabaseEntry XgemmComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, @@ -181,7 +189,7 @@ const Database::DatabaseEntry XgemmComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, 
{"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, } }, } @@ -194,6 +202,7 @@ const Database::DatabaseEntry XgemmDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, @@ -259,12 +268,13 @@ const Database::DatabaseEntry XgemmComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, 
{"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // ARM GPUs @@ -310,7 +320,7 @@ const Database::DatabaseEntry XgemmComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, } }, } diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp index bec0164f..4984475f 100644 --- a/src/database/kernels/xgemm_direct.hpp +++ b/src/database/kernels/xgemm_direct.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgemmDirectHalf = { "XgemmDirect", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"KWID",8}, {"MDIMAD",32}, {"MDIMCD",8}, 
{"NDIMBD",8}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + { "default", { {"KWID",8}, {"MDIMAD",32}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } }, @@ -25,7 +31,7 @@ const Database::DatabaseEntry XgemmDirectHalf = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, } @@ -38,9 +44,10 @@ const Database::DatabaseEntry XgemmDirectSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "Ellesmere", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",32}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } }, { "Tonga", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, - { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, } }, { // Intel CPUs @@ -68,7 +75,7 @@ const Database::DatabaseEntry 
XgemmDirectSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, } }, } @@ -124,6 +131,7 @@ const Database::DatabaseEntry XgemmDirectDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "Ellesmere", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } }, { "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } @@ -159,6 +167,7 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "Ellesmere", { {"KWID",16}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } @@ -181,7 +190,7 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { 
"default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, } }, } diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 3bb31dc2..05f47554 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgemvHalf = { "Xgemv", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",256}, {"WPT1",1} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, @@ -39,6 +45,7 @@ const Database::DatabaseEntry XgemvSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1} } }, + { "Ellesmere", { {"WGS1",256}, {"WPT1",1} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1} } }, { "Oland", { {"WGS1",128}, {"WPT1",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, @@ -110,6 +117,7 @@ const Database::DatabaseEntry XgemvComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Ellesmere", { {"WGS1",32}, {"WPT1",1} } }, { "Hawaii", { {"WGS1",64}, {"WPT1",1} } }, { "Oland", { {"WGS1",64}, {"WPT1",1} } }, { "Pitcairn", { {"WGS1",64}, {"WPT1",1} } }, @@ -177,6 +185,7 @@ const Database::DatabaseEntry XgemvDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Ellesmere", { {"WGS1",32}, {"WPT1",1} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1} } }, { "Oland", { {"WGS1",256}, {"WPT1",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, @@ -235,6 +244,7 @@ const 
Database::DatabaseEntry XgemvComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Ellesmere", { {"WGS1",32}, {"WPT1",1} } }, { "Hawaii", { {"WGS1",64}, {"WPT1",1} } }, { "Oland", { {"WGS1",256}, {"WPT1",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index b9a2eba2..efc64d0c 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgemvFastHalf = { "XgemvFast", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, @@ -39,6 +45,7 @@ const Database::DatabaseEntry XgemvFastSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "Ellesmere", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, @@ -110,6 +117,7 @@ const Database::DatabaseEntry XgemvFastComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, + { "Ellesmere", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, @@ -171,6 +179,7 @@ const Database::DatabaseEntry XgemvFastDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { 
"Ellesmere", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, @@ -229,6 +238,7 @@ const Database::DatabaseEntry XgemvFastComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Ellesmere", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index ee3cebdc..aff83ff2 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgemvFastRotHalf = { "XgemvFastRot", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } }, @@ -38,6 +44,7 @@ const Database::DatabaseEntry XgemvFastRotSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, + { "Ellesmere", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "Tonga", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } }, { "Turks", { {"VW3",8}, {"WGS3",128}, {"WPT3",16} } }, { "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } }, @@ -86,6 +93,7 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, + { "Ellesmere", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, { "Tonga", { {"VW3",4}, 
{"WGS3",32}, {"WPT3",32} } }, { "Turks", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } }, { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, @@ -124,6 +132,7 @@ const Database::DatabaseEntry XgemvFastRotDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "Ellesmere", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, { "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } @@ -161,6 +170,7 @@ const Database::DatabaseEntry XgemvFastRotComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, + { "Ellesmere", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, { "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",8} } }, { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } }, } diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index ade9dcbf..ab528800 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgerHalf = { "Xger", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, @@ -26,7 +32,7 @@ const Database::DatabaseEntry XgerHalf = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",4}, {"WGS2",8}, {"WPT",2} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, } }, } @@ -39,13 +45,14 @@ const Database::DatabaseEntry XgerSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, + { "Ellesmere", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, { "Hawaii", { {"WGS1",64}, {"WGS2",2}, 
{"WPT",1} } }, { "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, { "Tonga", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, { "Turks", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, } }, { // ARM GPUs @@ -92,7 +99,7 @@ const Database::DatabaseEntry XgerSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, } }, } @@ -105,13 +112,14 @@ const Database::DatabaseEntry XgerComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, + { "Ellesmere", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, { "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, { "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, { "Tonga", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Turks", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, } }, { // ARM GPUs @@ -171,12 +179,13 @@ const Database::DatabaseEntry XgerDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, + { "Ellesmere", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, { "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, { "Tonga", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } }, - { "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } }, + { "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, } }, { 
// ARM GPUs @@ -225,6 +234,7 @@ const Database::DatabaseEntry XgerComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "Ellesmere", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, { "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, { "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, -- cgit v1.2.3