diff options
-rw-r--r-- | CMakeLists.txt | 98 | ||||
-rw-r--r-- | include/clblast_cuda.h | 643 | ||||
-rwxr-xr-x | scripts/generator/generator.py | 12 | ||||
-rw-r--r-- | scripts/generator/generator/cpp.py | 22 | ||||
-rw-r--r-- | scripts/generator/generator/routine.py | 28 | ||||
-rw-r--r-- | src/api_common.cpp | 2 | ||||
-rw-r--r-- | src/clblast_cuda.cpp | 2336 | ||||
-rw-r--r-- | src/cupp11.hpp | 770 | ||||
-rw-r--r-- | src/utilities/buffer_test.hpp | 2 | ||||
-rw-r--r-- | src/utilities/utilities.hpp | 9 |
10 files changed, 3874 insertions, 48 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 52accbd4..a5a41f35 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,23 @@ option(TESTS "Enable compilation of the correctness tests" OFF) option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF) option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF) +# Select between an OpenCL API (default) or a CUDA API (beta) +option(OPENCL "Build CLBlast with an OpenCL API (default)" ON) +option(CUDA "Build CLBlast with a CUDA API (beta)" OFF) +if(NOT OPENCL AND NOT CUDA) + message(FATAL_ERROR "No API selected, choose from OpenCL (-DOPENCL=ON) or CUDA (-DCUDA=ON)") +endif() +if(OPENCL AND CUDA) + message(FATAL_ERROR "Multiple APIs selected, choose either OpenCL (-DOPENCL=ON -DCUDA=OFF) or CUDA (-DCUDA=ON -DOPENCL=OFF)") +endif() +if(OPENCL) + message("-- Building CLBlast with OpenCL API (default)") + add_definitions(-DOPENCL_API) +elseif(CUDA) + message("-- Building CLBlast with CUDA API (beta)") + add_definitions(-DCUDA_API) +endif() + # Compile in verbose mode with additional diagnostic messages option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF) if(VERBOSE) @@ -123,8 +140,18 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") # Package scripts location set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${clblast_SOURCE_DIR}/cmake/Modules/") -# Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH. -find_package(OpenCL REQUIRED) +if(OPENCL) + # Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH. 
+ find_package(OpenCL REQUIRED) + set(API_LIBRARIES ${OPENCL_LIBRARIES}) + set(API_INCLUDE_DIRS ${OPENCL_INCLUDE_DIRS}) +elseif(CUDA) + # For CUDA, the "FindCUDA.cmake" is part of CMake + find_package(CUDA REQUIRED) + set(API_LIBRARIES cuda nvrtc) + set(API_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) + link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) +endif() # Locates the CLTune library in case the tuners need to be compiled. "FindCLTune.cmake" is included. if(TUNERS) @@ -161,11 +188,6 @@ set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemm_direct xgemv) set(DATABASES copy pad padtranspose transpose xaxpy xdot xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger) -set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched) -set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) -if(NETLIB) - set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) -endif() set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) @@ -173,6 +195,16 @@ set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm) set(LEVELX_ROUTINES xomatcopy xim2col xaxpybatched xgemmbatched) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES}) set(PRECISIONS 32 64 3232 6464 16) +if(OPENCL) + set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched) + set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) + if(NETLIB) + set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) + endif() +elseif(CUDA) + set(SAMPLE_PROGRAMS_CPP ) + set(SAMPLE_PROGRAMS_C ) +endif() # ================================================================================================== @@ -184,14 +216,10 @@ set(SOURCES src/utilities/utilities.cpp src/api_common.cpp src/cache.cpp - src/clblast.cpp - src/clblast_c.cpp src/routine.cpp src/routines/levelx/xinvert.cpp # only source, don't include it as a 
test ) set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio - include/clblast.h - include/clblast_c.h include/clblast_half.h src/database/apple_cpu_fallback.hpp src/database/database.hpp @@ -209,13 +237,19 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual src/utilities/msvc.hpp src/utilities/utilities.hpp src/cache.hpp - src/clpp11.hpp src/cxpp11_common.hpp src/routine.hpp ) -if(NETLIB) - set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp) - set(HEADERS ${HEADERS} include/clblast_netlib_c.h) +if(OPENCL) + set(SOURCES ${SOURCES} src/clblast.cpp src/clblast_c.cpp) + set(HEADERS ${HEADERS} include/clblast.h include/clblast_c.h src/clpp11.hpp) + if(NETLIB) + set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp) + set(HEADERS ${HEADERS} include/clblast_netlib_c.h) + endif() +elseif(CUDA) + set(SOURCES ${SOURCES} src/clblast_cuda.cpp) + set(HEADERS ${HEADERS} include/clblast_cuda.h src/cupp11.hpp) endif() foreach(ROUTINE ${LEVEL1_ROUTINES}) set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp) @@ -249,14 +283,14 @@ else(BUILD_SHARED_LIBS) add_library(clblast STATIC ${SOURCES} ${HEADERS}) endif() -target_link_libraries(clblast ${OPENCL_LIBRARIES}) +target_link_libraries(clblast ${API_LIBRARIES}) # Includes directories: CLBlast and OpenCL target_include_directories(clblast PUBLIC $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include> $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src> $<INSTALL_INTERFACE:include> - ${OPENCL_INCLUDE_DIRS}) + ${API_INCLUDE_DIRS}) # Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built if(MSVC) @@ -267,11 +301,15 @@ endif() # Installs the library install(TARGETS clblast EXPORT CLBlast DESTINATION lib) -install(FILES include/clblast.h DESTINATION include) -install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) -if(NETLIB) - install(FILES include/clblast_netlib_c.h DESTINATION include) +if(OPENCL) + 
install(FILES include/clblast.h DESTINATION include) + install(FILES include/clblast_c.h DESTINATION include) + if(NETLIB) + install(FILES include/clblast_netlib_c.h DESTINATION include) + endif() +elseif(CUDA) + install(FILES include/clblast_cuda.h DESTINATION include) endif() # Installs the config for find_package in dependent projects @@ -291,19 +329,21 @@ endif() if(SAMPLES) # Downloads the cl.hpp file from Khronos - file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp) + if(OPENCL) + file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp) + endif() # Adds sample programs (C++) foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP}) add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp) - target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_sample_${SAMPLE} clblast ${API_LIBRARIES}) install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin) endforeach() # Adds sample programs (C) foreach(SAMPLE ${SAMPLE_PROGRAMS_C}) add_executable(clblast_sample_${SAMPLE}_c samples/${SAMPLE}.c) - target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${API_LIBRARIES}) install(TARGETS clblast_sample_${SAMPLE}_c DESTINATION bin) endforeach() @@ -324,7 +364,7 @@ if(TUNERS) # Adds tuning executables foreach(KERNEL ${KERNELS}) add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp) - target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS}) install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin) endforeach() @@ -429,7 +469,7 @@ if(CLIENTS) test/routines/levelx/${ROUTINE}.hpp) endforeach() foreach(ROUTINE 
${ROUTINES}) - target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) install(TARGETS clblast_client_${ROUTINE} DESTINATION bin) endforeach() @@ -481,7 +521,7 @@ if(TESTS) test/routines/levelx/${ROUTINE}.hpp) endforeach() foreach(ROUTINE ${ROUTINES}) - target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES}) install(TARGETS clblast_test_${ROUTINE} DESTINATION bin) target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE}) @@ -492,7 +532,7 @@ if(TESTS) foreach(MISC_TEST ${MISC_TESTS}) add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON} test/correctness/misc/${MISC_TEST}.cpp) - target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_test_${MISC_TEST} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${clblast_SOURCE_DIR} ${REF_INCLUDES}) @@ -501,7 +541,7 @@ if(TESTS) # CLBlast diagnostics add_executable(clblast_test_diagnostics ${TESTS_COMMON} test/diagnostics.cpp) - target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_test_diagnostics PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${clblast_SOURCE_DIR} ${REF_INCLUDES}) diff --git a/include/clblast_cuda.h b/include/clblast_cuda.h new file mode 100644 index 00000000..c125c302 --- /dev/null +++ 
b/include/clblast_cuda.h @@ -0,0 +1,643 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the special CUDA interface to the CLBlast BLAS routines. It also contains the +// definitions of the returned status codes and the layout and transpose types. This is the header +// users of the CUDA API of CLBlast should include and use. +// +// ================================================================================================= + +#ifndef CLBLAST_CLBLAST_CUDA_H_ +#define CLBLAST_CLBLAST_CUDA_H_ + +#include <cstdlib> // For size_t +#include <string> // For OverrideParameters function +#include <unordered_map> // For OverrideParameters function + +// CUDA +#include <cuda.h> // CUDA driver API +#include <nvrtc.h> // NVIDIA runtime compilation API + +// Exports library functions under Windows when building a DLL. See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#if defined(_WIN32) && defined(CLBLAST_DLL) + #if defined(COMPILING_DLL) + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif +#else + #define PUBLIC_API +#endif + +namespace clblast { +// ================================================================================================= + +// Status codes. These codes can be returned by functions declared in this header file. The error +// codes match either the standard CUDA driver API error codes or the regular CLBlast error codes. 
+enum class StatusCode { + + // Status codes in common with the OpenCL standard + kSuccess = 0, // CUDA_SUCCESS + kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions + kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total + kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension + + // Status codes in common with the clBLAS library + kNotImplemented = -1024, // Routine or functionality not implemented yet + kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer + kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer + kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer + kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer + kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer + kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero + kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension + kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension + kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension + kInvalidIncrementX = -1013, // Increment of vector X cannot be zero + kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero + kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small + kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small + kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small + kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small + kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small + + // Custom additional status codes for CLBlast + kInvalidBatchCount = -2049, // The batch count needs to be positive + kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel + kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target 
kernel + kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device + kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device + kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device + kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer + kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small + kDatabaseError = -2041, // Entry for the device was not found in the database + kUnknownError = -2040, // A catch-all error code representing an unspecified error + kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception +}; + +// Matrix layout and transpose types +enum class Layout { kRowMajor = 101, kColMajor = 102 }; +enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 }; +enum class Triangle { kUpper = 121, kLower = 122 }; +enum class Diagonal { kNonUnit = 131, kUnit = 132 }; +enum class Side { kLeft = 141, kRight = 142 }; + +// Precision scoped enum (values in bits) +enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, + kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 }; + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +template <typename T> +StatusCode Rotg(CUdeviceptr sa_buffer, const size_t sa_offset, + CUdeviceptr sb_buffer, const size_t sb_offset, + CUdeviceptr sc_buffer, const size_t sc_offset, + CUdeviceptr ss_buffer, const size_t ss_offset, + CUstream* stream); + +// Generate modified givens plane rotation: SROTMG/DROTMG +template <typename T> +StatusCode Rotmg(CUdeviceptr sd1_buffer, const size_t sd1_offset, + CUdeviceptr sd2_buffer, const size_t sd2_offset, + CUdeviceptr sx1_buffer, const size_t sx1_offset, + 
const CUdeviceptr sy1_buffer, const size_t sy1_offset, + CUdeviceptr sparam_buffer, const size_t sparam_offset, + CUstream* stream); + +// Apply givens plane rotation: SROT/DROT +template <typename T> +StatusCode Rot(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const T cos, + const T sin, + CUstream* stream); + +// Apply modified givens plane rotation: SROTM/DROTM +template <typename T> +StatusCode Rotm(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr sparam_buffer, const size_t sparam_offset, + CUstream* stream); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +template <typename T> +StatusCode Swap(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +template <typename T> +StatusCode Scal(const size_t n, + const T alpha, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +template <typename T> +StatusCode Copy(const size_t n, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +template <typename T> +StatusCode Axpy(const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Dot product of two vectors: SDOT/DDOT/HDOT +template <typename T> +StatusCode Dot(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, 
const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template <typename T> +StatusCode Dotu(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template <typename T> +StatusCode Dotc(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +template <typename T> +StatusCode Nrm2(const size_t n, + CUdeviceptr nrm2_buffer, const size_t nrm2_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +template <typename T> +StatusCode Asum(const size_t n, + CUdeviceptr asum_buffer, const size_t asum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +template <typename T> +StatusCode Sum(const size_t n, + CUdeviceptr sum_buffer, const size_t sum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template <typename T> +StatusCode Amax(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of absolute minimum value in a vector (non-BLAS function): 
iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template <typename T> +StatusCode Amin(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +template <typename T> +StatusCode Max(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +template <typename T> +StatusCode Min(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +template <typename T> +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +template <typename T> +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t 
y_offset, const size_t y_inc, + CUstream* stream); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +template <typename T> +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template <typename T> +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template <typename T> +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +template <typename T> +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +template <typename T> +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const 
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +template <typename T> +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +template <typename T> +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +template <typename T> +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +template <typename T> +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr ap_buffer, const size_t ap_offset, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template <typename T> +StatusCode Trsv(const Layout layout, 
const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template <typename T> +StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template <typename T> +StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr ap_buffer, const size_t ap_offset, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// General rank-1 matrix update: SGER/DGER/HGER +template <typename T> +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// General rank-1 complex matrix update: CGERU/ZGERU +template <typename T> +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template <typename T> +StatusCode Gerc(const Layout layout, + const size_t m, const 
size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Hermitian rank-1 matrix update: CHER/ZHER +template <typename T> +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template <typename T> +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template <typename T> +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template <typename T> +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +template <typename T> +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, 
const size_t a_ld, + CUstream* stream); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +template <typename T> +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +template <typename T> +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +template <typename T> +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +template <typename T> +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Symmetric matrix-matrix multiplication: 
SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +template <typename T> +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +template <typename T> +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +template <typename T> +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +template <typename T> +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +template <typename T> +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, 
const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +template <typename T, typename U> +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +template <typename T> +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template <typename T> +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template <typename T> +StatusCode Omatcopy(const Layout layout, const Transpose 
a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +template <typename T> +StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const CUdeviceptr im_buffer, const size_t im_offset, + CUdeviceptr col_buffer, const size_t col_offset, + CUstream* stream); + +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template <typename T> +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc, + CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + CUstream* stream); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +template <typename T> +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + CUstream* stream); + +// ================================================================================================= + +// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on +// for the same device. This cache can be cleared to free up system memory or in case of debugging. 
+StatusCode PUBLIC_API ClearCache(); + +// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels. +// Further CLBlast routine calls will then run at maximum speed. +StatusCode PUBLIC_API FillCache(const CUdevice device); + +// ================================================================================================= + +// Overrides tuning parameters for a specific device-precision-kernel combination. The next time +// the target routine is called it will re-compile and use the new parameters from then on. +StatusCode PUBLIC_API OverrideParameters(const CUdevice device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map<std::string,size_t> &parameters); + +// ================================================================================================= + +} // namespace clblast + +// CLBLAST_CLBLAST_CUDA_H_ +#endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 0d34d7fe..520e3fc8 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -12,6 +12,8 @@ # clblast.cpp # clblast_c.h # clblast_c.cpp +# clblast_cuda.h +# clblast_cuda.cpp # clblast_netlib_c.h # clblast_netlib_c.cpp # wrapper_clblas.h @@ -41,9 +43,11 @@ FILES = [ "/test/wrapper_cublas.hpp", "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", + "/include/clblast_cuda.h", + "/src/clblast_cuda.cpp", ] -HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32] -FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2] +HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32, 94, 21] +FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2, 25, 3] HEADER_LINES_DOC = 0 FOOTER_LINES_DOC = 63 @@ -224,6 +228,10 @@ def main(argv): if i == 8: if not routine.batched: body += cpp.clblast_netlib_c_cc(routine) + if i == 9: + body += cpp.clblast_h(routine, cuda=True) + if i == 10: + body += cpp.clblast_cc(routine, cuda=True) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) diff 
--git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 5fef3083..f1ee1959 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -36,19 +36,19 @@ HEADER = NL + SEPARATOR + """ """ + SEPARATOR + NL -def clblast_h(routine): +def clblast_h(routine, cuda=False): """The C++ API header (.h)""" result = NL + "// " + routine.description + ": " + routine.short_names() + NL - result += routine.routine_header_cpp(12, " = nullptr") + ";" + NL + result += routine.routine_header_cpp(12, " = nullptr", cuda) + ";" + NL return result -def clblast_cc(routine): +def clblast_cc(routine, cuda=False): """The C++ API implementation (.cpp)""" indent1 = " " * (15 + routine.length()) result = NL + "// " + routine.description + ": " + routine.short_names() + NL if routine.implemented: - result += routine.routine_header_cpp(12, "") + " {" + NL + result += routine.routine_header_cpp(12, "", cuda) + " {" + NL result += " try {" + NL result += " auto queue_cpp = Queue(*queue);" + NL result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL @@ -60,14 +60,22 @@ def clblast_cc(routine): result += " return StatusCode::kSuccess;" + NL result += " } catch (...) 
{ return DispatchException(); }" + NL else: - result += routine.routine_header_type_cpp(12) + " {" + NL + result += routine.routine_header_type_cpp(12, cuda) + " {" + NL result += " return StatusCode::kNotImplemented;" + NL result += "}" + NL for flavour in routine.flavours: indent2 = " " * (34 + routine.length() + len(flavour.template)) result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">(" - result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)]) - result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL + arguments = routine.arguments_type(flavour) + if cuda: + arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments] + result += ("," + NL + indent2).join([a for a in arguments]) + result += "," + NL + indent2 + if cuda: + result += "CUstream*" + else: + result += "cl_command_queue*, cl_event*" + result += ");" + NL return result diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index cef7db87..c3c1f775 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -802,22 +802,38 @@ class Routine: """Retrieves a list of routine requirements for documentation""" return self.requirements - def routine_header_cpp(self, spaces, default_event): + def routine_header_cpp(self, spaces, default_event, cuda=False): """Retrieves the C++ templated definition for a routine""" indent = " " * (spaces + self.length()) + arguments = self.arguments_def(self.template) + if cuda: + arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments] result = "template <" + self.template.name + ">\n" result += "StatusCode " + self.capitalized_name() + "(" - result += (",\n" + indent).join([a for a in self.arguments_def(self.template)]) - result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")" + result += (",\n" + indent).join([a for a in arguments]) + result += ",\n" + 
indent + if cuda: + result += "CUstream* stream" + else: + result += "cl_command_queue* queue, cl_event* event" + default_event + result += ")" return result - def routine_header_type_cpp(self, spaces): + def routine_header_type_cpp(self, spaces, cuda=False): """As above, but now without variable names""" indent = " " * (spaces + self.length()) + arguments = self.arguments_type(self.template) + if cuda: + arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments] result = "template <" + self.template.name + ">\n" result += "StatusCode " + self.capitalized_name() + "(" - result += (",\n" + indent).join([a for a in self.arguments_type(self.template)]) - result += ",\n" + indent + "cl_command_queue*, cl_event*)" + result += (",\n" + indent).join([a for a in arguments]) + result += ",\n" + indent + if cuda: + result += "CUstream* stream" + else: + result += "cl_command_queue*, cl_event*" + result += ")" return result def routine_header_c(self, flavour, spaces, extra_qualifier): diff --git a/src/api_common.cpp b/src/api_common.cpp index aa7e2b0f..0d387cd9 100644 --- a/src/api_common.cpp +++ b/src/api_common.cpp @@ -12,9 +12,9 @@ #include <string> +#include "utilities/utilities.hpp" #include "cache.hpp" #include "routines/routines.hpp" -#include "clblast.h" namespace clblast { // ================================================================================================= diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp new file mode 100644 index 00000000..5f30d023 --- /dev/null +++ b/src/clblast_cuda.cpp @@ -0,0 +1,2336 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements all the BLAS API calls (CUDA version). In all cases, it does not much more +// than creating a new object of the appropriate type, and calling the main routine on that object. +// It forwards all status codes to the caller. +// +// ================================================================================================= + +#include <string> + +#include "routines/routines.hpp" +#include "clblast_cuda.h" + +namespace clblast { + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +template <typename T> +StatusCode Rotg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotg<float>(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Rotg<double>(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Generate modified givens plane rotation: SROTMG/DROTMG +template <typename T> +StatusCode Rotmg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotmg<float>(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Rotmg<double>(CUdeviceptr, const size_t, + CUdeviceptr, 
const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Apply givens plane rotation: SROT/DROT +template <typename T> +StatusCode Rot(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const T, + const T, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rot<float>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const float, + const float, + CUstream*); +template StatusCode PUBLIC_API Rot<double>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const double, + const double, + CUstream*); + +// Apply modified givens plane rotation: SROTM/DROTM +template <typename T> +StatusCode Rotm(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotm<float>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Rotm<double>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +template <typename T> +StatusCode Swap(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xswap<T>(queue_cpp, event); + routine.DoSwap(n, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Swap<float>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap<double>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap<float2>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap<double2>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap<half>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +template <typename T> +StatusCode Scal(const size_t n, + const T alpha, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xscal<T>(queue_cpp, event); + routine.DoScal(n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Scal<float>(const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal<double>(const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal<float2>(const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal<double2>(const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal<half>(const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +template <typename T> +StatusCode Copy(const size_t n, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xcopy<T>(queue_cpp, event); + routine.DoCopy(n, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Copy<float>(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy<double>(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy<float2>(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy<double2>(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy<half>(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +template <typename T> +StatusCode Axpy(const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xaxpy<T>(queue_cpp, event); + routine.DoAxpy(n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Axpy<float>(const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy<double>(const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy<float2>(const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy<double2>(const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy<half>(const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Dot product of two vectors: SDOT/DDOT/HDOT +template <typename T> +StatusCode Dot(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdot<T>(queue_cpp, event); + routine.DoDot(n, + Buffer<T>(dot_buffer), dot_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Dot<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dot<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dot<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template <typename T> +StatusCode Dotu(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdotu<T>(queue_cpp, event); + routine.DoDotu(n, + Buffer<T>(dot_buffer), dot_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Dotu<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dotu<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template <typename T> +StatusCode Dotc(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdotc<T>(queue_cpp, event); + routine.DoDotc(n, + Buffer<T>(dot_buffer), dot_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Dotc<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dotc<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +template <typename T> +StatusCode Nrm2(const size_t n, + CUdeviceptr nrm2_buffer, const size_t nrm2_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xnrm2<T>(queue_cpp, event); + routine.DoNrm2(n, + Buffer<T>(nrm2_buffer), nrm2_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Nrm2<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +template <typename T> +StatusCode Asum(const size_t n, + CUdeviceptr asum_buffer, const size_t asum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xasum<T>(queue_cpp, event); + routine.DoAsum(n, + Buffer<T>(asum_buffer), asum_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Asum<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +template <typename T> +StatusCode Sum(const size_t n, + CUdeviceptr sum_buffer, const size_t sum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsum<T>(queue_cpp, event); + routine.DoSum(n, + Buffer<T>(sum_buffer), sum_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Sum<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template <typename T> +StatusCode Amax(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xamax<T>(queue_cpp, event); + routine.DoAmax(n, + Buffer<unsigned int>(imax_buffer), imax_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Amax<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template <typename T> +StatusCode Amin(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xamin<T>(queue_cpp, event); + routine.DoAmin(n, + Buffer<unsigned int>(imin_buffer), imin_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Amin<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +template <typename T> +StatusCode Max(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xmax<T>(queue_cpp, event); + routine.DoMax(n, + Buffer<unsigned int>(imax_buffer), imax_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Max<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +template <typename T> +StatusCode Min(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xmin<T>(queue_cpp, event); + routine.DoMin(n, + Buffer<unsigned int>(imin_buffer), imin_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Min<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +template <typename T> +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgemv<T>(queue_cpp, event); + routine.DoGemv(layout, a_transpose, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Gemv<float>(const Layout, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv<double>(const Layout, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv<float2>(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv<double2>(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv<half>(const Layout, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +template <typename T> +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { 
+ try { + auto queue_cpp = Queue(*queue); + auto routine = Xgbmv<T>(queue_cpp, event); + routine.DoGbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gbmv<float>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv<double>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv<float2>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv<double2>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv<half>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian matrix-vector 
multiplication: CHEMV/ZHEMV +template <typename T> +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhemv<T>(queue_cpp, event); + routine.DoHemv(layout, triangle, + n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hemv<float2>(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hemv<double2>(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template <typename T> +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhbmv<T>(queue_cpp, event); + routine.DoHbmv(layout, triangle, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + 
Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hbmv<float2>(const Layout, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hbmv<double2>(const Layout, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template <typename T> +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpmv<T>(queue_cpp, event); + routine.DoHpmv(layout, triangle, + n, + alpha, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Hpmv<float2>(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hpmv<double2>(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +template <typename T> +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsymv<T>(queue_cpp, event); + routine.DoSymv(layout, triangle, + n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Symv<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symv<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symv<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +template <typename T> +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsbmv<T>(queue_cpp, event); + routine.DoSbmv(layout, triangle, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Sbmv<float>(const Layout, const Triangle, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sbmv<double>(const Layout, const Triangle, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sbmv<half>(const Layout, const Triangle, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +template <typename T> +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspmv<T>(queue_cpp, event); + routine.DoSpmv(layout, triangle, + n, + alpha, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Spmv<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spmv<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spmv<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +template <typename T> +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrmv<T>(queue_cpp, event); + routine.DoTrmv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv<half>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +template <typename T> +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtbmv<T>(queue_cpp, event); + routine.DoTbmv(layout, triangle, a_transpose, diagonal, + n, k, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv<half>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +template <typename T> +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr ap_buffer, const size_t ap_offset, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtpmv<T>(queue_cpp, event); + routine.DoTpmv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template <typename T> +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsv<T>(queue_cpp, event); + routine.DoTrsv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Trsv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template <typename T> +StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Tbsv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbsv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + 
CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template <typename T> +StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Tpsv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpsv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General rank-1 matrix update: SGER/DGER/HGER +template <typename T> +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xger<T>(queue_cpp, event); + 
routine.DoGer(layout, + m, n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Ger<float>(const Layout, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Ger<double>(const Layout, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Ger<half>(const Layout, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General rank-1 complex matrix update: CGERU/ZGERU +template <typename T> +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgeru<T>(queue_cpp, event); + routine.DoGeru(layout, + m, n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Geru<float2>(const Layout, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Geru<double2>(const Layout, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template <typename T> +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgerc<T>(queue_cpp, event); + routine.DoGerc(layout, + m, n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Gerc<float2>(const Layout, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gerc<double2>(const Layout, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian rank-1 matrix update: CHER/ZHER +template <typename T> +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher<std::complex<T>,T>(queue_cpp, event); + routine.DoHer(layout, triangle, + n, + alpha, + Buffer<std::complex<T>>(x_buffer), x_offset, x_inc, + Buffer<std::complex<T>>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Her<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Her<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template <typename T> +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event); + routine.DoHpr(layout, triangle, + n, + alpha, + Buffer<std::complex<T>>(x_buffer), x_offset, x_inc, + Buffer<std::complex<T>>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Hpr<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hpr<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template <typename T> +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher2<T>(queue_cpp, event); + routine.DoHer2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Her2<float2>(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Her2<double2>(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template <typename T> +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpr2<T>(queue_cpp, event); + routine.DoHpr2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Hpr2<float2>(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hpr2<double2>(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +template <typename T> +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr<T>(queue_cpp, event); + routine.DoSyr(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Syr<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +template <typename T> +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspr<T>(queue_cpp, event); + routine.DoSpr(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Spr<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +template <typename T> +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr2<T>(queue_cpp, event); + routine.DoSyr2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Syr2<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +template <typename T> +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspr2<T>(queue_cpp, event); + routine.DoSpr2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Spr2<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr2<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr2<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +template <typename T> +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgemm<T>(queue_cpp, event); + routine.DoGemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Gemm<float>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm<double>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm<float2>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm<double2>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm<half>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +template <typename T> +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t 
b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsymm<T>(queue_cpp, event); + routine.DoSymm(layout, side, triangle, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Symm<float>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm<double>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm<float2>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm<double2>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm<half>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); 
+ +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +template <typename T> +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhemm<T>(queue_cpp, event); + routine.DoHemm(layout, side, triangle, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hemm<float2>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hemm<double2>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +template <typename T> +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyrk<T>(queue_cpp, event); + routine.DoSyrk(layout, triangle, a_transpose, + n, k, + alpha, + 
Buffer<T>(a_buffer), a_offset, a_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syrk<float>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk<double>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk<float2>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk<double2>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk<half>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +template <typename T> +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xherk<std::complex<T>,T>(queue_cpp, event); + routine.DoHerk(layout, triangle, 
a_transpose, + n, k, + alpha, + Buffer<std::complex<T>>(a_buffer), a_offset, a_ld, + beta, + Buffer<std::complex<T>>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Herk<float>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Herk<double>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +template <typename T> +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr2k<T>(queue_cpp, event); + routine.DoSyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Syr2k<float>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k<double>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k<float2>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k<double2>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k<half>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +template <typename T, typename U> +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + CUdeviceptr 
c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher2k<T,U>(queue_cpp, event); + routine.DoHer2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Her2k<float2,float>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Her2k<double2,double>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +template <typename T> +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrmm<T>(queue_cpp, event); + routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API Trmm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template <typename T> +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsm<T>(queue_cpp, event); + routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + 
Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template <typename T> +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xomatcopy<T>(queue_cpp, 
event); + routine.DoOmatcopy(layout, a_transpose, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy<double>(const Layout, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy<float2>(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy<double2>(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +template <typename T> +StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const CUdeviceptr im_buffer, const size_t im_offset, + CUdeviceptr col_buffer, const size_t col_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xim2col<T>(queue_cpp, event); + 
// NOTE(review): this chunk opens in the middle of the Im2col<T> definition. Its signature and the
// start of its try-block lie before the visible range, so the fragment is reproduced unchanged.
    routine.DoIm2col(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                     Buffer<T>(im_buffer), im_offset,
                     Buffer<T>(col_buffer), col_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
// Explicit instantiations of Im2col for every supported precision
template StatusCode PUBLIC_API Im2col<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                             const CUdeviceptr, const size_t,
                                             CUdeviceptr, const size_t,
                                             CUstream*);
template StatusCode PUBLIC_API Im2col<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                              const CUdeviceptr, const size_t,
                                              CUdeviceptr, const size_t,
                                              CUstream*);
template StatusCode PUBLIC_API Im2col<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                              const CUdeviceptr, const size_t,
                                              CUdeviceptr, const size_t,
                                              CUstream*);
template StatusCode PUBLIC_API Im2col<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                               const CUdeviceptr, const size_t,
                                               CUdeviceptr, const size_t,
                                               CUstream*);
template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                            const CUdeviceptr, const size_t,
                                            CUdeviceptr, const size_t,
                                            CUstream*);

// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
//
// Copies the 'batch_count' per-batch scalars and offsets from the raw host arrays into
// std::vector containers, wraps the raw device pointers in non-owning Buffer<T> objects and
// forwards everything to XaxpyBatched<T>::DoAxpyBatched. Returns StatusCode::kSuccess when the
// call completes; any exception is converted into a status code through DispatchException().
//
// NOTE(review): the body uses 'queue' and 'event', but the visible signature only declares
// 'CUstream* stream'. Unless those names are provided elsewhere, this does not compile as shown.
// The stream parameter is also never read. Confirm against the generator that emits this file.
template <typename T>
StatusCode AxpyBatched(const size_t n,
                       const T *alphas,
                       const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc,
                       CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc,
                       const size_t batch_count,
                       CUstream* stream) {
  try {
    auto queue_cpp = Queue(*queue);  // NOTE(review): 'queue' is not declared in the visible scope
    auto routine = XaxpyBatched<T>(queue_cpp, event);  // NOTE(review): same for 'event'
    // Host-side copies of the per-batch arguments (one entry per batch)
    auto alphas_cpp = std::vector<T>();
    auto x_offsets_cpp = std::vector<size_t>();
    auto y_offsets_cpp = std::vector<size_t>();
    for (auto batch = size_t{0}; batch < batch_count; ++batch) {
      alphas_cpp.push_back(alphas[batch]);
      x_offsets_cpp.push_back(x_offsets[batch]);
      y_offsets_cpp.push_back(y_offsets[batch]);
    }
    routine.DoAxpyBatched(n,
                          alphas_cpp,
                          Buffer<T>(x_buffer), x_offsets_cpp, x_inc,
                          Buffer<T>(y_buffer), y_offsets_cpp, y_inc,
                          batch_count);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
// Explicit instantiations of AxpyBatched for every supported precision
template StatusCode PUBLIC_API AxpyBatched<float>(const size_t,
                                                  const float*,
                                                  const CUdeviceptr, const size_t*, const size_t,
                                                  CUdeviceptr, const size_t*, const size_t,
                                                  const size_t,
                                                  CUstream*);
template StatusCode PUBLIC_API AxpyBatched<double>(const size_t,
                                                   const double*,
                                                   const CUdeviceptr, const size_t*, const size_t,
                                                   CUdeviceptr, const size_t*, const size_t,
                                                   const size_t,
                                                   CUstream*);
template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t,
                                                   const float2*,
                                                   const CUdeviceptr, const size_t*, const size_t,
                                                   CUdeviceptr, const size_t*, const size_t,
                                                   const size_t,
                                                   CUstream*);
template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t,
                                                    const double2*,
                                                    const CUdeviceptr, const size_t*, const size_t,
                                                    CUdeviceptr, const size_t*, const size_t,
                                                    const size_t,
                                                    CUstream*);
template StatusCode PUBLIC_API AxpyBatched<half>(const size_t,
                                                 const half*,
                                                 const CUdeviceptr, const size_t*, const size_t,
                                                 CUdeviceptr, const size_t*, const size_t,
                                                 const size_t,
                                                 CUstream*);

// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
//
// Copies the 'batch_count' per-batch scalars (alphas, betas) and matrix offsets from the raw host
// arrays into std::vector containers, wraps the raw device pointers in non-owning Buffer<T>
// objects and forwards everything to XgemmBatched<T>::DoGemmBatched. Returns
// StatusCode::kSuccess when the call completes; any exception is converted into a status code
// through DispatchException().
//
// NOTE(review): as in AxpyBatched, the body uses 'queue' and 'event' while the visible signature
// only declares 'CUstream* stream' (which is never read). Confirm against the generator.
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                       const size_t m, const size_t n, const size_t k,
                       const T *alphas,
                       const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld,
                       const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld,
                       const T *betas,
                       CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld,
                       const size_t batch_count,
                       CUstream* stream) {
  try {
    auto queue_cpp = Queue(*queue);  // NOTE(review): 'queue' is not declared in the visible scope
    auto routine = XgemmBatched<T>(queue_cpp, event);  // NOTE(review): same for 'event'
    // Host-side copies of the per-batch arguments (one entry per batch)
    auto alphas_cpp = std::vector<T>();
    auto betas_cpp = std::vector<T>();
    auto a_offsets_cpp = std::vector<size_t>();
    auto b_offsets_cpp = std::vector<size_t>();
    auto c_offsets_cpp = std::vector<size_t>();
    for (auto batch = size_t{0}; batch < batch_count; ++batch) {
      alphas_cpp.push_back(alphas[batch]);
      betas_cpp.push_back(betas[batch]);
      a_offsets_cpp.push_back(a_offsets[batch]);
      b_offsets_cpp.push_back(b_offsets[batch]);
      c_offsets_cpp.push_back(c_offsets[batch]);
    }
    routine.DoGemmBatched(layout, a_transpose, b_transpose,
                          m, n, k,
                          alphas_cpp,
                          Buffer<T>(a_buffer), a_offsets_cpp, a_ld,
                          Buffer<T>(b_buffer), b_offsets_cpp, b_ld,
                          betas_cpp,
                          Buffer<T>(c_buffer), c_offsets_cpp, c_ld,
                          batch_count);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
// Explicit instantiations of GemmBatched for every supported precision
template StatusCode PUBLIC_API GemmBatched<float>(const Layout, const Transpose, const Transpose,
                                                  const size_t, const size_t, const size_t,
                                                  const float*,
                                                  const CUdeviceptr, const size_t*, const size_t,
                                                  const CUdeviceptr, const size_t*, const size_t,
                                                  const float*,
                                                  CUdeviceptr, const size_t*, const size_t,
                                                  const size_t,
                                                  CUstream*);
template StatusCode PUBLIC_API GemmBatched<double>(const Layout, const Transpose, const Transpose,
                                                   const size_t, const size_t, const size_t,
                                                   const double*,
                                                   const CUdeviceptr, const size_t*, const size_t,
                                                   const CUdeviceptr, const size_t*, const size_t,
                                                   const double*,
                                                   CUdeviceptr, const size_t*, const size_t,
                                                   const size_t,
                                                   CUstream*);
template StatusCode PUBLIC_API GemmBatched<float2>(const Layout, const Transpose, const Transpose,
                                                   const size_t, const size_t, const size_t,
                                                   const float2*,
                                                   const CUdeviceptr, const size_t*, const size_t,
                                                   const CUdeviceptr, const size_t*, const size_t,
                                                   const float2*,
                                                   CUdeviceptr, const size_t*, const size_t,
                                                   const size_t,
                                                   CUstream*);
template StatusCode PUBLIC_API GemmBatched<double2>(const Layout, const Transpose, const Transpose,
                                                    const size_t, const size_t, const size_t,
                                                    const double2*,
                                                    const CUdeviceptr, const size_t*, const size_t,
                                                    const CUdeviceptr, const size_t*, const size_t,
                                                    const double2*,
                                                    CUdeviceptr, const size_t*, const size_t,
                                                    const size_t,
                                                    CUstream*);
template StatusCode PUBLIC_API GemmBatched<half>(const Layout, const Transpose, const Transpose,
                                                 const size_t, const size_t, const size_t,
                                                 const half*,
                                                 const CUdeviceptr, const size_t*, const size_t,
                                                 const CUdeviceptr, const size_t*, const size_t,
                                                 const half*,
                                                 CUdeviceptr, const size_t*, const size_t,
                                                 const size_t,
                                                 CUstream*);

// =================================================================================================
} // namespace clblast

// Diff metadata that shares the last line of this chunk (start of the next file's header):
// diff --git a/src/cupp11.hpp b/src/cupp11.hpp
// new file mode 100644
// index
00000000..988366ea --- /dev/null +++ b/src/cupp11.hpp @@ -0,0 +1,770 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API +// calls. The main benefits are increased abstraction, automatic memory management, and portability. +// Portability here means that a similar header exists for CUDA with the same classes and +// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. +// +// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and +// therefore contains the following header copyright notice: +// +// ================================================================================================= +// +// Copyright 2015 SURFsara +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// ================================================================================================= + +#ifndef CLBLAST_CUPP11_H_ +#define CLBLAST_CUPP11_H_ + +// C++ +#include <algorithm> // std::copy +#include <string> // std::string +#include <vector> // std::vector +#include <memory> // std::shared_ptr + +// CUDA +#include <cuda.h> // CUDA driver API +#include <nvrtc.h> // NVIDIA runtime compilation API + +// Exception classes +#include "cxpp11_common.hpp" + +namespace clblast { +// ================================================================================================= + +// Max-length of strings +constexpr auto kStringLength = 256; + +// ================================================================================================= + +// Represents a runtime error returned by a CUDA driver API function +class CLCudaAPIError : public ErrorCode<DeviceError, CUresult> { +public: + explicit CLCudaAPIError(CUresult status, const std::string &where): + ErrorCode(status, where, "CUDA error: " + where + ": " + + GetErrorName(status) + " --> " + GetErrorString(status)) { + } + + static void Check(const CUresult status, const std::string &where) { + if (status != CUDA_SUCCESS) { + throw CLCudaAPIError(status, where); + } + } + + static void CheckDtor(const CUresult status, const std::string &where) { + if (status != CUDA_SUCCESS) { + fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what()); + } + } + +private: + std::string GetErrorName(CUresult status) const { + const char* status_code; + cuGetErrorName(status, &status_code); + return std::string(status_code); + } + std::string GetErrorString(CUresult status) const { + const char* status_string; + cuGetErrorString(status, &status_string); + return std::string(status_string); + } +}; + +// Represents a runtime error returned by a CUDA runtime compilation API function +class CLCudaAPINVRTCError : public ErrorCode<DeviceError, nvrtcResult> { +public: + explicit 
CLCudaAPINVRTCError(nvrtcResult status, const std::string &where): + ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) { + } + + static void Check(const nvrtcResult status, const std::string &where) { + if (status != NVRTC_SUCCESS) { + throw CLCudaAPINVRTCError(status, where); + } + } + + static void CheckDtor(const nvrtcResult status, const std::string &where) { + if (status != NVRTC_SUCCESS) { + fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what()); + } + } + +private: + std::string GetErrorString(nvrtcResult status) const { + const char* status_string = nvrtcGetErrorString(status); + return std::string(status_string); + } +}; + +// Exception returned when building a program +using CLCudaAPIBuildError = CLCudaAPINVRTCError; + +// ================================================================================================= + +// Error occurred in CUDA driver or runtime compilation API +#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call)) +#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call)) + +// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors) +#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call)) +#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call)) + +// ================================================================================================= + +// C++11 version of two 'CUevent' pointers +class Event { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Regular constructor with memory management + explicit Event(): + start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }), + end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); 
delete e; }) { + CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT)); + CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT)); + } + + // Waits for completion of this event (not implemented for CUDA) + void WaitForCompletion() const { } + + // Retrieves the elapsed time of the last recorded event + float GetElapsedTime() const { + auto result = 0.0f; + cuEventElapsedTime(&result, *start_, *end_); + return result; + } + + // Accessors to the private data-members + const CUevent& start() const { return *start_; } + const CUevent& end() const { return *end_; } + Event* pointer() { return this; } +private: + std::shared_ptr<CUevent> start_; + std::shared_ptr<CUevent> end_; +}; + +// Pointer to a CUDA event +using EventPointer = Event*; + +// ================================================================================================= + +// Raw platform ID type +using RawPlatformID = size_t; + +// The CUDA platform: initializes the CUDA driver API +class Platform { +public: + + // Initializes the platform. Note that the platform ID variable is not actually used for CUDA. 
+ explicit Platform(const size_t platform_id) : platform_id_(0) { + if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); } + CheckError(cuInit(0)); + } + + // Methods to retrieve platform information + std::string Name() const { return "CUDA"; } + std::string Vendor() const { return "NVIDIA Corporation"; } + std::string Version() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return "CUDA driver "+std::to_string(result); + } + + // Returns the number of devices on this platform + size_t NumDevices() const { + auto result = 0; + CheckError(cuDeviceGetCount(&result)); + return static_cast<size_t>(result); + } + + // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0) + const RawPlatformID& operator()() const { return platform_id_; } +private: + const size_t platform_id_; +}; + +// Retrieves a vector with all platforms. Note that there is just one platform in CUDA. +inline std::vector<Platform> GetAllPlatforms() { + auto all_platforms = std::vector<Platform>{ Platform(size_t{0}) }; + return all_platforms; +} + +// ================================================================================================= + +// Raw device ID type +using RawDeviceID = CUdevice; + +// C++11 version of 'CUdevice' +class Device { +public: + + // Constructor based on the regular CUDA data-type + explicit Device(const CUdevice device): device_(device) { } + + // Initialization + explicit Device(const Platform &platform, const size_t device_id) { + auto num_devices = platform.NumDevices(); + if (num_devices == 0) { + throw RuntimeError("Device: no devices found"); + } + if (device_id >= num_devices) { + throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); + } + + CheckError(cuDeviceGet(&device_, device_id)); + } + + // Methods to retrieve device information + RawPlatformID PlatformID() const { return 0; } + std::string Version() const { + auto result = 0; + 
CheckError(cuDriverGetVersion(&result)); + return "CUDA driver "+std::to_string(result); + } + size_t VersionNumber() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return static_cast<size_t>(result); + } + std::string Vendor() const { return "NVIDIA Corporation"; } + std::string Name() const { + auto result = std::string{}; + result.resize(kStringLength); + CheckError(cuDeviceGetName(&result[0], result.size(), device_)); + return result; + } + std::string Type() const { return "GPU"; } + size_t MaxWorkGroupSize() const {return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); } + size_t MaxWorkItemDimensions() const { return size_t{3}; } + std::vector<size_t> MaxWorkItemSizes() const { + return std::vector<size_t>{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X), + GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y), + GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)}; + } + unsigned long LocalMemSize() const { + return static_cast<unsigned long>(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)); + } + + std::string Capabilities() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + return "SM"+std::to_string(major)+"."+std::to_string(minor); + } + bool HasExtension(const std::string &extension) const { return false; } + bool SupportsFP64() const { return true; } + bool SupportsFP16() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + if (major > 5) { return true; } // SM 6.x, 7.x and higher + if (major == 5 && minor == 3) { return true; } // SM 5.3 + return false; + } + + size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); } + size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); } + unsigned long MemorySize() const { + auto result = size_t{0}; + 
CheckError(cuDeviceTotalMem(&result, device_)); + return static_cast<unsigned long>(result); + } + unsigned long MaxAllocSize() const { return MemorySize(); } + size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); } + size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); } + + // Configuration-validity checks + bool IsLocalMemoryValid(const size_t local_mem_usage) const { + return (local_mem_usage <= LocalMemSize()); + } + bool IsThreadConfigValid(const std::vector<size_t> &local) const { + auto local_size = size_t{1}; + for (const auto &item: local) { local_size *= item; } + for (auto i=size_t{0}; i<local.size(); ++i) { + if (local[i] > MaxWorkItemSizes()[i]) { return false; } + } + if (local_size > MaxWorkGroupSize()) { return false; } + if (local.size() > MaxWorkItemDimensions()) { return false; } + return true; + } + + // Query for a specific type of device or brand + bool IsCPU() const { return false; } + bool IsGPU() const { return true; } + bool IsAMD() const { return false; } + bool IsNVIDIA() const { return true; } + bool IsIntel() const { return false; } + bool IsARM() const { return false; } + + // Platform specific extensions + std::string AMDBoardName() const { return ""; } + std::string NVIDIAComputeCapability() const { return Capabilities(); } + + // Accessor to the private data-member + const RawDeviceID& operator()() const { return device_; } +private: + CUdevice device_; + + // Private helper function + size_t GetInfo(const CUdevice_attribute info) const { + auto result = 0; + CheckError(cuDeviceGetAttribute(&result, info, device_)); + return static_cast<size_t>(result); + } +}; + +// ================================================================================================= + +// Raw context type +using RawContext = CUcontext; + +// C++11 version of 'CUcontext' +class Context { +public: + + // Constructor based on the regular CUDA data-type: memory management is 
handled elsewhere + explicit Context(const CUcontext context): + context_(new CUcontext) { + *context_ = context; + } + + // Regular constructor with memory management + explicit Context(const Device &device): + context_(new CUcontext, [](CUcontext* c) { + if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); } + delete c; + }) { + CheckError(cuCtxCreate(context_.get(), 0, device())); + } + + // Accessor to the private data-member + const RawContext& operator()() const { return *context_; } + RawContext* pointer() const { return &(*context_); } +private: + std::shared_ptr<CUcontext> context_; +}; + +// Pointer to a raw CUDA context +using ContextPointer = CUcontext*; + +// ================================================================================================= + +// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code. +class Program { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Source-based constructor with memory management + explicit Program(const Context &, std::string source): + program_(new nvrtcProgram, [](nvrtcProgram* p) { + if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); } + delete p; + }), + source_(std::move(source)), + from_binary_(false) { + const auto source_ptr = &source_[0]; + CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr)); + } + + // PTX-based constructor + explicit Program(const Device &device, const Context &context, const std::string &binary): + program_(nullptr), // not used + source_(binary), + from_binary_(true) { + } + + // Compiles the device program and checks whether or not there are any warnings/errors + void Build(const Device &, std::vector<std::string> &options) { + if (from_binary_) { return; } + auto raw_options = std::vector<const char*>(); + for (const auto &option: options) { + raw_options.push_back(option.c_str()); + } + auto status = nvrtcCompileProgram(*program_, 
raw_options.size(), raw_options.data()); + CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram"); + } + + // Confirms whether a certain status code is an actual compilation error or warning + bool StatusIsCompilationWarningOrError(const nvrtcResult status) const { + return (status == NVRTC_ERROR_INVALID_INPUT); + } + + // Retrieves the warning/error message from the compiler (if any) + std::string GetBuildInfo(const Device &) const { + if (from_binary_) { return std::string{}; } + auto bytes = size_t{0}; + CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0])); + return result; + } + + // Retrieves an intermediate representation of the compiled program (i.e. PTX) + std::string GetIR() const { + if (from_binary_) { return source_; } // holds the PTX + auto bytes = size_t{0}; + CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0])); + return result; + } + + // Accessor to the private data-member + const nvrtcProgram& operator()() const { return *program_; } +private: + std::shared_ptr<nvrtcProgram> program_; + const std::string source_; + const bool from_binary_; +}; + +// ================================================================================================= + +// Raw command-queue type +using RawCommandQueue = CUstream; + +// C++11 version of 'CUstream' +class Queue { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Regular constructor with memory management + explicit Queue(const Context &context, const Device &device): + queue_(new CUstream, [](CUstream* s) { + if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); } + delete s; + }), + context_(context), + device_(device) { + CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING)); + } + + // Synchronizes the 
queue and optionally also an event + void Finish(Event &event) const { + CheckError(cuEventSynchronize(event.end())); + Finish(); + } + void Finish() const { + CheckError(cuStreamSynchronize(*queue_)); + } + + // Retrieves the corresponding context or device + Context GetContext() const { return context_; } + Device GetDevice() const { return device_; } + + // Accessor to the private data-member + const RawCommandQueue& operator()() const { return *queue_; } +private: + std::shared_ptr<CUstream> queue_; + const Context context_; + const Device device_; +}; + +// ================================================================================================= + +// C++11 version of page-locked host memory +template <typename T> +class BufferHost { +public: + + // Regular constructor with memory management + explicit BufferHost(const Context &, const size_t size): + buffer_(new void*, [](void** m) { CheckError(cuMemFreeHost(*m)); delete m; }), + size_(size) { + CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T))); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + return size_*sizeof(T); + } + + // Compatibility with std::vector + size_t size() const { return size_; } + T* begin() { return &static_cast<T*>(*buffer_)[0]; } + T* end() { return &static_cast<T*>(*buffer_)[size_-1]; } + T& operator[](const size_t i) { return static_cast<T*>(*buffer_)[i]; } + T* data() { return static_cast<T*>(*buffer_); } + const T* data() const { return static_cast<T*>(*buffer_); } + +private: + std::shared_ptr<void*> buffer_; + const size_t size_; +}; + +// ================================================================================================= + +// Enumeration of buffer access types +enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; + +// C++11 version of 'CUdeviceptr' +template <typename T> +class Buffer { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + 
explicit Buffer(const CUdeviceptr buffer): + buffer_(new CUdeviceptr), + access_(BufferAccess::kNotOwned) { + *buffer_ = buffer; + } + + // Regular constructor with memory management. If this class does not own the buffer object, then + // the memory will not be freed automatically afterwards. + explicit Buffer(const Context &, const BufferAccess access, const size_t size): + buffer_(new CUdeviceptr, [access](CUdeviceptr* m) { + if (access != BufferAccess::kNotOwned) { CheckError(cuMemFree(*m)); } + delete m; + }), + access_(access) { + CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T))); + } + + // As above, but now with read/write access as a default + explicit Buffer(const Context &context, const size_t size): + Buffer<T>(context, BufferAccess::kReadWrite, size) { + } + + // Constructs a new buffer based on an existing host-container + template <typename Iterator> + explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): + Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) { + auto size = static_cast<size_t>(end - start); + auto pointer = &*start; + CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue())); + queue.Finish(); + } + + // Copies from device to host: reading the device buffer a-synchronously + void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + if (access_ == BufferAccess::kWriteOnly) { + throw LogicError("Buffer: reading from a write-only buffer"); + } + CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue())); + } + void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host, + const size_t offset = 0) const { + if (host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } + ReadAsync(queue, size, host.data(), offset); + } + void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host, + const size_t offset = 0) const { + if 
(host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } + ReadAsync(queue, size, host.data(), offset); + } + + // Copies from device to host: reading the device buffer + void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + ReadAsync(queue, size, host, offset); + queue.Finish(); + } + void Read(const Queue &queue, const size_t size, std::vector<T> &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + void Read(const Queue &queue, const size_t size, BufferHost<T> &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer a-synchronously + void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + if (access_ == BufferAccess::kReadOnly) { + throw LogicError("Buffer: writing to a read-only buffer"); + } + if (GetSize() < (offset+size)*sizeof(T)) { + throw LogicError("Buffer: target device buffer is too small"); + } + CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue())); + } + void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer + void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + WriteAsync(queue, size, host, offset); + queue.Finish(); + } + void Write(const Queue &queue, const size_t size, const std::vector<T> &host, + const size_t offset = 0) { + Write(queue, size, host.data(), offset); + } + void Write(const Queue &queue, const size_t size, const BufferHost<T> &host, + const size_t offset = 0) { + Write(queue, size, 
host.data(), offset); + } + + // Copies the contents of this buffer into another device buffer + void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const { + CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue())); + } + void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const { + CopyToAsync(queue, size, destination); + queue.Finish(); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + auto result = size_t{0}; + CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_)); + return result; + } + + // Accessors to the private data-members + CUdeviceptr operator()() const { return *buffer_; } + CUdeviceptr& operator()() { return *buffer_; } +private: + std::shared_ptr<CUdeviceptr> buffer_; + const BufferAccess access_; +}; + +// ================================================================================================= + +// C++11 version of 'CUfunction' +class Kernel { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Kernel(const CUmodule module, const CUfunction kernel): + module_(module), + kernel_(kernel) { + } + + // Regular constructor with memory management + explicit Kernel(const Program &program, const std::string &name) { + CheckError(cuModuleLoadDataEx(&module_, program.GetIR().data(), 0, nullptr, nullptr)); + CheckError(cuModuleGetFunction(&kernel_, module_, name.c_str())); + } + + // Sets a kernel argument at the indicated position. This stores both the value of the argument + // (as raw bytes) and the index indicating where this value can be found. 
+ template <typename T> + void SetArgument(const size_t index, const T &value) { + if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); } + arguments_indices_[index] = arguments_data_.size(); + for (auto j=size_t(0); j<sizeof(T); ++j) { + arguments_data_.push_back(reinterpret_cast<const char*>(&value)[j]); + } + } + template <typename T> + void SetArgument(const size_t index, Buffer<T> &value) { + SetArgument(index, value()); + } + + // Sets all arguments in one go using parameter packs. Note that this resets all previously set + // arguments using 'SetArgument' or 'SetArguments'. + template <typename... Args> + void SetArguments(Args&... args) { + arguments_indices_.clear(); + arguments_data_.clear(); + SetArgumentsRecursive(0, args...); + } + + // Retrieves the amount of local memory used per work-group for this kernel. Note that this the + // shared memory in CUDA terminology. + unsigned long LocalMemUsage(const Device &) const { + auto result = 0; + CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_)); + return static_cast<unsigned long>(result); + } + + // Retrieves the name of the kernel + std::string GetFunctionName() const { + return std::string{"unknown"}; // Not implemented for the CUDA backend + } + + // Launches a kernel onto the specified queue + void Launch(const Queue &queue, const std::vector<size_t> &global, + const std::vector<size_t> &local, EventPointer event) { + + // Creates the grid (number of threadblocks) and sets the block sizes (threads per block) + auto grid = std::vector<size_t>{1, 1, 1}; + auto block = std::vector<size_t>{1, 1, 1}; + if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); } + for (auto i=size_t{0}; i<local.size(); ++i) { grid[i] = global[i]/local[i]; } + for (auto i=size_t{0}; i<local.size(); ++i) { block[i] = local[i]; } + + // Creates the array of pointers from the arrays of indices & data + std::vector<void*> pointers; + 
for (auto &index: arguments_indices_) { + pointers.push_back(&arguments_data_[index]); + } + + // Launches the kernel, its execution time is recorded by events + CheckError(cuEventRecord(event->start(), queue())); + CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2], + 0, queue(), pointers.data(), nullptr)); + CheckError(cuEventRecord(event->end(), queue())); + } + + // As above, but with an event waiting list + // TODO: Implement this function + void Launch(const Queue &queue, const std::vector<size_t> &global, + const std::vector<size_t> &local, EventPointer event, + std::vector<Event>& waitForEvents) { + if (local.size() == 0) { + throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end"); + } + else if (waitForEvents.size() != 0) { + throw LogicError("Kernel: launching with an event waiting list is not implemented for the CUDA back-end"); + } + else { + return Launch(queue, global, local, event); + } + } + + // Accessors to the private data-members + const CUfunction& operator()() const { return kernel_; } + CUfunction operator()() { return kernel_; } +private: + CUmodule module_; + CUfunction kernel_; + std::vector<size_t> arguments_indices_; // Indices of the arguments + std::vector<char> arguments_data_; // The arguments data as raw bytes + + // Internal implementation for the recursive SetArguments function. + template <typename T> + void SetArgumentsRecursive(const size_t index, T &first) { + SetArgument(index, first); + } + template <typename T, typename... Args> + void SetArgumentsRecursive(const size_t index, T &first, Args&... 
args) { + SetArgument(index, first); + SetArgumentsRecursive(index+1, args...); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_CUPP11_H_ +#endif diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp index b5693181..a5b6be4b 100644 --- a/src/utilities/buffer_test.hpp +++ b/src/utilities/buffer_test.hpp @@ -15,7 +15,7 @@ #ifndef CLBLAST_BUFFER_TEST_H_ #define CLBLAST_BUFFER_TEST_H_ -#include "clblast.h" +#include "utilities/utilities.hpp namespace clblast { // ================================================================================================= diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index b2949c27..f56226be 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -21,8 +21,13 @@ #include <complex> #include <random> -#include "clpp11.hpp" -#include "clblast.h" +#ifdef OPENCL_API + #include "clpp11.hpp" + #include "clblast.h" +#elif CUDA_API + #include "cupp11.hpp" + #include "clblast_cuda.h" +#endif #include "clblast_half.h" #include "utilities/clblast_exceptions.hpp" #include "utilities/msvc.hpp" |