From df3c9f4a8ab9e82ccc4add15b04da5c1b6172b72 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 8 Oct 2017 21:52:02 +0200 Subject: Moved non-routine-specific API functions and includes to separate files --- scripts/generator/generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'scripts') diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index df0eaca0..0d34d7fe 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -42,8 +42,8 @@ FILES = [ "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [122, 79, 126, 24, 29, 41, 29, 65, 32] -FOOTER_LINES = [25, 147, 27, 38, 6, 6, 6, 9, 2] +HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32] +FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2] HEADER_LINES_DOC = 0 FOOTER_LINES_DOC = 63 -- cgit v1.2.3 From 9224da19ef384c1a7986587a682035905f63cf55 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 9 Oct 2017 20:06:25 +0200 Subject: Fixed the Python generator script w.r.t. the recent change of testing direct/in-direct GEMM kernels separately --- scripts/generator/generator/cpp.py | 15 ++++++++++----- scripts/generator/generator/datatype.py | 6 +++--- test/correctness/routines/level3/xgemm.cpp | 5 ----- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'scripts') diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 964b8f3e..5fef3083 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -364,7 +364,9 @@ def performance_test(routine, level_string): found = False for flavour in routine.flavours: if flavour.precision_name == precision: - result += NL + " clblast::RunClient(beta)[0], reinterpret_cast(beta)[1]}" return "beta" - def test_template(self): + def test_template(self, extra_template_argument): """Returns the template as used in the correctness/performance tests""" buffer_type = "clblast::" + self.buffer_type if self.is_non_standard() else self.buffer_type beta_cpp = "clblast::" + self.beta_cpp if self.beta_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2] else self.beta_cpp if self.buffer_type != self.beta_cpp: - return "<" + buffer_type + "," + self.beta_cpp + ">, " + buffer_type + ", " + beta_cpp - return "<" + buffer_type + ">, " + buffer_type + ", " + beta_cpp + return "<" + extra_template_argument + buffer_type + "," + self.beta_cpp + ">, " + buffer_type + ", " + beta_cpp + return "<" + extra_template_argument + buffer_type + ">, " + buffer_type + ", " + beta_cpp def is_complex(self, scalar): """Current scalar is complex""" diff --git a/test/correctness/routines/level3/xgemm.cpp b/test/correctness/routines/level3/xgemm.cpp index bdf57b36..351e538b 100644 --- a/test/correctness/routines/level3/xgemm.cpp +++ b/test/correctness/routines/level3/xgemm.cpp @@ -15,21 +15,16 @@ // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - - // Tests GEMM based on the 'in-direct' kernel errors += clblast::RunTests, float, float>(argc, argv, false, "SGEMM"); errors += clblast::RunTests, double, double>(argc, argv, true, "DGEMM"); errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM"); errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM"); errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HGEMM"); - - // Tests GEMM based on the 'direct' kernel errors += clblast::RunTests, float, float>(argc, argv, true, "SGEMM"); errors += clblast::RunTests, double, double>(argc, argv, true, "DGEMM"); errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM"); errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM"); errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HGEMM"); - if (errors > 0) { return 1; } else { return 0; } } -- cgit v1.2.3 From b901809345848b44442c787380b13db5e5156df0 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 11 Oct 2017 23:16:57 +0200 Subject: Added first (untested) version of a CUDA API --- CMakeLists.txt | 98 +- include/clblast_cuda.h | 643 +++++++++ scripts/generator/generator.py | 12 +- scripts/generator/generator/cpp.py | 22 +- scripts/generator/generator/routine.py | 28 +- src/api_common.cpp | 2 +- src/clblast_cuda.cpp | 2336 ++++++++++++++++++++++++++++++++ src/cupp11.hpp | 770 +++++++++++ src/utilities/buffer_test.hpp | 2 +- src/utilities/utilities.hpp | 9 +- 10 files changed, 3874 insertions(+), 48 deletions(-) create mode 100644 include/clblast_cuda.h create mode 100644 src/clblast_cuda.cpp create mode 100644 src/cupp11.hpp (limited to 'scripts') diff --git a/CMakeLists.txt b/CMakeLists.txt index 52accbd4..a5a41f35 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,23 @@ option(TESTS "Enable compilation of the correctness tests" OFF) option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF) option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF) +# Select between an OpenCL API (default) or a CUDA API (beta) +option(OPENCL "Build CLBlast with an OpenCL API (default)" ON) +option(CUDA "Build CLBlast with a CUDA API (beta)" OFF) +if(NOT OPENCL AND NOT CUDA) + message(FATAL_ERROR "No API selected, choose from OpenCL (-DOPENCL=ON) or CUDA (-DCUDA=ON)") +endif() +if(OPENCL AND CUDA) + message(FATAL_ERROR "Multiple APIs selected, choose either OpenCL (-DOPENCL=ON -DCUDA=OFF) or CUDA (-DCUDA=ON -DOPENCL=OFF)") +endif() +if(OPENCL) + message("-- Building CLBlast with OpenCL API (default)") + add_definitions(-DOPENCL_API) +elseif(CUDA) + message("-- Building CLBlast with CUDA API (beta)") + add_definitions(-DCUDA_API) +endif() + # Compile in verbose mode with additional diagnostic messages option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF) if(VERBOSE) @@ -123,8 +140,18 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") # Package scripts location set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${clblast_SOURCE_DIR}/cmake/Modules/") -# Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH. -find_package(OpenCL REQUIRED) +if(OPENCL) + # Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH. + find_package(OpenCL REQUIRED) + set(API_LIBRARIES ${OPENCL_LIBRARIES}) + set(API_INCLUDE_DIRS ${OPENCL_INCLUDE_DIRS}) +elseif(CUDA) + # For CUDA, the "FindCUDA.cmake" is part of CMake + find_package(CUDA REQUIRED) + set(API_LIBRARIES cuda nvrtc) + set(API_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) + link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) +endif() # Locates the CLTune library in case the tuners need to be compiled. "FindCLTune.cmake" is included. if(TUNERS) @@ -161,11 +188,6 @@ set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemm_direct xgemv) set(DATABASES copy pad padtranspose transpose xaxpy xdot xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger) -set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched) -set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) -if(NETLIB) - set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) -endif() set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) @@ -173,6 +195,16 @@ set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm) set(LEVELX_ROUTINES xomatcopy xim2col xaxpybatched xgemmbatched) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES}) set(PRECISIONS 32 64 3232 6464 16) +if(OPENCL) + set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched) + set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) + if(NETLIB) + set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) + endif() +elseif(CUDA) + set(SAMPLE_PROGRAMS_CPP ) + set(SAMPLE_PROGRAMS_C ) +endif() # ================================================================================================== @@ -184,14 +216,10 @@ set(SOURCES src/utilities/utilities.cpp src/api_common.cpp src/cache.cpp - src/clblast.cpp - src/clblast_c.cpp src/routine.cpp src/routines/levelx/xinvert.cpp # only source, don't include it as a test ) set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio - include/clblast.h - include/clblast_c.h include/clblast_half.h src/database/apple_cpu_fallback.hpp src/database/database.hpp @@ -209,13 +237,19 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual src/utilities/msvc.hpp src/utilities/utilities.hpp src/cache.hpp - src/clpp11.hpp src/cxpp11_common.hpp src/routine.hpp ) -if(NETLIB) - set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp) - set(HEADERS ${HEADERS} include/clblast_netlib_c.h) +if(OPENCL) + set(SOURCES ${SOURCES} src/clblast.cpp src/clblast_c.cpp) + set(HEADERS ${HEADERS} include/clblast.h include/clblast_c.h src/clpp11.hpp) + if(NETLIB) + set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp) + set(HEADERS ${HEADERS} include/clblast_netlib_c.h) + endif() +elseif(CUDA) + set(SOURCES ${SOURCES} src/clblast_cuda.cpp) + set(HEADERS ${HEADERS} include/clblast_cuda.h src/cupp11.hpp) endif() foreach(ROUTINE ${LEVEL1_ROUTINES}) set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp) @@ -249,14 +283,14 @@ else(BUILD_SHARED_LIBS) add_library(clblast STATIC ${SOURCES} ${HEADERS}) endif() -target_link_libraries(clblast ${OPENCL_LIBRARIES}) +target_link_libraries(clblast ${API_LIBRARIES}) # Includes directories: CLBlast and OpenCL target_include_directories(clblast PUBLIC $ $ $ - ${OPENCL_INCLUDE_DIRS}) + ${API_INCLUDE_DIRS}) # Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built if(MSVC) @@ -267,11 +301,15 @@ endif() # Installs the library install(TARGETS clblast EXPORT CLBlast DESTINATION lib) -install(FILES include/clblast.h DESTINATION include) -install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) -if(NETLIB) - install(FILES include/clblast_netlib_c.h DESTINATION include) +if(OPENCL) + install(FILES include/clblast.h DESTINATION include) + install(FILES include/clblast_c.h DESTINATION include) + if(NETLIB) + install(FILES include/clblast_netlib_c.h DESTINATION include) + endif() +elseif(CUDA) + install(FILES include/clblast_cuda.h DESTINATION include) endif() # Installs the config for find_package in dependent projects @@ -291,19 +329,21 @@ endif() if(SAMPLES) # Downloads the cl.hpp file from Khronos - file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp) + if(OPENCL) + file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp) + endif() # Adds sample programs (C++) foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP}) add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp) - target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_sample_${SAMPLE} clblast ${API_LIBRARIES}) install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin) endforeach() # Adds sample programs (C) foreach(SAMPLE ${SAMPLE_PROGRAMS_C}) add_executable(clblast_sample_${SAMPLE}_c samples/${SAMPLE}.c) - target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${API_LIBRARIES}) install(TARGETS clblast_sample_${SAMPLE}_c DESTINATION bin) endforeach() @@ -324,7 +364,7 @@ if(TUNERS) # Adds tuning executables foreach(KERNEL ${KERNELS}) add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp) - target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS}) install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin) endforeach() @@ -429,7 +469,7 @@ if(CLIENTS) test/routines/levelx/${ROUTINE}.hpp) endforeach() foreach(ROUTINE ${ROUTINES}) - target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) install(TARGETS clblast_client_${ROUTINE} DESTINATION bin) endforeach() @@ -481,7 +521,7 @@ if(TESTS) test/routines/levelx/${ROUTINE}.hpp) endforeach() foreach(ROUTINE ${ROUTINES}) - target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES}) install(TARGETS clblast_test_${ROUTINE} DESTINATION bin) target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE}) @@ -492,7 +532,7 @@ if(TESTS) foreach(MISC_TEST ${MISC_TESTS}) add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON} test/correctness/misc/${MISC_TEST}.cpp) - target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_test_${MISC_TEST} PUBLIC $ ${clblast_SOURCE_DIR} ${REF_INCLUDES}) @@ -501,7 +541,7 @@ if(TESTS) # CLBlast diagnostics add_executable(clblast_test_diagnostics ${TESTS_COMMON} test/diagnostics.cpp) - target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_test_diagnostics PUBLIC $ ${clblast_SOURCE_DIR} ${REF_INCLUDES}) diff --git a/include/clblast_cuda.h b/include/clblast_cuda.h new file mode 100644 index 00000000..c125c302 --- /dev/null +++ b/include/clblast_cuda.h @@ -0,0 +1,643 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the special CUDA interface to the CLBlast BLAS routines. It also contains the +// definitions of the returned status codes and the layout and transpose types. This is the header +// users of the CUDA API of CLBlast should include and use. +// +// ================================================================================================= + +#ifndef CLBLAST_CLBLAST_CUDA_H_ +#define CLBLAST_CLBLAST_CUDA_H_ + +#include // For size_t +#include // For OverrideParameters function +#include // For OverrideParameters function + +// CUDA +#include // CUDA driver API +#include // NVIDIA runtime compilation API + +// Exports library functions under Windows when building a DLL. See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#if defined(_WIN32) && defined(CLBLAST_DLL) + #if defined(COMPILING_DLL) + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif +#else + #define PUBLIC_API +#endif + +namespace clblast { +// ================================================================================================= + +// Status codes. These codes can be returned by functions declared in this header file. The error +// codes match either the standard CUDA driver API error codes or the regular CLBlast error codes. +enum class StatusCode { + + // Status codes in common with the OpenCL standard + kSuccess = 0, // CUDA_SUCCESS + kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions + kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total + kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension + + // Status codes in common with the clBLAS library + kNotImplemented = -1024, // Routine or functionality not implemented yet + kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer + kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer + kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer + kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer + kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer + kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero + kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension + kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension + kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension + kInvalidIncrementX = -1013, // Increment of vector X cannot be zero + kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero + kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small + kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small + kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small + kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small + kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small + + // Custom additional status codes for CLBlast + kInvalidBatchCount = -2049, // The batch count needs to be positive + kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel + kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel + kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device + kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device + kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device + kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer + kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small + kDatabaseError = -2041, // Entry for the device was not found in the database + kUnknownError = -2040, // A catch-all error code representing an unspecified error + kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception +}; + +// Matrix layout and transpose types +enum class Layout { kRowMajor = 101, kColMajor = 102 }; +enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 }; +enum class Triangle { kUpper = 121, kLower = 122 }; +enum class Diagonal { kNonUnit = 131, kUnit = 132 }; +enum class Side { kLeft = 141, kRight = 142 }; + +// Precision scoped enum (values in bits) +enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, + kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 }; + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +template +StatusCode Rotg(CUdeviceptr sa_buffer, const size_t sa_offset, + CUdeviceptr sb_buffer, const size_t sb_offset, + CUdeviceptr sc_buffer, const size_t sc_offset, + CUdeviceptr ss_buffer, const size_t ss_offset, + CUstream* stream); + +// Generate modified givens plane rotation: SROTMG/DROTMG +template +StatusCode Rotmg(CUdeviceptr sd1_buffer, const size_t sd1_offset, + CUdeviceptr sd2_buffer, const size_t sd2_offset, + CUdeviceptr sx1_buffer, const size_t sx1_offset, + const CUdeviceptr sy1_buffer, const size_t sy1_offset, + CUdeviceptr sparam_buffer, const size_t sparam_offset, + CUstream* stream); + +// Apply givens plane rotation: SROT/DROT +template +StatusCode Rot(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const T cos, + const T sin, + CUstream* stream); + +// Apply modified givens plane rotation: SROTM/DROTM +template +StatusCode Rotm(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr sparam_buffer, const size_t sparam_offset, + CUstream* stream); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +template +StatusCode Swap(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +template +StatusCode Scal(const size_t n, + const T alpha, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +template +StatusCode Copy(const size_t n, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +template +StatusCode Axpy(const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Dot product of two vectors: SDOT/DDOT/HDOT +template +StatusCode Dot(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template +StatusCode Dotu(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template +StatusCode Dotc(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +template +StatusCode Nrm2(const size_t n, + CUdeviceptr nrm2_buffer, const size_t nrm2_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +template +StatusCode Asum(const size_t n, + CUdeviceptr asum_buffer, const size_t asum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +template +StatusCode Sum(const size_t n, + CUdeviceptr sum_buffer, const size_t sum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template +StatusCode Amax(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template +StatusCode Amin(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +template +StatusCode Max(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +template +StatusCode Min(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +template +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +template +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +template +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +template +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +template +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +template +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +template +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +template +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +template +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr ap_buffer, const size_t ap_offset, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template +StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template +StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr ap_buffer, const size_t ap_offset, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// General rank-1 matrix update: SGER/DGER/HGER +template +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// General rank-1 complex matrix update: CGERU/ZGERU +template +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Hermitian rank-1 matrix update: CHER/ZHER +template +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +template +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +template +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +template +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +template +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +template +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +template +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +template +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +template +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +template +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +template +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +template +StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const CUdeviceptr im_buffer, const size_t im_offset, + CUdeviceptr col_buffer, const size_t col_offset, + CUstream* stream); + +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc, + CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + CUstream* stream); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +template +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + CUstream* stream); + +// ================================================================================================= + +// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on +// for the same device. This cache can be cleared to free up system memory or in case of debugging. +StatusCode PUBLIC_API ClearCache(); + +// The cache can also be pre-initialized for a specific device with all possible CLBLast kernels. +// Further CLBlast routine calls will then run at maximum speed. +StatusCode PUBLIC_API FillCache(const CUdevice device); + +// ================================================================================================= + +// Overrides tuning parameters for a specific device-precision-kernel combination. The next time +// the target routine is called it will re-compile and use the new parameters from then on. +StatusCode PUBLIC_API OverrideParameters(const CUdevice device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map ¶meters); + +// ================================================================================================= + +} // namespace clblast + +// CLBLAST_CLBLAST_CUDA_H_ +#endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 0d34d7fe..520e3fc8 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -12,6 +12,8 @@ # clblast.cpp # clblast_c.h # clblast_c.cpp +# clblast_cuda.h +# clblast_cuda.cpp # clblast_netlib_c.h # clblast_netlib_c.cpp # wrapper_clblas.h @@ -41,9 +43,11 @@ FILES = [ "/test/wrapper_cublas.hpp", "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", + "/include/clblast_cuda.h", + "/src/clblast_cuda.cpp", ] -HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32] -FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2] +HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32, 94, 21] +FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2, 25, 3] HEADER_LINES_DOC = 0 FOOTER_LINES_DOC = 63 @@ -224,6 +228,10 @@ def main(argv): if i == 8: if not routine.batched: body += cpp.clblast_netlib_c_cc(routine) + if i == 9: + body += cpp.clblast_h(routine, cuda=True) + if i == 10: + body += cpp.clblast_cc(routine, cuda=True) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 5fef3083..f1ee1959 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -36,19 +36,19 @@ HEADER = NL + SEPARATOR + """ """ + SEPARATOR + NL -def clblast_h(routine): +def clblast_h(routine, cuda=False): """The C++ API header (.h)""" result = NL + "// " + routine.description + ": " + routine.short_names() + NL - result += routine.routine_header_cpp(12, " = nullptr") + ";" + NL + result += routine.routine_header_cpp(12, " = nullptr", cuda) + ";" + NL return result -def clblast_cc(routine): +def clblast_cc(routine, cuda=False): """The C++ API implementation (.cpp)""" indent1 = " " * (15 + routine.length()) result = NL + "// " + routine.description + ": " + routine.short_names() + NL if routine.implemented: - result += routine.routine_header_cpp(12, "") + " {" + NL + result += routine.routine_header_cpp(12, "", cuda) + " {" + NL result += " try {" + NL result += " auto queue_cpp = Queue(*queue);" + NL result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL @@ -60,14 +60,22 @@ def clblast_cc(routine): result += " return StatusCode::kSuccess;" + NL result += " } catch (...) { return DispatchException(); }" + NL else: - result += routine.routine_header_type_cpp(12) + " {" + NL + result += routine.routine_header_type_cpp(12, cuda) + " {" + NL result += " return StatusCode::kNotImplemented;" + NL result += "}" + NL for flavour in routine.flavours: indent2 = " " * (34 + routine.length() + len(flavour.template)) result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">(" - result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)]) - result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL + arguments = routine.arguments_type(flavour) + if cuda: + arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments] + result += ("," + NL + indent2).join([a for a in arguments]) + result += "," + NL + indent2 + if cuda: + result += "CUstream*" + else: + result += "cl_command_queue*, cl_event*" + result += ");" + NL return result diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index cef7db87..c3c1f775 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -802,22 +802,38 @@ class Routine: """Retrieves a list of routine requirements for documentation""" return self.requirements - def routine_header_cpp(self, spaces, default_event): + def routine_header_cpp(self, spaces, default_event, cuda=False): """Retrieves the C++ templated definition for a routine""" indent = " " * (spaces + self.length()) + arguments = self.arguments_def(self.template) + if cuda: + arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments] result = "template <" + self.template.name + ">\n" result += "StatusCode " + self.capitalized_name() + "(" - result += (",\n" + indent).join([a for a in self.arguments_def(self.template)]) - result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")" + result += (",\n" + indent).join([a for a in arguments]) + result += ",\n" + indent + if cuda: + result += "CUstream* stream" + else: + result += "cl_command_queue* queue, cl_event* event" + default_event + result += ")" return result - def routine_header_type_cpp(self, spaces): + def routine_header_type_cpp(self, spaces, cuda=False): """As above, but now without variable names""" indent = " " * (spaces + self.length()) + arguments = self.arguments_type(self.template) + if cuda: + arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments] result = "template <" + self.template.name + ">\n" result += "StatusCode " + self.capitalized_name() + "(" - result += (",\n" + indent).join([a for a in self.arguments_type(self.template)]) - result += ",\n" + indent + "cl_command_queue*, cl_event*)" + result += (",\n" + indent).join([a for a in arguments]) + result += ",\n" + indent + if cuda: + result += "CUstream* stream" + else: + result += "cl_command_queue*, cl_event*" + result += ")" return result def routine_header_c(self, flavour, spaces, extra_qualifier): diff --git a/src/api_common.cpp b/src/api_common.cpp index aa7e2b0f..0d387cd9 100644 --- a/src/api_common.cpp +++ b/src/api_common.cpp @@ -12,9 +12,9 @@ #include +#include "utilities/utilities.hpp" #include "cache.hpp" #include "routines/routines.hpp" -#include "clblast.h" namespace clblast { // ================================================================================================= diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp new file mode 100644 index 00000000..5f30d023 --- /dev/null +++ b/src/clblast_cuda.cpp @@ -0,0 +1,2336 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements all the BLAS API calls (CUDA version). In all cases, it does not much more +// than creating a new object of the appropriate type, and calling the main routine on that object. +// It forwards all status codes to the caller. +// +// ================================================================================================= + +#include + +#include "routines/routines.hpp" +#include "clblast_cuda.h" + +namespace clblast { + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +template +StatusCode Rotg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Rotg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Generate modified givens plane rotation: SROTMG/DROTMG +template +StatusCode Rotmg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotmg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Rotmg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Apply givens plane rotation: SROT/DROT +template +StatusCode Rot(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const T, + const T, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rot(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const float, + const float, + CUstream*); +template StatusCode PUBLIC_API Rot(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const double, + const double, + CUstream*); + +// Apply modified givens plane rotation: SROTM/DROTM +template +StatusCode Rotm(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotm(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Rotm(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +template +StatusCode Swap(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xswap(queue_cpp, event); + routine.DoSwap(n, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Swap(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +template +StatusCode Scal(const size_t n, + const T alpha, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xscal(queue_cpp, event); + routine.DoScal(n, + alpha, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Scal(const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal(const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal(const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal(const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal(const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +template +StatusCode Copy(const size_t n, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xcopy(queue_cpp, event); + routine.DoCopy(n, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Copy(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +template +StatusCode Axpy(const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xaxpy(queue_cpp, event); + routine.DoAxpy(n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Axpy(const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy(const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy(const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy(const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy(const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Dot product of two vectors: SDOT/DDOT/HDOT +template +StatusCode Dot(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdot(queue_cpp, event); + routine.DoDot(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Dot(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dot(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dot(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template +StatusCode Dotu(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdotu(queue_cpp, event); + routine.DoDotu(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Dotu(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dotu(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template +StatusCode Dotc(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdotc(queue_cpp, event); + routine.DoDotc(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Dotc(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dotc(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +template +StatusCode Nrm2(const size_t n, + CUdeviceptr nrm2_buffer, const size_t nrm2_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xnrm2(queue_cpp, event); + routine.DoNrm2(n, + Buffer(nrm2_buffer), nrm2_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Nrm2(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +template +StatusCode Asum(const size_t n, + CUdeviceptr asum_buffer, const size_t asum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xasum(queue_cpp, event); + routine.DoAsum(n, + Buffer(asum_buffer), asum_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Asum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +template +StatusCode Sum(const size_t n, + CUdeviceptr sum_buffer, const size_t sum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsum(queue_cpp, event); + routine.DoSum(n, + Buffer(sum_buffer), sum_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Sum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template +StatusCode Amax(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xamax(queue_cpp, event); + routine.DoAmax(n, + Buffer(imax_buffer), imax_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Amax(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template +StatusCode Amin(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xamin(queue_cpp, event); + routine.DoAmin(n, + Buffer(imin_buffer), imin_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Amin(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +template +StatusCode Max(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xmax(queue_cpp, event); + routine.DoMax(n, + Buffer(imax_buffer), imax_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Max(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +template +StatusCode Min(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xmin(queue_cpp, event); + routine.DoMin(n, + Buffer(imin_buffer), imin_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Min(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +template +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgemv(queue_cpp, event); + routine.DoGemv(layout, a_transpose, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +template +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgbmv(queue_cpp, event); + routine.DoGbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +template +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhemv(queue_cpp, event); + routine.DoHemv(layout, triangle, + n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhbmv(queue_cpp, event); + routine.DoHbmv(layout, triangle, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpmv(queue_cpp, event); + routine.DoHpmv(layout, triangle, + n, + alpha, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +template +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsymv(queue_cpp, event); + routine.DoSymv(layout, triangle, + n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +template +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsbmv(queue_cpp, event); + routine.DoSbmv(layout, triangle, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +template +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspmv(queue_cpp, event); + routine.DoSpmv(layout, triangle, + n, + alpha, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +template +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrmv(queue_cpp, event); + routine.DoTrmv(layout, triangle, a_transpose, diagonal, + n, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +template +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtbmv(queue_cpp, event); + routine.DoTbmv(layout, triangle, a_transpose, diagonal, + n, k, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +template +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr ap_buffer, const size_t ap_offset, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtpmv(queue_cpp, event); + routine.DoTpmv(layout, triangle, a_transpose, diagonal, + n, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsv(queue_cpp, event); + routine.DoTrsv(layout, triangle, a_transpose, diagonal, + n, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template +StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template +StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General rank-1 matrix update: SGER/DGER/HGER +template +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xger(queue_cpp, event); + routine.DoGer(layout, + m, n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General rank-1 complex matrix update: CGERU/ZGERU +template +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgeru(queue_cpp, event); + routine.DoGeru(layout, + m, n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Geru(const Layout, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Geru(const Layout, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgerc(queue_cpp, event); + routine.DoGerc(layout, + m, n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gerc(const Layout, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gerc(const Layout, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian rank-1 matrix update: CHER/ZHER +template +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher,T>(queue_cpp, event); + routine.DoHer(layout, triangle, + n, + alpha, + Buffer>(x_buffer), x_offset, x_inc, + Buffer>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Her(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Her(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpr,T>(queue_cpp, event); + routine.DoHpr(layout, triangle, + n, + alpha, + Buffer>(x_buffer), x_offset, x_inc, + Buffer>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher2(queue_cpp, event); + routine.DoHer2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Her2(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Her2(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpr2(queue_cpp, event); + routine.DoHpr2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +template +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr(queue_cpp, event); + routine.DoSyr(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +template +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspr(queue_cpp, event); + routine.DoSpr(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +template +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr2(queue_cpp, event); + routine.DoSyr2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +template +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspr2(queue_cpp, event); + routine.DoSpr2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +template +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgemm(queue_cpp, event); + routine.DoGemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +template +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsymm(queue_cpp, event); + routine.DoSymm(layout, side, triangle, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +template +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhemm(queue_cpp, event); + routine.DoHemm(layout, side, triangle, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyrk(queue_cpp, event); + routine.DoSyrk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +template +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xherk,T>(queue_cpp, event); + routine.DoHerk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer>(a_buffer), a_offset, a_ld, + beta, + Buffer>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr2k(queue_cpp, event); + routine.DoSyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +template +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher2k(queue_cpp, event); + routine.DoHer2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +template +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrmm(queue_cpp, event); + routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsm(queue_cpp, event); + routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xomatcopy(queue_cpp, event); + routine.DoOmatcopy(layout, a_transpose, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +template +StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const CUdeviceptr im_buffer, const size_t im_offset, + CUdeviceptr col_buffer, const size_t col_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xim2col(queue_cpp, event); + routine.DoIm2col(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + Buffer(im_buffer), im_offset, + Buffer(col_buffer), col_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc, + CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = XaxpyBatched(queue_cpp, event); + auto alphas_cpp = std::vector(); + auto x_offsets_cpp = std::vector(); + auto y_offsets_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + x_offsets_cpp.push_back(x_offsets[batch]); + y_offsets_cpp.push_back(y_offsets[batch]); + } + routine.DoAxpyBatched(n, + alphas_cpp, + Buffer(x_buffer), x_offsets_cpp, x_inc, + Buffer(y_buffer), y_offsets_cpp, y_inc, + batch_count); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const float*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const double*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const float2*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const double2*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const half*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +template +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = XgemmBatched(queue_cpp, event); + auto alphas_cpp = std::vector(); + auto betas_cpp = std::vector(); + auto a_offsets_cpp = std::vector(); + auto b_offsets_cpp = std::vector(); + auto c_offsets_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + betas_cpp.push_back(betas[batch]); + a_offsets_cpp.push_back(a_offsets[batch]); + b_offsets_cpp.push_back(b_offsets[batch]); + c_offsets_cpp.push_back(c_offsets[batch]); + } + routine.DoGemmBatched(layout, a_transpose, b_transpose, + m, n, k, + alphas_cpp, + Buffer(a_buffer), a_offsets_cpp, a_ld, + Buffer(b_buffer), b_offsets_cpp, b_ld, + betas_cpp, + Buffer(c_buffer), c_offsets_cpp, c_ld, + batch_count); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const float*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const double*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const float2*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const double2*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const half*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const half*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); + +// ================================================================================================= +} // namespace clblast diff --git a/src/cupp11.hpp b/src/cupp11.hpp new file mode 100644 index 00000000..988366ea --- /dev/null +++ b/src/cupp11.hpp @@ -0,0 +1,770 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API +// calls. The main benefits are increased abstraction, automatic memory management, and portability. +// Portability here means that a similar header exists for CUDA with the same classes and +// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. +// +// This file is taken from the CLCudaAPI project and +// therefore contains the following header copyright notice: +// +// ================================================================================================= +// +// Copyright 2015 SURFsara +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// ================================================================================================= + +#ifndef CLBLAST_CUPP11_H_ +#define CLBLAST_CUPP11_H_ + +// C++ +#include // std::copy +#include // std::string +#include // std::vector +#include // std::shared_ptr + +// CUDA +#include // CUDA driver API +#include // NVIDIA runtime compilation API + +// Exception classes +#include "cxpp11_common.hpp" + +namespace clblast { +// ================================================================================================= + +// Max-length of strings +constexpr auto kStringLength = 256; + +// ================================================================================================= + +// Represents a runtime error returned by a CUDA driver API function +class CLCudaAPIError : public ErrorCode { +public: + explicit CLCudaAPIError(CUresult status, const std::string &where): + ErrorCode(status, where, "CUDA error: " + where + ": " + + GetErrorName(status) + " --> " + GetErrorString(status)) { + } + + static void Check(const CUresult status, const std::string &where) { + if (status != CUDA_SUCCESS) { + throw CLCudaAPIError(status, where); + } + } + + static void CheckDtor(const CUresult status, const std::string &where) { + if (status != CUDA_SUCCESS) { + fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what()); + } + } + +private: + std::string GetErrorName(CUresult status) const { + const char* status_code; + cuGetErrorName(status, &status_code); + return std::string(status_code); + } + std::string GetErrorString(CUresult status) const { + const char* status_string; + cuGetErrorString(status, &status_string); + return std::string(status_string); + } +}; + +// Represents a runtime error returned by a CUDA runtime compilation API function +class CLCudaAPINVRTCError : public ErrorCode { +public: + explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where): + ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) { + } + + static void Check(const nvrtcResult status, const std::string &where) { + if (status != NVRTC_SUCCESS) { + throw CLCudaAPINVRTCError(status, where); + } + } + + static void CheckDtor(const nvrtcResult status, const std::string &where) { + if (status != NVRTC_SUCCESS) { + fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what()); + } + } + +private: + std::string GetErrorString(nvrtcResult status) const { + const char* status_string = nvrtcGetErrorString(status); + return std::string(status_string); + } +}; + +// Exception returned when building a program +using CLCudaAPIBuildError = CLCudaAPINVRTCError; + +// ================================================================================================= + +// Error occurred in CUDA driver or runtime compilation API +#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call)) +#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call)) + +// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors) +#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call)) +#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call)) + +// ================================================================================================= + +// C++11 version of two 'CUevent' pointers +class Event { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Regular constructor with memory management + explicit Event(): + start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }), + end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }) { + CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT)); + CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT)); + } + + // Waits for completion of this event (not implemented for CUDA) + void WaitForCompletion() const { } + + // Retrieves the elapsed time of the last recorded event + float GetElapsedTime() const { + auto result = 0.0f; + cuEventElapsedTime(&result, *start_, *end_); + return result; + } + + // Accessors to the private data-members + const CUevent& start() const { return *start_; } + const CUevent& end() const { return *end_; } + Event* pointer() { return this; } +private: + std::shared_ptr start_; + std::shared_ptr end_; +}; + +// Pointer to a CUDA event +using EventPointer = Event*; + +// ================================================================================================= + +// Raw platform ID type +using RawPlatformID = size_t; + +// The CUDA platform: initializes the CUDA driver API +class Platform { +public: + + // Initializes the platform. Note that the platform ID variable is not actually used for CUDA. + explicit Platform(const size_t platform_id) : platform_id_(0) { + if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); } + CheckError(cuInit(0)); + } + + // Methods to retrieve platform information + std::string Name() const { return "CUDA"; } + std::string Vendor() const { return "NVIDIA Corporation"; } + std::string Version() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return "CUDA driver "+std::to_string(result); + } + + // Returns the number of devices on this platform + size_t NumDevices() const { + auto result = 0; + CheckError(cuDeviceGetCount(&result)); + return static_cast(result); + } + + // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0) + const RawPlatformID& operator()() const { return platform_id_; } +private: + const size_t platform_id_; +}; + +// Retrieves a vector with all platforms. Note that there is just one platform in CUDA. +inline std::vector GetAllPlatforms() { + auto all_platforms = std::vector{ Platform(size_t{0}) }; + return all_platforms; +} + +// ================================================================================================= + +// Raw device ID type +using RawDeviceID = CUdevice; + +// C++11 version of 'CUdevice' +class Device { +public: + + // Constructor based on the regular CUDA data-type + explicit Device(const CUdevice device): device_(device) { } + + // Initialization + explicit Device(const Platform &platform, const size_t device_id) { + auto num_devices = platform.NumDevices(); + if (num_devices == 0) { + throw RuntimeError("Device: no devices found"); + } + if (device_id >= num_devices) { + throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); + } + + CheckError(cuDeviceGet(&device_, device_id)); + } + + // Methods to retrieve device information + RawPlatformID PlatformID() const { return 0; } + std::string Version() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return "CUDA driver "+std::to_string(result); + } + size_t VersionNumber() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return static_cast(result); + } + std::string Vendor() const { return "NVIDIA Corporation"; } + std::string Name() const { + auto result = std::string{}; + result.resize(kStringLength); + CheckError(cuDeviceGetName(&result[0], result.size(), device_)); + return result; + } + std::string Type() const { return "GPU"; } + size_t MaxWorkGroupSize() const {return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); } + size_t MaxWorkItemDimensions() const { return size_t{3}; } + std::vector MaxWorkItemSizes() const { + return std::vector{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X), + GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y), + GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)}; + } + unsigned long LocalMemSize() const { + return static_cast(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)); + } + + std::string Capabilities() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + return "SM"+std::to_string(major)+"."+std::to_string(minor); + } + bool HasExtension(const std::string &extension) const { return false; } + bool SupportsFP64() const { return true; } + bool SupportsFP16() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + if (major > 5) { return true; } // SM 6.x, 7.x and higher + if (major == 5 && minor == 3) { return true; } // SM 5.3 + return false; + } + + size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); } + size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); } + unsigned long MemorySize() const { + auto result = size_t{0}; + CheckError(cuDeviceTotalMem(&result, device_)); + return static_cast(result); + } + unsigned long MaxAllocSize() const { return MemorySize(); } + size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); } + size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); } + + // Configuration-validity checks + bool IsLocalMemoryValid(const size_t local_mem_usage) const { + return (local_mem_usage <= LocalMemSize()); + } + bool IsThreadConfigValid(const std::vector &local) const { + auto local_size = size_t{1}; + for (const auto &item: local) { local_size *= item; } + for (auto i=size_t{0}; i MaxWorkItemSizes()[i]) { return false; } + } + if (local_size > MaxWorkGroupSize()) { return false; } + if (local.size() > MaxWorkItemDimensions()) { return false; } + return true; + } + + // Query for a specific type of device or brand + bool IsCPU() const { return false; } + bool IsGPU() const { return true; } + bool IsAMD() const { return false; } + bool IsNVIDIA() const { return true; } + bool IsIntel() const { return false; } + bool IsARM() const { return false; } + + // Platform specific extensions + std::string AMDBoardName() const { return ""; } + std::string NVIDIAComputeCapability() const { return Capabilities(); } + + // Accessor to the private data-member + const RawDeviceID& operator()() const { return device_; } +private: + CUdevice device_; + + // Private helper function + size_t GetInfo(const CUdevice_attribute info) const { + auto result = 0; + CheckError(cuDeviceGetAttribute(&result, info, device_)); + return static_cast(result); + } +}; + +// ================================================================================================= + +// Raw context type +using RawContext = CUcontext; + +// C++11 version of 'CUcontext' +class Context { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Context(const CUcontext context): + context_(new CUcontext) { + *context_ = context; + } + + // Regular constructor with memory management + explicit Context(const Device &device): + context_(new CUcontext, [](CUcontext* c) { + if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); } + delete c; + }) { + CheckError(cuCtxCreate(context_.get(), 0, device())); + } + + // Accessor to the private data-member + const RawContext& operator()() const { return *context_; } + RawContext* pointer() const { return &(*context_); } +private: + std::shared_ptr context_; +}; + +// Pointer to a raw CUDA context +using ContextPointer = CUcontext*; + +// ================================================================================================= + +// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code. +class Program { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Source-based constructor with memory management + explicit Program(const Context &, std::string source): + program_(new nvrtcProgram, [](nvrtcProgram* p) { + if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); } + delete p; + }), + source_(std::move(source)), + from_binary_(false) { + const auto source_ptr = &source_[0]; + CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr)); + } + + // PTX-based constructor + explicit Program(const Device &device, const Context &context, const std::string &binary): + program_(nullptr), // not used + source_(binary), + from_binary_(true) { + } + + // Compiles the device program and checks whether or not there are any warnings/errors + void Build(const Device &, std::vector &options) { + if (from_binary_) { return; } + auto raw_options = std::vector(); + for (const auto &option: options) { + raw_options.push_back(option.c_str()); + } + auto status = nvrtcCompileProgram(*program_, raw_options.size(), raw_options.data()); + CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram"); + } + + // Confirms whether a certain status code is an actual compilation error or warning + bool StatusIsCompilationWarningOrError(const nvrtcResult status) const { + return (status == NVRTC_ERROR_INVALID_INPUT); + } + + // Retrieves the warning/error message from the compiler (if any) + std::string GetBuildInfo(const Device &) const { + if (from_binary_) { return std::string{}; } + auto bytes = size_t{0}; + CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0])); + return result; + } + + // Retrieves an intermediate representation of the compiled program (i.e. PTX) + std::string GetIR() const { + if (from_binary_) { return source_; } // holds the PTX + auto bytes = size_t{0}; + CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0])); + return result; + } + + // Accessor to the private data-member + const nvrtcProgram& operator()() const { return *program_; } +private: + std::shared_ptr program_; + const std::string source_; + const bool from_binary_; +}; + +// ================================================================================================= + +// Raw command-queue type +using RawCommandQueue = CUstream; + +// C++11 version of 'CUstream' +class Queue { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Regular constructor with memory management + explicit Queue(const Context &context, const Device &device): + queue_(new CUstream, [](CUstream* s) { + if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); } + delete s; + }), + context_(context), + device_(device) { + CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING)); + } + + // Synchronizes the queue and optionally also an event + void Finish(Event &event) const { + CheckError(cuEventSynchronize(event.end())); + Finish(); + } + void Finish() const { + CheckError(cuStreamSynchronize(*queue_)); + } + + // Retrieves the corresponding context or device + Context GetContext() const { return context_; } + Device GetDevice() const { return device_; } + + // Accessor to the private data-member + const RawCommandQueue& operator()() const { return *queue_; } +private: + std::shared_ptr queue_; + const Context context_; + const Device device_; +}; + +// ================================================================================================= + +// C++11 version of page-locked host memory +template +class BufferHost { +public: + + // Regular constructor with memory management + explicit BufferHost(const Context &, const size_t size): + buffer_(new void*, [](void** m) { CheckError(cuMemFreeHost(*m)); delete m; }), + size_(size) { + CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T))); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + return size_*sizeof(T); + } + + // Compatibility with std::vector + size_t size() const { return size_; } + T* begin() { return &static_cast(*buffer_)[0]; } + T* end() { return &static_cast(*buffer_)[size_-1]; } + T& operator[](const size_t i) { return static_cast(*buffer_)[i]; } + T* data() { return static_cast(*buffer_); } + const T* data() const { return static_cast(*buffer_); } + +private: + std::shared_ptr buffer_; + const size_t size_; +}; + +// ================================================================================================= + +// Enumeration of buffer access types +enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; + +// C++11 version of 'CUdeviceptr' +template +class Buffer { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Buffer(const CUdeviceptr buffer): + buffer_(new CUdeviceptr), + access_(BufferAccess::kNotOwned) { + *buffer_ = buffer; + } + + // Regular constructor with memory management. If this class does not own the buffer object, then + // the memory will not be freed automatically afterwards. + explicit Buffer(const Context &, const BufferAccess access, const size_t size): + buffer_(new CUdeviceptr, [access](CUdeviceptr* m) { + if (access != BufferAccess::kNotOwned) { CheckError(cuMemFree(*m)); } + delete m; + }), + access_(access) { + CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T))); + } + + // As above, but now with read/write access as a default + explicit Buffer(const Context &context, const size_t size): + Buffer(context, BufferAccess::kReadWrite, size) { + } + + // Constructs a new buffer based on an existing host-container + template + explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): + Buffer(context, BufferAccess::kReadWrite, static_cast(end - start)) { + auto size = static_cast(end - start); + auto pointer = &*start; + CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue())); + queue.Finish(); + } + + // Copies from device to host: reading the device buffer a-synchronously + void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + if (access_ == BufferAccess::kWriteOnly) { + throw LogicError("Buffer: reading from a write-only buffer"); + } + CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue())); + } + void ReadAsync(const Queue &queue, const size_t size, std::vector &host, + const size_t offset = 0) const { + if (host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } + ReadAsync(queue, size, host.data(), offset); + } + void ReadAsync(const Queue &queue, const size_t size, BufferHost &host, + const size_t offset = 0) const { + if (host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } + ReadAsync(queue, size, host.data(), offset); + } + + // Copies from device to host: reading the device buffer + void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + ReadAsync(queue, size, host, offset); + queue.Finish(); + } + void Read(const Queue &queue, const size_t size, std::vector &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + void Read(const Queue &queue, const size_t size, BufferHost &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer a-synchronously + void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + if (access_ == BufferAccess::kReadOnly) { + throw LogicError("Buffer: writing to a read-only buffer"); + } + if (GetSize() < (offset+size)*sizeof(T)) { + throw LogicError("Buffer: target device buffer is too small"); + } + CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue())); + } + void WriteAsync(const Queue &queue, const size_t size, const std::vector &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer + void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + WriteAsync(queue, size, host, offset); + queue.Finish(); + } + void Write(const Queue &queue, const size_t size, const std::vector &host, + const size_t offset = 0) { + Write(queue, size, host.data(), offset); + } + void Write(const Queue &queue, const size_t size, const BufferHost &host, + const size_t offset = 0) { + Write(queue, size, host.data(), offset); + } + + // Copies the contents of this buffer into another device buffer + void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) const { + CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue())); + } + void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const { + CopyToAsync(queue, size, destination); + queue.Finish(); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + auto result = size_t{0}; + CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_)); + return result; + } + + // Accessors to the private data-members + CUdeviceptr operator()() const { return *buffer_; } + CUdeviceptr& operator()() { return *buffer_; } +private: + std::shared_ptr buffer_; + const BufferAccess access_; +}; + +// ================================================================================================= + +// C++11 version of 'CUfunction' +class Kernel { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Kernel(const CUmodule module, const CUfunction kernel): + module_(module), + kernel_(kernel) { + } + + // Regular constructor with memory management + explicit Kernel(const Program &program, const std::string &name) { + CheckError(cuModuleLoadDataEx(&module_, program.GetIR().data(), 0, nullptr, nullptr)); + CheckError(cuModuleGetFunction(&kernel_, module_, name.c_str())); + } + + // Sets a kernel argument at the indicated position. This stores both the value of the argument + // (as raw bytes) and the index indicating where this value can be found. + template + void SetArgument(const size_t index, const T &value) { + if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); } + arguments_indices_[index] = arguments_data_.size(); + for (auto j=size_t(0); j(&value)[j]); + } + } + template + void SetArgument(const size_t index, Buffer &value) { + SetArgument(index, value()); + } + + // Sets all arguments in one go using parameter packs. Note that this resets all previously set + // arguments using 'SetArgument' or 'SetArguments'. + template + void SetArguments(Args&... args) { + arguments_indices_.clear(); + arguments_data_.clear(); + SetArgumentsRecursive(0, args...); + } + + // Retrieves the amount of local memory used per work-group for this kernel. Note that this the + // shared memory in CUDA terminology. + unsigned long LocalMemUsage(const Device &) const { + auto result = 0; + CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_)); + return static_cast(result); + } + + // Retrieves the name of the kernel + std::string GetFunctionName() const { + return std::string{"unknown"}; // Not implemented for the CUDA backend + } + + // Launches a kernel onto the specified queue + void Launch(const Queue &queue, const std::vector &global, + const std::vector &local, EventPointer event) { + + // Creates the grid (number of threadblocks) and sets the block sizes (threads per block) + auto grid = std::vector{1, 1, 1}; + auto block = std::vector{1, 1, 1}; + if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); } + for (auto i=size_t{0}; i pointers; + for (auto &index: arguments_indices_) { + pointers.push_back(&arguments_data_[index]); + } + + // Launches the kernel, its execution time is recorded by events + CheckError(cuEventRecord(event->start(), queue())); + CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2], + 0, queue(), pointers.data(), nullptr)); + CheckError(cuEventRecord(event->end(), queue())); + } + + // As above, but with an event waiting list + // TODO: Implement this function + void Launch(const Queue &queue, const std::vector &global, + const std::vector &local, EventPointer event, + std::vector& waitForEvents) { + if (local.size() == 0) { + throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end"); + } + else if (waitForEvents.size() != 0) { + throw LogicError("Kernel: launching with an event waiting list is not implemented for the CUDA back-end"); + } + else { + return Launch(queue, global, local, event); + } + } + + // Accessors to the private data-members + const CUfunction& operator()() const { return kernel_; } + CUfunction operator()() { return kernel_; } +private: + CUmodule module_; + CUfunction kernel_; + std::vector arguments_indices_; // Indices of the arguments + std::vector arguments_data_; // The arguments data as raw bytes + + // Internal implementation for the recursive SetArguments function. + template + void SetArgumentsRecursive(const size_t index, T &first) { + SetArgument(index, first); + } + template + void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { + SetArgument(index, first); + SetArgumentsRecursive(index+1, args...); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_CUPP11_H_ +#endif diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp index b5693181..a5b6be4b 100644 --- a/src/utilities/buffer_test.hpp +++ b/src/utilities/buffer_test.hpp @@ -15,7 +15,7 @@ #ifndef CLBLAST_BUFFER_TEST_H_ #define CLBLAST_BUFFER_TEST_H_ -#include "clblast.h" +#include "utilities/utilities.hpp namespace clblast { // ================================================================================================= diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index b2949c27..f56226be 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -21,8 +21,13 @@ #include #include -#include "clpp11.hpp" -#include "clblast.h" +#ifdef OPENCL_API + #include "clpp11.hpp" + #include "clblast.h" +#elif CUDA_API + #include "cupp11.hpp" + #include "clblast_cuda.h" +#endif #include "clblast_half.h" #include "utilities/clblast_exceptions.hpp" #include "utilities/msvc.hpp" -- cgit v1.2.3 From cc5b4754250b3c03b9b0f8d72f32d1eacac15b18 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 12 Oct 2017 12:20:43 +0200 Subject: CUDA API now takes context and device in instead of stream --- include/clblast_cuda.h | 112 ++--- scripts/generator/generator/cpp.py | 9 +- scripts/generator/generator/routine.py | 4 +- src/clblast_cuda.cpp | 720 +++++++++++++++++++-------------- src/utilities/buffer_test.hpp | 2 +- 5 files changed, 476 insertions(+), 371 deletions(-) (limited to 'scripts') diff --git a/include/clblast_cuda.h b/include/clblast_cuda.h index c125c302..e28f68e5 100644 --- a/include/clblast_cuda.h +++ b/include/clblast_cuda.h @@ -103,7 +103,7 @@ StatusCode Rotg(CUdeviceptr sa_buffer, const size_t sa_offset, CUdeviceptr sb_buffer, const size_t sb_offset, CUdeviceptr sc_buffer, const size_t sc_offset, CUdeviceptr ss_buffer, const size_t ss_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Generate modified givens plane rotation: SROTMG/DROTMG template @@ -112,7 +112,7 @@ StatusCode Rotmg(CUdeviceptr sd1_buffer, const size_t sd1_offset, CUdeviceptr sx1_buffer, const size_t sx1_offset, const CUdeviceptr sy1_buffer, const size_t sy1_offset, CUdeviceptr sparam_buffer, const size_t sparam_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Apply givens plane rotation: SROT/DROT template @@ -121,7 +121,7 @@ StatusCode Rot(const size_t n, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, const T cos, const T sin, - CUstream* stream); + const CUcontext context, const CUdevice device); // Apply modified givens plane rotation: SROTM/DROTM template @@ -129,28 +129,28 @@ StatusCode Rotm(const size_t n, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr sparam_buffer, const size_t sparam_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template StatusCode Swap(const size_t n, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template StatusCode Scal(const size_t n, const T alpha, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY template StatusCode Copy(const size_t n, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY template @@ -158,7 +158,7 @@ StatusCode Axpy(const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Dot product of two vectors: SDOT/DDOT/HDOT template @@ -166,7 +166,7 @@ StatusCode Dot(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Dot product of two complex vectors: CDOTU/ZDOTU template @@ -174,7 +174,7 @@ StatusCode Dotu(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC template @@ -182,56 +182,56 @@ StatusCode Dotc(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template StatusCode Nrm2(const size_t n, CUdeviceptr nrm2_buffer, const size_t nrm2_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template StatusCode Asum(const size_t n, CUdeviceptr asum_buffer, const size_t asum_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template StatusCode Sum(const size_t n, CUdeviceptr sum_buffer, const size_t sum_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template StatusCode Amax(const size_t n, CUdeviceptr imax_buffer, const size_t imax_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN template StatusCode Amin(const size_t n, CUdeviceptr imin_buffer, const size_t imin_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template StatusCode Max(const size_t n, CUdeviceptr imax_buffer, const size_t imax_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template StatusCode Min(const size_t n, CUdeviceptr imin_buffer, const size_t imin_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -246,7 +246,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV template @@ -257,7 +257,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template @@ -268,7 +268,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template @@ -279,7 +279,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template @@ -290,7 +290,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV template @@ -301,7 +301,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV template @@ -312,7 +312,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV template @@ -323,7 +323,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template @@ -331,7 +331,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template @@ -339,7 +339,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const size_t k, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template @@ -347,7 +347,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr ap_buffer, const size_t ap_offset, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template @@ -355,7 +355,7 @@ StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template @@ -363,7 +363,7 @@ StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const size_t k, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template @@ -371,7 +371,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr ap_buffer, const size_t ap_offset, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // General rank-1 matrix update: SGER/DGER/HGER template @@ -381,7 +381,7 @@ StatusCode Ger(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // General rank-1 complex matrix update: CGERU/ZGERU template @@ -391,7 +391,7 @@ StatusCode Geru(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template @@ -401,7 +401,7 @@ StatusCode Gerc(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian rank-1 matrix update: CHER/ZHER template @@ -410,7 +410,7 @@ StatusCode Her(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian packed rank-1 matrix update: CHPR/ZHPR template @@ -419,7 +419,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian rank-2 matrix update: CHER2/ZHER2 template @@ -429,7 +429,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 template @@ -439,7 +439,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template @@ -448,7 +448,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template @@ -457,7 +457,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template @@ -467,7 +467,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template @@ -477,7 +477,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -492,7 +492,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template @@ -503,7 +503,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template @@ -514,7 +514,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK template @@ -524,7 +524,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Rank-K update of a hermitian matrix: CHERK/ZHERK template @@ -534,7 +534,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K template @@ -545,7 +545,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template @@ -556,7 +556,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const U beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM template @@ -565,7 +565,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template @@ -574,7 +574,7 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -587,14 +587,14 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL template StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const CUdeviceptr im_buffer, const size_t im_offset, CUdeviceptr col_buffer, const size_t col_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED template @@ -603,7 +603,7 @@ StatusCode AxpyBatched(const size_t n, const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc, CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, - CUstream* stream); + const CUcontext context, const CUdevice device); // Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED template @@ -615,7 +615,7 @@ StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const T const T *betas, CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, - CUstream* stream); + const CUcontext context, const CUdevice device); // ================================================================================================= diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index f1ee1959..5413906a 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -50,7 +50,12 @@ def clblast_cc(routine, cuda=False): if routine.implemented: result += routine.routine_header_cpp(12, "", cuda) + " {" + NL result += " try {" + NL - result += " auto queue_cpp = Queue(*queue);" + NL + if cuda: + result += " const auto context_cpp = Context(context);" + NL + result += " const auto device_cpp = Device(device);" + NL + result += " auto queue_cpp = Queue(context_cpp, device_cpp);" + NL + else: + result += " auto queue_cpp = Queue(*queue);" + NL result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL if routine.batched: result += " " + (NL + " ").join(routine.batched_transform_to_cpp()) + NL @@ -72,7 +77,7 @@ def clblast_cc(routine, cuda=False): result += ("," + NL + indent2).join([a for a in arguments]) result += "," + NL + indent2 if cuda: - result += "CUstream*" + result += "const CUcontext, const CUdevice" else: result += "cl_command_queue*, cl_event*" result += ");" + NL diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index c3c1f775..b6b55821 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -813,7 +813,7 @@ class Routine: result += (",\n" + indent).join([a for a in arguments]) result += ",\n" + indent if cuda: - result += "CUstream* stream" + result += "const CUcontext context, const CUdevice device" else: result += "cl_command_queue* queue, cl_event* event" + default_event result += ")" @@ -830,7 +830,7 @@ class Routine: result += (",\n" + indent).join([a for a in arguments]) result += ",\n" + indent if cuda: - result += "CUstream* stream" + result += "const CUcontext, const CUdevice" else: result += "cl_command_queue*, cl_event*" result += ")" diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp index 5f30d023..f9a24236 100644 --- a/src/clblast_cuda.cpp +++ b/src/clblast_cuda.cpp @@ -30,19 +30,19 @@ StatusCode Rotg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rotg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Rotg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Generate modified givens plane rotation: SROTMG/DROTMG template @@ -51,7 +51,7 @@ StatusCode Rotmg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rotmg(CUdeviceptr, const size_t, @@ -59,13 +59,13 @@ template StatusCode PUBLIC_API Rotmg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Rotmg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Apply givens plane rotation: SROT/DROT template @@ -74,7 +74,7 @@ StatusCode Rot(const size_t, CUdeviceptr, const size_t, const size_t, const T, const T, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rot(const size_t, @@ -82,13 +82,13 @@ template StatusCode PUBLIC_API Rot(const size_t, CUdeviceptr, const size_t, const size_t, const float, const float, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Rot(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const double, const double, - CUstream*); + const CUcontext, const CUdevice); // Apply modified givens plane rotation: SROTM/DROTM template @@ -96,28 +96,30 @@ StatusCode Rotm(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rotm(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Rotm(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template StatusCode Swap(const size_t n, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xswap(queue_cpp, event); routine.DoSwap(n, Buffer(x_buffer), x_offset, x_inc, @@ -128,32 +130,34 @@ StatusCode Swap(const size_t n, template StatusCode PUBLIC_API Swap(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Swap(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Swap(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Swap(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Swap(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template StatusCode Scal(const size_t n, const T alpha, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xscal(queue_cpp, event); routine.DoScal(n, alpha, @@ -164,32 +168,34 @@ StatusCode Scal(const size_t n, template StatusCode PUBLIC_API Scal(const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Scal(const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Scal(const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Scal(const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Scal(const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY template StatusCode Copy(const size_t n, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xcopy(queue_cpp, event); routine.DoCopy(n, Buffer(x_buffer), x_offset, x_inc, @@ -200,23 +206,23 @@ StatusCode Copy(const size_t n, template StatusCode PUBLIC_API Copy(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Copy(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Copy(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Copy(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Copy(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY template @@ -224,9 +230,11 @@ StatusCode Axpy(const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xaxpy(queue_cpp, event); routine.DoAxpy(n, alpha, @@ -239,27 +247,27 @@ template StatusCode PUBLIC_API Axpy(const size_t, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Axpy(const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Axpy(const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Axpy(const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Axpy(const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Dot product of two vectors: SDOT/DDOT/HDOT template @@ -267,9 +275,11 @@ StatusCode Dot(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xdot(queue_cpp, event); routine.DoDot(n, Buffer(dot_buffer), dot_offset, @@ -282,17 +292,17 @@ template StatusCode PUBLIC_API Dot(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Dot(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Dot(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Dot product of two complex vectors: CDOTU/ZDOTU template @@ -300,9 +310,11 @@ StatusCode Dotu(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xdotu(queue_cpp, event); routine.DoDotu(n, Buffer(dot_buffer), dot_offset, @@ -315,12 +327,12 @@ template StatusCode PUBLIC_API Dotu(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Dotu(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC template @@ -328,9 +340,11 @@ StatusCode Dotc(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xdotc(queue_cpp, event); routine.DoDotc(n, Buffer(dot_buffer), dot_offset, @@ -343,21 +357,23 @@ template StatusCode PUBLIC_API Dotc(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Dotc(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template StatusCode Nrm2(const size_t n, CUdeviceptr nrm2_buffer, const size_t nrm2_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xnrm2(queue_cpp, event); routine.DoNrm2(n, Buffer(nrm2_buffer), nrm2_offset, @@ -368,32 +384,34 @@ StatusCode Nrm2(const size_t n, template StatusCode PUBLIC_API Nrm2(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Nrm2(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Nrm2(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Nrm2(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Nrm2(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template StatusCode Asum(const size_t n, CUdeviceptr asum_buffer, const size_t asum_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xasum(queue_cpp, event); routine.DoAsum(n, Buffer(asum_buffer), asum_offset, @@ -404,32 +422,34 @@ StatusCode Asum(const size_t n, template StatusCode PUBLIC_API Asum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Asum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Asum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Asum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Asum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template StatusCode Sum(const size_t n, CUdeviceptr sum_buffer, const size_t sum_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsum(queue_cpp, event); routine.DoSum(n, Buffer(sum_buffer), sum_offset, @@ -440,32 +460,34 @@ StatusCode Sum(const size_t n, template StatusCode PUBLIC_API Sum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template StatusCode Amax(const size_t n, CUdeviceptr imax_buffer, const size_t imax_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xamax(queue_cpp, event); routine.DoAmax(n, Buffer(imax_buffer), imax_offset, @@ -476,32 +498,34 @@ StatusCode Amax(const size_t n, template StatusCode PUBLIC_API Amax(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amax(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amax(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amax(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amax(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN template StatusCode Amin(const size_t n, CUdeviceptr imin_buffer, const size_t imin_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xamin(queue_cpp, event); routine.DoAmin(n, Buffer(imin_buffer), imin_offset, @@ -512,32 +536,34 @@ StatusCode Amin(const size_t n, template StatusCode PUBLIC_API Amin(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amin(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amin(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amin(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amin(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template StatusCode Max(const size_t n, CUdeviceptr imax_buffer, const size_t imax_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xmax(queue_cpp, event); routine.DoMax(n, Buffer(imax_buffer), imax_offset, @@ -548,32 +574,34 @@ StatusCode Max(const size_t n, template StatusCode PUBLIC_API Max(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Max(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Max(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Max(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Max(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template StatusCode Min(const size_t n, CUdeviceptr imin_buffer, const size_t imin_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xmin(queue_cpp, event); routine.DoMin(n, Buffer(imin_buffer), imin_offset, @@ -584,23 +612,23 @@ StatusCode Min(const size_t n, template StatusCode PUBLIC_API Min(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Min(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Min(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Min(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Min(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -615,9 +643,11 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xgemv(queue_cpp, event); routine.DoGemv(layout, a_transpose, m, n, @@ -636,7 +666,7 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const double, @@ -644,7 +674,7 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const float2, @@ -652,7 +682,7 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const double2, @@ -660,7 +690,7 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const half, @@ -668,7 +698,7 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV template @@ -679,9 +709,11 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xgbmv(queue_cpp, event); routine.DoGbmv(layout, a_transpose, m, n, kl, ku, @@ -700,7 +732,7 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const double, @@ -708,7 +740,7 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const float2, @@ -716,7 +748,7 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const double2, @@ -724,7 +756,7 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const half, @@ -732,7 +764,7 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template @@ -743,9 +775,11 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhemv(queue_cpp, event); routine.DoHemv(layout, triangle, n, @@ -764,7 +798,7 @@ template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, const size_t, const double2, @@ -772,7 +806,7 @@ template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template @@ -783,9 +817,11 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhbmv(queue_cpp, event); routine.DoHbmv(layout, triangle, n, k, @@ -804,7 +840,7 @@ template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, const size_t, const size_t, const double2, @@ -812,7 +848,7 @@ template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template @@ -823,9 +859,11 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhpmv(queue_cpp, event); routine.DoHpmv(layout, triangle, n, @@ -844,7 +882,7 @@ template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, const size_t, const double2, @@ -852,7 +890,7 @@ template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV template @@ -863,9 +901,11 @@ StatusCode Symv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsymv(queue_cpp, event); routine.DoSymv(layout, triangle, n, @@ -884,7 +924,7 @@ template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const size_t, const double, @@ -892,7 +932,7 @@ template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const size_t, const half, @@ -900,7 +940,7 @@ template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV template @@ -911,9 +951,11 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsbmv(queue_cpp, event); routine.DoSbmv(layout, triangle, n, k, @@ -932,7 +974,7 @@ template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const size_t, const size_t, const double, @@ -940,7 +982,7 @@ template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const size_t, const size_t, const half, @@ -948,7 +990,7 @@ template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV template @@ -959,9 +1001,11 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xspmv(queue_cpp, event); routine.DoSpmv(layout, triangle, n, @@ -980,7 +1024,7 @@ template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const double, @@ -988,7 +1032,7 @@ template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const half, @@ -996,7 +1040,7 @@ template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template @@ -1004,9 +1048,11 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtrmv(queue_cpp, event); routine.DoTrmv(layout, triangle, a_transpose, diagonal, n, @@ -1019,27 +1065,27 @@ template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const T const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template @@ -1047,9 +1093,11 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const size_t k, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtbmv(queue_cpp, event); routine.DoTbmv(layout, triangle, a_transpose, diagonal, n, k, @@ -1062,27 +1110,27 @@ template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const T const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template @@ -1090,9 +1138,11 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr ap_buffer, const size_t ap_offset, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtpmv(queue_cpp, event); routine.DoTpmv(layout, triangle, a_transpose, diagonal, n, @@ -1105,27 +1155,27 @@ template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const T const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template @@ -1133,9 +1183,11 @@ StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtrsv(queue_cpp, event); routine.DoTrsv(layout, triangle, a_transpose, diagonal, n, @@ -1148,22 +1200,22 @@ template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const T const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template @@ -1171,29 +1223,29 @@ StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template @@ -1201,29 +1253,29 @@ StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // General rank-1 matrix update: SGER/DGER/HGER template @@ -1233,9 +1285,11 @@ StatusCode Ger(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xger(queue_cpp, event); routine.DoGer(layout, m, n, @@ -1252,21 +1306,21 @@ template StatusCode PUBLIC_API Ger(const Layout, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // General rank-1 complex matrix update: CGERU/ZGERU template @@ -1276,9 +1330,11 @@ StatusCode Geru(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xgeru(queue_cpp, event); routine.DoGeru(layout, m, n, @@ -1295,14 +1351,14 @@ template StatusCode PUBLIC_API Geru(const Layout, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Geru(const Layout, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template @@ -1312,9 +1368,11 @@ StatusCode Gerc(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xgerc(queue_cpp, event); routine.DoGerc(layout, m, n, @@ -1331,14 +1389,14 @@ template StatusCode PUBLIC_API Gerc(const Layout, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gerc(const Layout, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian rank-1 matrix update: CHER/ZHER template @@ -1347,9 +1405,11 @@ StatusCode Her(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xher,T>(queue_cpp, event); routine.DoHer(layout, triangle, n, @@ -1364,13 +1424,13 @@ template StatusCode PUBLIC_API Her(const Layout, const Triangle, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Her(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian packed rank-1 matrix update: CHPR/ZHPR template @@ -1379,9 +1439,11 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhpr,T>(queue_cpp, event); routine.DoHpr(layout, triangle, n, @@ -1396,13 +1458,13 @@ template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian rank-2 matrix update: CHER2/ZHER2 template @@ -1412,9 +1474,11 @@ StatusCode Her2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xher2(queue_cpp, event); routine.DoHer2(layout, triangle, n, @@ -1431,14 +1495,14 @@ template StatusCode PUBLIC_API Her2(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Her2(const Layout, const Triangle, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 template @@ -1448,9 +1512,11 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhpr2(queue_cpp, event); routine.DoHpr2(layout, triangle, n, @@ -1467,14 +1533,14 @@ template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template @@ -1483,9 +1549,11 @@ StatusCode Syr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsyr(queue_cpp, event); routine.DoSyr(layout, triangle, n, @@ -1500,19 +1568,19 @@ template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template @@ -1521,9 +1589,11 @@ StatusCode Spr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xspr(queue_cpp, event); routine.DoSpr(layout, triangle, n, @@ -1538,19 +1608,19 @@ template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template @@ -1560,9 +1630,11 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsyr2(queue_cpp, event); routine.DoSyr2(layout, triangle, n, @@ -1579,21 +1651,21 @@ template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template @@ -1603,9 +1675,11 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xspr2(queue_cpp, event); routine.DoSpr2(layout, triangle, n, @@ -1622,21 +1696,21 @@ template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -1651,9 +1725,11 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xgemm(queue_cpp, event); routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, @@ -1672,7 +1748,7 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double, @@ -1680,7 +1756,7 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2, @@ -1688,7 +1764,7 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2, @@ -1696,7 +1772,7 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, cons const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half, @@ -1704,7 +1780,7 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const T const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template @@ -1715,9 +1791,11 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsymm(queue_cpp, event); routine.DoSymm(layout, side, triangle, m, n, @@ -1736,7 +1814,7 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Trian const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const double, @@ -1744,7 +1822,7 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Tria const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, @@ -1752,7 +1830,7 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Tria const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, @@ -1760,7 +1838,7 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Tri const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const half, @@ -1768,7 +1846,7 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triang const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template @@ -1779,9 +1857,11 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhemm(queue_cpp, event); routine.DoHemm(layout, side, triangle, m, n, @@ -1800,7 +1880,7 @@ template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Tria const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, @@ -1808,7 +1888,7 @@ template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Tri const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK template @@ -1818,9 +1898,11 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsyrk(queue_cpp, event); routine.DoSyrk(layout, triangle, a_transpose, n, k, @@ -1837,35 +1919,35 @@ template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const T const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Rank-K update of a hermitian matrix: CHERK/ZHERK template @@ -1875,9 +1957,11 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xherk,T>(queue_cpp, event); routine.DoHerk(layout, triangle, a_transpose, n, k, @@ -1894,14 +1978,14 @@ template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const T const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K template @@ -1912,9 +1996,11 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsyr2k(queue_cpp, event); routine.DoSyr2k(layout, triangle, ab_transpose, n, k, @@ -1933,7 +2019,7 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, @@ -1941,7 +2027,7 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, @@ -1949,7 +2035,7 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, @@ -1957,7 +2043,7 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, cons const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const half, @@ -1965,7 +2051,7 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const T const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template @@ -1976,9 +2062,11 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const U beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xher2k(queue_cpp, event); routine.DoHer2k(layout, triangle, ab_transpose, n, k, @@ -1997,7 +2085,7 @@ template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, @@ -2005,7 +2093,7 @@ template StatusCode PUBLIC_API Her2k(const Layout, const Triangl const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM template @@ -2014,9 +2102,11 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtrmm(queue_cpp, event); routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, @@ -2031,31 +2121,31 @@ template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Trian const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template @@ -2064,9 +2154,11 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtrsm(queue_cpp, event); routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, @@ -2081,25 +2173,25 @@ template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Trian const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -2112,9 +2204,11 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xomatcopy(queue_cpp, event); routine.DoOmatcopy(layout, a_transpose, m, n, @@ -2129,40 +2223,42 @@ template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL template StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const CUdeviceptr im_buffer, const size_t im_offset, CUdeviceptr col_buffer, const size_t col_offset, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xim2col(queue_cpp, event); routine.DoIm2col(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, Buffer(im_buffer), im_offset, @@ -2173,23 +2269,23 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED template @@ -2198,9 +2294,11 @@ StatusCode AxpyBatched(const size_t n, const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc, CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = XaxpyBatched(queue_cpp, event); auto alphas_cpp = std::vector(); auto x_offsets_cpp = std::vector(); @@ -2223,31 +2321,31 @@ template StatusCode PUBLIC_API AxpyBatched(const size_t, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API AxpyBatched(const size_t, const double*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API AxpyBatched(const size_t, const float2*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API AxpyBatched(const size_t, const double2*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API AxpyBatched(const size_t, const half*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED template @@ -2259,9 +2357,11 @@ StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const T const T *betas, CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = XgemmBatched(queue_cpp, event); auto alphas_cpp = std::vector(); auto betas_cpp = std::vector(); @@ -2294,7 +2394,7 @@ template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const float*, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double*, @@ -2303,7 +2403,7 @@ template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose const double*, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2*, @@ -2312,7 +2412,7 @@ template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose const float2*, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2*, @@ -2321,7 +2421,7 @@ template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpos const double2*, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half*, @@ -2330,7 +2430,7 @@ template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const half*, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // ================================================================================================= } // namespace clblast diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp index a5b6be4b..fd071434 100644 --- a/src/utilities/buffer_test.hpp +++ b/src/utilities/buffer_test.hpp @@ -15,7 +15,7 @@ #ifndef CLBLAST_BUFFER_TEST_H_ #define CLBLAST_BUFFER_TEST_H_ -#include "utilities/utilities.hpp +#include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= -- cgit v1.2.3 From 54d0c440ce84d61db1b462033052dd0f532a40d8 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 14 Oct 2017 11:43:57 +0200 Subject: Various fixes to make the host code and sample compile with the CUDA API --- samples/sgemm_cuda.cpp | 26 +++++----- scripts/generator/generator/cpp.py | 3 +- src/clblast_cuda.cpp | 100 ++++++++++++++++++------------------- src/cupp11.hpp | 10 ++-- src/cxpp11_common.hpp | 1 + src/kernels/opencl_to_cuda.h | 11 ++++ 6 files changed, 84 insertions(+), 67 deletions(-) (limited to 'scripts') diff --git a/samples/sgemm_cuda.cpp b/samples/sgemm_cuda.cpp index ed2ad588..f1138316 100644 --- a/samples/sgemm_cuda.cpp +++ b/samples/sgemm_cuda.cpp @@ -19,7 +19,7 @@ #include // Includes the CUDA driver API -#include +#include // Includes the CLBlast library #include @@ -43,14 +43,15 @@ int main() { const auto c_ld = n; // Initializes the OpenCL device + cuInit(0); CUdevice device; cuDeviceGet(&device, device_id); // Creates the OpenCL context and stream CUcontext context; - cuCtxCreate(context, 0, device); + cuCtxCreate(&context, 0, device); CUstream stream; - cuStreamCreate(queue, CU_STREAM_NON_BLOCKING); + cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING); // Populate host matrices with some example data auto host_a = std::vector(m*k); @@ -64,12 +65,12 @@ int main() { CUdeviceptr device_a; CUdeviceptr device_b; CUdeviceptr device_c; - cuMemAlloc(device_a, host_a.size()*sizeof(float)); - cuMemAlloc(device_b, host_b.size()*sizeof(float)); - cuMemAlloc(device_c, host_c.size()*sizeof(float)); - cuMemcpyHtoDAsync(device_a, host_a.data()), host_a.size()*sizeof(T), queue); - cuMemcpyHtoDAsync(device_b, host_c.data()), host_b.size()*sizeof(T), queue); - cuMemcpyHtoDAsync(device_c, host_b.data()), host_c.size()*sizeof(T), queue); + cuMemAlloc(&device_a, host_a.size()*sizeof(float)); + cuMemAlloc(&device_b, host_b.size()*sizeof(float)); + cuMemAlloc(&device_c, host_c.size()*sizeof(float)); + cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(float), stream); + cuMemcpyHtoDAsync(device_b, host_c.data(), host_b.size()*sizeof(float), stream); + cuMemcpyHtoDAsync(device_c, host_b.data(), host_c.size()*sizeof(float), stream); // Start the timer auto start_time = std::chrono::steady_clock::now(); @@ -79,11 +80,12 @@ int main() { clblast::Transpose::kNo, clblast::Transpose::kNo, m, n, k, alpha, - device_a(), 0, a_ld, - device_b(), 0, b_ld, + device_a, 0, a_ld, + device_b, 0, b_ld, beta, - device_c(), 0, c_ld, + device_c, 0, c_ld, context, device); + cuStreamSynchronize(stream); // Record the execution time auto elapsed_time = std::chrono::steady_clock::now() - start_time; diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 5413906a..2d18655f 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -56,7 +56,8 @@ def clblast_cc(routine, cuda=False): result += " auto queue_cpp = Queue(context_cpp, device_cpp);" + NL else: result += " auto queue_cpp = Queue(*queue);" + NL - result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL + event = "nullptr" if cuda else "event" + result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, " + event + ");" + NL if routine.batched: result += " " + (NL + " ").join(routine.batched_transform_to_cpp()) + NL result += " routine.Do" + routine.capitalized_name() + "(" diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp index f9a24236..0e3d949d 100644 --- a/src/clblast_cuda.cpp +++ b/src/clblast_cuda.cpp @@ -120,7 +120,7 @@ StatusCode Swap(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xswap(queue_cpp, event); + auto routine = Xswap(queue_cpp, nullptr); routine.DoSwap(n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); @@ -158,7 +158,7 @@ StatusCode Scal(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xscal(queue_cpp, event); + auto routine = Xscal(queue_cpp, nullptr); routine.DoScal(n, alpha, Buffer(x_buffer), x_offset, x_inc); @@ -196,7 +196,7 @@ StatusCode Copy(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xcopy(queue_cpp, event); + auto routine = Xcopy(queue_cpp, nullptr); routine.DoCopy(n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); @@ -235,7 +235,7 @@ StatusCode Axpy(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xaxpy(queue_cpp, event); + auto routine = Xaxpy(queue_cpp, nullptr); routine.DoAxpy(n, alpha, Buffer(x_buffer), x_offset, x_inc, @@ -280,7 +280,7 @@ StatusCode Dot(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xdot(queue_cpp, event); + auto routine = Xdot(queue_cpp, nullptr); routine.DoDot(n, Buffer(dot_buffer), dot_offset, Buffer(x_buffer), x_offset, x_inc, @@ -315,7 +315,7 @@ StatusCode Dotu(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xdotu(queue_cpp, event); + auto routine = Xdotu(queue_cpp, nullptr); routine.DoDotu(n, Buffer(dot_buffer), dot_offset, Buffer(x_buffer), x_offset, x_inc, @@ -345,7 +345,7 @@ StatusCode Dotc(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xdotc(queue_cpp, event); + auto routine = Xdotc(queue_cpp, nullptr); routine.DoDotc(n, Buffer(dot_buffer), dot_offset, Buffer(x_buffer), x_offset, x_inc, @@ -374,7 +374,7 @@ StatusCode Nrm2(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xnrm2(queue_cpp, event); + auto routine = Xnrm2(queue_cpp, nullptr); routine.DoNrm2(n, Buffer(nrm2_buffer), nrm2_offset, Buffer(x_buffer), x_offset, x_inc); @@ -412,7 +412,7 @@ StatusCode Asum(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xasum(queue_cpp, event); + auto routine = Xasum(queue_cpp, nullptr); routine.DoAsum(n, Buffer(asum_buffer), asum_offset, Buffer(x_buffer), x_offset, x_inc); @@ -450,7 +450,7 @@ StatusCode Sum(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsum(queue_cpp, event); + auto routine = Xsum(queue_cpp, nullptr); routine.DoSum(n, Buffer(sum_buffer), sum_offset, Buffer(x_buffer), x_offset, x_inc); @@ -488,7 +488,7 @@ StatusCode Amax(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xamax(queue_cpp, event); + auto routine = Xamax(queue_cpp, nullptr); routine.DoAmax(n, Buffer(imax_buffer), imax_offset, Buffer(x_buffer), x_offset, x_inc); @@ -526,7 +526,7 @@ StatusCode Amin(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xamin(queue_cpp, event); + auto routine = Xamin(queue_cpp, nullptr); routine.DoAmin(n, Buffer(imin_buffer), imin_offset, Buffer(x_buffer), x_offset, x_inc); @@ -564,7 +564,7 @@ StatusCode Max(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xmax(queue_cpp, event); + auto routine = Xmax(queue_cpp, nullptr); routine.DoMax(n, Buffer(imax_buffer), imax_offset, Buffer(x_buffer), x_offset, x_inc); @@ -602,7 +602,7 @@ StatusCode Min(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xmin(queue_cpp, event); + auto routine = Xmin(queue_cpp, nullptr); routine.DoMin(n, Buffer(imin_buffer), imin_offset, Buffer(x_buffer), x_offset, x_inc); @@ -648,7 +648,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xgemv(queue_cpp, event); + auto routine = Xgemv(queue_cpp, nullptr); routine.DoGemv(layout, a_transpose, m, n, alpha, @@ -714,7 +714,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xgbmv(queue_cpp, event); + auto routine = Xgbmv(queue_cpp, nullptr); routine.DoGbmv(layout, a_transpose, m, n, kl, ku, alpha, @@ -780,7 +780,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhemv(queue_cpp, event); + auto routine = Xhemv(queue_cpp, nullptr); routine.DoHemv(layout, triangle, n, alpha, @@ -822,7 +822,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhbmv(queue_cpp, event); + auto routine = Xhbmv(queue_cpp, nullptr); routine.DoHbmv(layout, triangle, n, k, alpha, @@ -864,7 +864,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhpmv(queue_cpp, event); + auto routine = Xhpmv(queue_cpp, nullptr); routine.DoHpmv(layout, triangle, n, alpha, @@ -906,7 +906,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsymv(queue_cpp, event); + auto routine = Xsymv(queue_cpp, nullptr); routine.DoSymv(layout, triangle, n, alpha, @@ -956,7 +956,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsbmv(queue_cpp, event); + auto routine = Xsbmv(queue_cpp, nullptr); routine.DoSbmv(layout, triangle, n, k, alpha, @@ -1006,7 +1006,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xspmv(queue_cpp, event); + auto routine = Xspmv(queue_cpp, nullptr); routine.DoSpmv(layout, triangle, n, alpha, @@ -1053,7 +1053,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtrmv(queue_cpp, event); + auto routine = Xtrmv(queue_cpp, nullptr); routine.DoTrmv(layout, triangle, a_transpose, diagonal, n, Buffer(a_buffer), a_offset, a_ld, @@ -1098,7 +1098,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtbmv(queue_cpp, event); + auto routine = Xtbmv(queue_cpp, nullptr); routine.DoTbmv(layout, triangle, a_transpose, diagonal, n, k, Buffer(a_buffer), a_offset, a_ld, @@ -1143,7 +1143,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtpmv(queue_cpp, event); + auto routine = Xtpmv(queue_cpp, nullptr); routine.DoTpmv(layout, triangle, a_transpose, diagonal, n, Buffer(ap_buffer), ap_offset, @@ -1188,7 +1188,7 @@ StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtrsv(queue_cpp, event); + auto routine = Xtrsv(queue_cpp, nullptr); routine.DoTrsv(layout, triangle, a_transpose, diagonal, n, Buffer(a_buffer), a_offset, a_ld, @@ -1290,7 +1290,7 @@ StatusCode Ger(const Layout layout, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xger(queue_cpp, event); + auto routine = Xger(queue_cpp, nullptr); routine.DoGer(layout, m, n, alpha, @@ -1335,7 +1335,7 @@ StatusCode Geru(const Layout layout, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xgeru(queue_cpp, event); + auto routine = Xgeru(queue_cpp, nullptr); routine.DoGeru(layout, m, n, alpha, @@ -1373,7 +1373,7 @@ StatusCode Gerc(const Layout layout, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xgerc(queue_cpp, event); + auto routine = Xgerc(queue_cpp, nullptr); routine.DoGerc(layout, m, n, alpha, @@ -1410,7 +1410,7 @@ StatusCode Her(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xher,T>(queue_cpp, event); + auto routine = Xher,T>(queue_cpp, nullptr); routine.DoHer(layout, triangle, n, alpha, @@ -1444,7 +1444,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhpr,T>(queue_cpp, event); + auto routine = Xhpr,T>(queue_cpp, nullptr); routine.DoHpr(layout, triangle, n, alpha, @@ -1479,7 +1479,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xher2(queue_cpp, event); + auto routine = Xher2(queue_cpp, nullptr); routine.DoHer2(layout, triangle, n, alpha, @@ -1517,7 +1517,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhpr2(queue_cpp, event); + auto routine = Xhpr2(queue_cpp, nullptr); routine.DoHpr2(layout, triangle, n, alpha, @@ -1554,7 +1554,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsyr(queue_cpp, event); + auto routine = Xsyr(queue_cpp, nullptr); routine.DoSyr(layout, triangle, n, alpha, @@ -1594,7 +1594,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xspr(queue_cpp, event); + auto routine = Xspr(queue_cpp, nullptr); routine.DoSpr(layout, triangle, n, alpha, @@ -1635,7 +1635,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsyr2(queue_cpp, event); + auto routine = Xsyr2(queue_cpp, nullptr); routine.DoSyr2(layout, triangle, n, alpha, @@ -1680,7 +1680,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xspr2(queue_cpp, event); + auto routine = Xspr2(queue_cpp, nullptr); routine.DoSpr2(layout, triangle, n, alpha, @@ -1730,7 +1730,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xgemm(queue_cpp, event); + auto routine = Xgemm(queue_cpp, nullptr); routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha, @@ -1796,7 +1796,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsymm(queue_cpp, event); + auto routine = Xsymm(queue_cpp, nullptr); routine.DoSymm(layout, side, triangle, m, n, alpha, @@ -1862,7 +1862,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhemm(queue_cpp, event); + auto routine = Xhemm(queue_cpp, nullptr); routine.DoHemm(layout, side, triangle, m, n, alpha, @@ -1903,7 +1903,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsyrk(queue_cpp, event); + auto routine = Xsyrk(queue_cpp, nullptr); routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha, @@ -1962,7 +1962,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xherk,T>(queue_cpp, event); + auto routine = Xherk,T>(queue_cpp, nullptr); routine.DoHerk(layout, triangle, a_transpose, n, k, alpha, @@ -2001,7 +2001,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsyr2k(queue_cpp, event); + auto routine = Xsyr2k(queue_cpp, nullptr); routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha, @@ -2067,7 +2067,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xher2k(queue_cpp, event); + auto routine = Xher2k(queue_cpp, nullptr); routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha, @@ -2107,7 +2107,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtrmm(queue_cpp, event); + auto routine = Xtrmm(queue_cpp, nullptr); routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, @@ -2159,7 +2159,7 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtrsm(queue_cpp, event); + auto routine = Xtrsm(queue_cpp, nullptr); routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, @@ -2209,7 +2209,7 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xomatcopy(queue_cpp, event); + auto routine = Xomatcopy(queue_cpp, nullptr); routine.DoOmatcopy(layout, a_transpose, m, n, alpha, @@ -2259,7 +2259,7 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xim2col(queue_cpp, event); + auto routine = Xim2col(queue_cpp, nullptr); routine.DoIm2col(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, Buffer(im_buffer), im_offset, Buffer(col_buffer), col_offset); @@ -2299,7 +2299,7 @@ StatusCode AxpyBatched(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = XaxpyBatched(queue_cpp, event); + auto routine = XaxpyBatched(queue_cpp, nullptr); auto alphas_cpp = std::vector(); auto x_offsets_cpp = std::vector(); auto y_offsets_cpp = std::vector(); @@ -2362,7 +2362,7 @@ StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const T const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = XgemmBatched(queue_cpp, event); + auto routine = XgemmBatched(queue_cpp, nullptr); auto alphas_cpp = std::vector(); auto betas_cpp = std::vector(); auto a_offsets_cpp = std::vector(); diff --git a/src/cupp11.hpp b/src/cupp11.hpp index 988366ea..854c0be9 100644 --- a/src/cupp11.hpp +++ b/src/cupp11.hpp @@ -370,6 +370,8 @@ using ContextPointer = CUcontext*; // C++11 version of 'nvrtcProgram'. Additionally holds the program's source code. class Program { public: + Program() = default; + // Note that there is no constructor based on the regular CUDA data-type because of extra state // Source-based constructor with memory management @@ -404,7 +406,7 @@ public: // Confirms whether a certain status code is an actual compilation error or warning bool StatusIsCompilationWarningOrError(const nvrtcResult status) const { - return (status == NVRTC_ERROR_INVALID_INPUT); + return (status == NVRTC_ERROR_COMPILATION); } // Retrieves the warning/error message from the compiler (if any) @@ -433,8 +435,8 @@ public: const nvrtcProgram& operator()() const { return *program_; } private: std::shared_ptr program_; - const std::string source_; - const bool from_binary_; + std::string source_; + bool from_binary_; }; // ================================================================================================= @@ -730,7 +732,7 @@ public: // TODO: Implement this function void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event, - std::vector& waitForEvents) { + const std::vector& waitForEvents) { if (local.size() == 0) { throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end"); } diff --git a/src/cxpp11_common.hpp b/src/cxpp11_common.hpp index 6ac008be..5097eac4 100644 --- a/src/cxpp11_common.hpp +++ b/src/cxpp11_common.hpp @@ -15,6 +15,7 @@ #ifndef CLBLAST_CXPP11_COMMON_H_ #define CLBLAST_CXPP11_COMMON_H_ +#include // strchr #include // std::string #include // std::runtime_error diff --git a/src/kernels/opencl_to_cuda.h b/src/kernels/opencl_to_cuda.h index 43a26a2f..2e46bc2b 100644 --- a/src/kernels/opencl_to_cuda.h +++ b/src/kernels/opencl_to_cuda.h @@ -11,6 +11,11 @@ // // ================================================================================================= +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( +// ================================================================================================= + // Replaces the OpenCL keywords with CUDA equivalent #define __kernel __placeholder__ #define __global @@ -49,3 +54,9 @@ typedef struct { float s0; float s1; float s2; float s3; float s12; float s13; float s14; float s15; } float16; // ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= + -- cgit v1.2.3