summary refs log tree commit diff
path: root/test/wrapper_cublas.hpp
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2017-04-13 21:31:27 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2017-04-13 21:31:27 +0200
commit f7f8ec644f51d16f888b6a7086009b79c0beef8f (patch)
tree 88f652bba2a980b44010f415ed5d48af15d0b063 /test/wrapper_cublas.hpp
parent f24c142948fc71d8b37826c1275259668fe0d0e5 (diff)
Fixed CUDA malloc and cuBLAS handles: cuBLAS as a performance-reference now works
Diffstat (limited to 'test/wrapper_cublas.hpp')
-rw-r--r-- test/wrapper_cublas.hpp 914
1 files changed, 319 insertions, 595 deletions
diff --git a/test/wrapper_cublas.hpp b/test/wrapper_cublas.hpp
index 4de8451a..35b1b9c6 100644
--- a/test/wrapper_cublas.hpp
+++ b/test/wrapper_cublas.hpp
@@ -34,258 +34,226 @@ cublasSideMode_t convertToCUBLAS(const Side v) { return (v == Side::kLeft) ? CUB
// Forwards the cuBLAS calls for SROTG/DROTG
template <typename T>
-cublasStatus_t cublasXrotg(T* sa_buffer, const size_t sa_offset,
+cublasStatus_t cublasXrotg(cublasHandle_t handle, T* sa_buffer, const size_t sa_offset,
T* sb_buffer, const size_t sb_offset,
T* sc_buffer, const size_t sc_offset,
T* ss_buffer, const size_t ss_offset);
template <>
-cublasStatus_t cublasXrotg<float>(float* sa_buffer, const size_t sa_offset,
+cublasStatus_t cublasXrotg<float>(cublasHandle_t handle, float* sa_buffer, const size_t sa_offset,
float* sb_buffer, const size_t sb_offset,
float* sc_buffer, const size_t sc_offset,
float* ss_buffer, const size_t ss_offset) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSrotg(handle, &sa_buffer[sa_offset],
&sb_buffer[sb_offset],
&sc_buffer[sc_offset],
&ss_buffer[ss_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXrotg<double>(double* sa_buffer, const size_t sa_offset,
+cublasStatus_t cublasXrotg<double>(cublasHandle_t handle, double* sa_buffer, const size_t sa_offset,
double* sb_buffer, const size_t sb_offset,
double* sc_buffer, const size_t sc_offset,
double* ss_buffer, const size_t ss_offset) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDrotg(handle, &sa_buffer[sa_offset],
&sb_buffer[sb_offset],
&sc_buffer[sc_offset],
&ss_buffer[ss_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for SROTMG/DROTMG
template <typename T>
-cublasStatus_t cublasXrotmg(T* sd1_buffer, const size_t sd1_offset,
+cublasStatus_t cublasXrotmg(cublasHandle_t handle, T* sd1_buffer, const size_t sd1_offset,
T* sd2_buffer, const size_t sd2_offset,
T* sx1_buffer, const size_t sx1_offset,
const T* sy1_buffer, const size_t sy1_offset,
T* sparam_buffer, const size_t sparam_offset);
template <>
-cublasStatus_t cublasXrotmg<float>(float* sd1_buffer, const size_t sd1_offset,
+cublasStatus_t cublasXrotmg<float>(cublasHandle_t handle, float* sd1_buffer, const size_t sd1_offset,
float* sd2_buffer, const size_t sd2_offset,
float* sx1_buffer, const size_t sx1_offset,
const float* sy1_buffer, const size_t sy1_offset,
float* sparam_buffer, const size_t sparam_offset) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSrotmg(handle, &sd1_buffer[sd1_offset],
&sd2_buffer[sd2_offset],
&sx1_buffer[sx1_offset],
&sy1_buffer[sy1_offset],
&sparam_buffer[sparam_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXrotmg<double>(double* sd1_buffer, const size_t sd1_offset,
+cublasStatus_t cublasXrotmg<double>(cublasHandle_t handle, double* sd1_buffer, const size_t sd1_offset,
double* sd2_buffer, const size_t sd2_offset,
double* sx1_buffer, const size_t sx1_offset,
const double* sy1_buffer, const size_t sy1_offset,
double* sparam_buffer, const size_t sparam_offset) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDrotmg(handle, &sd1_buffer[sd1_offset],
&sd2_buffer[sd2_offset],
&sx1_buffer[sx1_offset],
&sy1_buffer[sy1_offset],
&sparam_buffer[sparam_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for SROT/DROT
-cublasStatus_t cublasXrot(const size_t n,
+cublasStatus_t cublasXrot(cublasHandle_t handle, const size_t n,
float* x_buffer, const size_t x_offset, const size_t x_inc,
float* y_buffer, const size_t y_offset, const size_t y_inc,
const float cos,
const float sin) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSrot(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&cos,
&sin);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXrot(const size_t n,
+cublasStatus_t cublasXrot(cublasHandle_t handle, const size_t n,
double* x_buffer, const size_t x_offset, const size_t x_inc,
double* y_buffer, const size_t y_offset, const size_t y_inc,
const double cos,
const double sin) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDrot(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&cos,
&sin);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for SROTM/DROTM
template <typename T>
-cublasStatus_t cublasXrotm(const size_t n,
+cublasStatus_t cublasXrotm(cublasHandle_t handle, const size_t n,
T* x_buffer, const size_t x_offset, const size_t x_inc,
T* y_buffer, const size_t y_offset, const size_t y_inc,
T* sparam_buffer, const size_t sparam_offset);
template <>
-cublasStatus_t cublasXrotm<float>(const size_t n,
+cublasStatus_t cublasXrotm<float>(cublasHandle_t handle, const size_t n,
float* x_buffer, const size_t x_offset, const size_t x_inc,
float* y_buffer, const size_t y_offset, const size_t y_inc,
float* sparam_buffer, const size_t sparam_offset) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSrotm(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&sparam_buffer[sparam_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXrotm<double>(const size_t n,
+cublasStatus_t cublasXrotm<double>(cublasHandle_t handle, const size_t n,
double* x_buffer, const size_t x_offset, const size_t x_inc,
double* y_buffer, const size_t y_offset, const size_t y_inc,
double* sparam_buffer, const size_t sparam_offset) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDrotm(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&sparam_buffer[sparam_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP
template <typename T>
-cublasStatus_t cublasXswap(const size_t n,
+cublasStatus_t cublasXswap(cublasHandle_t handle, const size_t n,
T* x_buffer, const size_t x_offset, const size_t x_inc,
T* y_buffer, const size_t y_offset, const size_t y_inc);
template <>
-cublasStatus_t cublasXswap<float>(const size_t n,
+cublasStatus_t cublasXswap<float>(cublasHandle_t handle, const size_t n,
float* x_buffer, const size_t x_offset, const size_t x_inc,
float* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSswap(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXswap<double>(const size_t n,
+cublasStatus_t cublasXswap<double>(cublasHandle_t handle, const size_t n,
double* x_buffer, const size_t x_offset, const size_t x_inc,
double* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDswap(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXswap<float2>(const size_t n,
+cublasStatus_t cublasXswap<float2>(cublasHandle_t handle, const size_t n,
float2* x_buffer, const size_t x_offset, const size_t x_inc,
float2* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCswap(handle, static_cast<int>(n),
reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXswap<double2>(const size_t n,
+cublasStatus_t cublasXswap<double2>(cublasHandle_t handle, const size_t n,
double2* x_buffer, const size_t x_offset, const size_t x_inc,
double2* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZswap(handle, static_cast<int>(n),
reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXswap<half>(const size_t n,
+cublasStatus_t cublasXswap<half>(cublasHandle_t handle, const size_t n,
half* x_buffer, const size_t x_offset, const size_t x_inc,
half* y_buffer, const size_t y_offset, const size_t y_inc) {
return CUBLAS_STATUS_NOT_SUPPORTED;
}
// Forwards the cuBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL
-cublasStatus_t cublasXscal(const size_t n,
+cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n,
const float alpha,
float* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSscal(handle, static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXscal(const size_t n,
+cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n,
const double alpha,
double* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDscal(handle, static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXscal(const size_t n,
+cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n,
const float2 alpha,
float2* x_buffer, const size_t x_offset, const size_t x_inc) {
cuComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCscal(handle, static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXscal(const size_t n,
+cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n,
const double2 alpha,
double2* x_buffer, const size_t x_offset, const size_t x_inc) {
cuDoubleComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZscal(handle, static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXscal(const size_t n,
+cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n,
const half alpha,
half* x_buffer, const size_t x_offset, const size_t x_inc) {
return CUBLAS_STATUS_NOT_SUPPORTED;
@@ -293,124 +261,108 @@ cublasStatus_t cublasXscal(const size_t n,
// Forwards the cuBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY
template <typename T>
-cublasStatus_t cublasXcopy(const size_t n,
+cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n,
const T* x_buffer, const size_t x_offset, const size_t x_inc,
T* y_buffer, const size_t y_offset, const size_t y_inc);
template <>
-cublasStatus_t cublasXcopy<float>(const size_t n,
+cublasStatus_t cublasXcopy<float>(cublasHandle_t handle, const size_t n,
const float* x_buffer, const size_t x_offset, const size_t x_inc,
float* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasScopy(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXcopy<double>(const size_t n,
+cublasStatus_t cublasXcopy<double>(cublasHandle_t handle, const size_t n,
const double* x_buffer, const size_t x_offset, const size_t x_inc,
double* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDcopy(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXcopy<float2>(const size_t n,
+cublasStatus_t cublasXcopy<float2>(cublasHandle_t handle, const size_t n,
const float2* x_buffer, const size_t x_offset, const size_t x_inc,
float2* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCcopy(handle, static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXcopy<double2>(const size_t n,
+cublasStatus_t cublasXcopy<double2>(cublasHandle_t handle, const size_t n,
const double2* x_buffer, const size_t x_offset, const size_t x_inc,
double2* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZcopy(handle, static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXcopy<half>(const size_t n,
+cublasStatus_t cublasXcopy<half>(cublasHandle_t handle, const size_t n,
const half* x_buffer, const size_t x_offset, const size_t x_inc,
half* y_buffer, const size_t y_offset, const size_t y_inc) {
return CUBLAS_STATUS_NOT_SUPPORTED;
}
// Forwards the cuBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY
-cublasStatus_t cublasXaxpy(const size_t n,
+cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n,
const float alpha,
const float* x_buffer, const size_t x_offset, const size_t x_inc,
float* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSaxpy(handle, static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXaxpy(const size_t n,
+cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n,
const double alpha,
const double* x_buffer, const size_t x_offset, const size_t x_inc,
double* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDaxpy(handle, static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXaxpy(const size_t n,
+cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n,
const float2 alpha,
const float2* x_buffer, const size_t x_offset, const size_t x_inc,
float2* y_buffer, const size_t y_offset, const size_t y_inc) {
cuComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCaxpy(handle, static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXaxpy(const size_t n,
+cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n,
const double2 alpha,
const double2* x_buffer, const size_t x_offset, const size_t x_inc,
double2* y_buffer, const size_t y_offset, const size_t y_inc) {
cuDoubleComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZaxpy(handle, static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXaxpy(const size_t n,
+cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n,
const half alpha,
const half* x_buffer, const size_t x_offset, const size_t x_inc,
half* y_buffer, const size_t y_offset, const size_t y_inc) {
@@ -419,40 +371,36 @@ cublasStatus_t cublasXaxpy(const size_t n,
// Forwards the cuBLAS calls for SDOT/DDOT
template <typename T>
-cublasStatus_t cublasXdot(const size_t n,
+cublasStatus_t cublasXdot(cublasHandle_t handle, const size_t n,
T* dot_buffer, const size_t dot_offset,
const T* x_buffer, const size_t x_offset, const size_t x_inc,
const T* y_buffer, const size_t y_offset, const size_t y_inc);
template <>
-cublasStatus_t cublasXdot<float>(const size_t n,
+cublasStatus_t cublasXdot<float>(cublasHandle_t handle, const size_t n,
float* dot_buffer, const size_t dot_offset,
const float* x_buffer, const size_t x_offset, const size_t x_inc,
const float* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSdot(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&dot_buffer[dot_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXdot<double>(const size_t n,
+cublasStatus_t cublasXdot<double>(cublasHandle_t handle, const size_t n,
double* dot_buffer, const size_t dot_offset,
const double* x_buffer, const size_t x_offset, const size_t x_inc,
const double* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDdot(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&dot_buffer[dot_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXdot<half>(const size_t n,
+cublasStatus_t cublasXdot<half>(cublasHandle_t handle, const size_t n,
half* dot_buffer, const size_t dot_offset,
const half* x_buffer, const size_t x_offset, const size_t x_inc,
const half* y_buffer, const size_t y_offset, const size_t y_inc) {
@@ -461,129 +409,113 @@ cublasStatus_t cublasXdot<half>(const size_t n,
// Forwards the cuBLAS calls for CDOTU/ZDOTU
template <typename T>
-cublasStatus_t cublasXdotu(const size_t n,
+cublasStatus_t cublasXdotu(cublasHandle_t handle, const size_t n,
T* dot_buffer, const size_t dot_offset,
const T* x_buffer, const size_t x_offset, const size_t x_inc,
const T* y_buffer, const size_t y_offset, const size_t y_inc);
template <>
-cublasStatus_t cublasXdotu<float2>(const size_t n,
+cublasStatus_t cublasXdotu<float2>(cublasHandle_t handle, const size_t n,
float2* dot_buffer, const size_t dot_offset,
const float2* x_buffer, const size_t x_offset, const size_t x_inc,
const float2* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCdotu(handle, static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuComplex*>(&dot_buffer[dot_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXdotu<double2>(const size_t n,
+cublasStatus_t cublasXdotu<double2>(cublasHandle_t handle, const size_t n,
double2* dot_buffer, const size_t dot_offset,
const double2* x_buffer, const size_t x_offset, const size_t x_inc,
const double2* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZdotu(handle, static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuDoubleComplex*>(&dot_buffer[dot_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for CDOTC/ZDOTC
template <typename T>
-cublasStatus_t cublasXdotc(const size_t n,
+cublasStatus_t cublasXdotc(cublasHandle_t handle, const size_t n,
T* dot_buffer, const size_t dot_offset,
const T* x_buffer, const size_t x_offset, const size_t x_inc,
const T* y_buffer, const size_t y_offset, const size_t y_inc);
template <>
-cublasStatus_t cublasXdotc<float2>(const size_t n,
+cublasStatus_t cublasXdotc<float2>(cublasHandle_t handle, const size_t n,
float2* dot_buffer, const size_t dot_offset,
const float2* x_buffer, const size_t x_offset, const size_t x_inc,
const float2* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCdotc(handle, static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuComplex*>(&dot_buffer[dot_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXdotc<double2>(const size_t n,
+cublasStatus_t cublasXdotc<double2>(cublasHandle_t handle, const size_t n,
double2* dot_buffer, const size_t dot_offset,
const double2* x_buffer, const size_t x_offset, const size_t x_inc,
const double2* y_buffer, const size_t y_offset, const size_t y_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZdotc(handle, static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuDoubleComplex*>(&dot_buffer[dot_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2
template <typename T>
-cublasStatus_t cublasXnrm2(const size_t n,
+cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n,
T* nrm2_buffer, const size_t nrm2_offset,
const T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
-cublasStatus_t cublasXnrm2<float>(const size_t n,
+cublasStatus_t cublasXnrm2<float>(cublasHandle_t handle, const size_t n,
float* nrm2_buffer, const size_t nrm2_offset,
const float* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSnrm2(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&nrm2_buffer[nrm2_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXnrm2<double>(const size_t n,
+cublasStatus_t cublasXnrm2<double>(cublasHandle_t handle, const size_t n,
double* nrm2_buffer, const size_t nrm2_offset,
const double* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDnrm2(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&nrm2_buffer[nrm2_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXnrm2<float2>(const size_t n,
+cublasStatus_t cublasXnrm2<float2>(cublasHandle_t handle, const size_t n,
float2* nrm2_buffer, const size_t nrm2_offset,
const float2* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasScnrm2(handle, static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<float*>(&nrm2_buffer[nrm2_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXnrm2<double2>(const size_t n,
+cublasStatus_t cublasXnrm2<double2>(cublasHandle_t handle, const size_t n,
double2* nrm2_buffer, const size_t nrm2_offset,
const double2* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDznrm2(handle, static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<double*>(&nrm2_buffer[nrm2_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXnrm2<half>(const size_t n,
+cublasStatus_t cublasXnrm2<half>(cublasHandle_t handle, const size_t n,
half* nrm2_buffer, const size_t nrm2_offset,
const half* x_buffer, const size_t x_offset, const size_t x_inc) {
return CUBLAS_STATUS_NOT_SUPPORTED;
@@ -591,59 +523,51 @@ cublasStatus_t cublasXnrm2<half>(const size_t n,
// Forwards the cuBLAS calls for SASUM/DASUM/ScASUM/DzASUM
template <typename T>
-cublasStatus_t cublasXasum(const size_t n,
+cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n,
T* asum_buffer, const size_t asum_offset,
const T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
-cublasStatus_t cublasXasum<float>(const size_t n,
+cublasStatus_t cublasXasum<float>(cublasHandle_t handle, const size_t n,
float* asum_buffer, const size_t asum_offset,
const float* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSasum(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&asum_buffer[asum_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXasum<double>(const size_t n,
+cublasStatus_t cublasXasum<double>(cublasHandle_t handle, const size_t n,
double* asum_buffer, const size_t asum_offset,
const double* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDasum(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
&asum_buffer[asum_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXasum<float2>(const size_t n,
+cublasStatus_t cublasXasum<float2>(cublasHandle_t handle, const size_t n,
float2* asum_buffer, const size_t asum_offset,
const float2* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasScasum(handle, static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<float*>(&asum_buffer[asum_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXasum<double2>(const size_t n,
+cublasStatus_t cublasXasum<double2>(cublasHandle_t handle, const size_t n,
double2* asum_buffer, const size_t asum_offset,
const double2* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDzasum(handle, static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<double*>(&asum_buffer[asum_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXasum<half>(const size_t n,
+cublasStatus_t cublasXasum<half>(cublasHandle_t handle, const size_t n,
half* asum_buffer, const size_t asum_offset,
const half* x_buffer, const size_t x_offset, const size_t x_inc) {
return CUBLAS_STATUS_NOT_SUPPORTED;
@@ -651,59 +575,51 @@ cublasStatus_t cublasXasum<half>(const size_t n,
// Forwards the cuBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
template <typename T>
-cublasStatus_t cublasXamax(const size_t n,
+cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n,
T* imax_buffer, const size_t imax_offset,
const T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
-cublasStatus_t cublasXamax<float>(const size_t n,
+cublasStatus_t cublasXamax<float>(cublasHandle_t handle, const size_t n,
float* imax_buffer, const size_t imax_offset,
const float* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasIsamax(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
reinterpret_cast<int*>(&imax_buffer[imax_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXamax<double>(const size_t n,
+cublasStatus_t cublasXamax<double>(cublasHandle_t handle, const size_t n,
double* imax_buffer, const size_t imax_offset,
const double* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasIdamax(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
reinterpret_cast<int*>(&imax_buffer[imax_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXamax<float2>(const size_t n,
+cublasStatus_t cublasXamax<float2>(cublasHandle_t handle, const size_t n,
float2* imax_buffer, const size_t imax_offset,
const float2* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasIcamax(handle, static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<int*>(&imax_buffer[imax_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXamax<double2>(const size_t n,
+cublasStatus_t cublasXamax<double2>(cublasHandle_t handle, const size_t n,
double2* imax_buffer, const size_t imax_offset,
const double2* x_buffer, const size_t x_offset, const size_t x_inc) {
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasIzamax(handle, static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<int*>(&imax_buffer[imax_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXamax<half>(const size_t n,
+cublasStatus_t cublasXamax<half>(cublasHandle_t handle, const size_t n,
half* imax_buffer, const size_t imax_offset,
const half* x_buffer, const size_t x_offset, const size_t x_inc) {
return CUBLAS_STATUS_NOT_SUPPORTED;
@@ -714,7 +630,7 @@ cublasStatus_t cublasXamax<half>(const size_t n,
// =================================================================================================
// Forwards the cuBLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV
-cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose,
const size_t m, const size_t n,
const float alpha,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -722,8 +638,6 @@ cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transp
const float beta,
float* y_buffer, const size_t y_offset, const size_t y_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSgemv(handle, a_transpose,
static_cast<int>(m), static_cast<int>(n),
&alpha,
@@ -731,10 +645,10 @@ cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transp
&x_buffer[x_offset], static_cast<int>(x_inc),
&beta,
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose,
const size_t m, const size_t n,
const double alpha,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -742,8 +656,6 @@ cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transp
const double beta,
double* y_buffer, const size_t y_offset, const size_t y_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDgemv(handle, a_transpose,
static_cast<int>(m), static_cast<int>(n),
&alpha,
@@ -751,10 +663,10 @@ cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transp
&x_buffer[x_offset], static_cast<int>(x_inc),
&beta,
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose,
const size_t m, const size_t n,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -768,8 +680,6 @@ cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transp
cuComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCgemv(handle, a_transpose,
static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
@@ -777,10 +687,10 @@ cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transp
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
&beta_cuda,
reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose,
const size_t m, const size_t n,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -794,8 +704,6 @@ cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transp
cuDoubleComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZgemv(handle, a_transpose,
static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
@@ -803,10 +711,10 @@ cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transp
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
&beta_cuda,
reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose,
const size_t m, const size_t n,
const half alpha,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -817,7 +725,7 @@ cublasStatus_t cublasXgemv(const Layout layout, const cublasOperation_t a_transp
}
// Forwards the cuBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV
-cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const float alpha,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -825,8 +733,6 @@ cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transp
const float beta,
float* y_buffer, const size_t y_offset, const size_t y_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSgbmv(handle, a_transpose,
static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku),
&alpha,
@@ -834,10 +740,10 @@ cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transp
&x_buffer[x_offset], static_cast<int>(x_inc),
&beta,
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const double alpha,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -845,8 +751,6 @@ cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transp
const double beta,
double* y_buffer, const size_t y_offset, const size_t y_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDgbmv(handle, a_transpose,
static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku),
&alpha,
@@ -854,10 +758,10 @@ cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transp
&x_buffer[x_offset], static_cast<int>(x_inc),
&beta,
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -871,8 +775,6 @@ cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transp
cuComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCgbmv(handle, a_transpose,
static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku),
&alpha_cuda,
@@ -880,10 +782,10 @@ cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transp
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
&beta_cuda,
reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -897,8 +799,6 @@ cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transp
cuDoubleComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZgbmv(handle, a_transpose,
static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku),
&alpha_cuda,
@@ -906,10 +806,10 @@ cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transp
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
&beta_cuda,
reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const half alpha,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -920,7 +820,7 @@ cublasStatus_t cublasXgbmv(const Layout layout, const cublasOperation_t a_transp
}
// Forwards the cuBLAS calls for CHEMV/ZHEMV
-cublasStatus_t cublasXhemv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhemv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -934,8 +834,6 @@ cublasStatus_t cublasXhemv(const Layout layout, const cublasFillMode_t triangle,
cuComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasChemv(handle, triangle,
static_cast<int>(n),
&alpha_cuda,
@@ -943,10 +841,10 @@ cublasStatus_t cublasXhemv(const Layout layout, const cublasFillMode_t triangle,
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
&beta_cuda,
reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXhemv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhemv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -960,8 +858,6 @@ cublasStatus_t cublasXhemv(const Layout layout, const cublasFillMode_t triangle,
cuDoubleComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZhemv(handle, triangle,
static_cast<int>(n),
&alpha_cuda,
@@ -969,12 +865,12 @@ cublasStatus_t cublasXhemv(const Layout layout, const cublasFillMode_t triangle,
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
&beta_cuda,
reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for CHBMV/ZHBMV
-cublasStatus_t cublasXhbmv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n, const size_t k,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -988,8 +884,6 @@ cublasStatus_t cublasXhbmv(const Layout layout, const cublasFillMode_t triangle,
cuComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasChbmv(handle, triangle,
static_cast<int>(n), static_cast<int>(k),
&alpha_cuda,
@@ -997,10 +891,10 @@ cublasStatus_t cublasXhbmv(const Layout layout, const cublasFillMode_t triangle,
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
&beta_cuda,
reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXhbmv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n, const size_t k,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -1014,8 +908,6 @@ cublasStatus_t cublasXhbmv(const Layout layout, const cublasFillMode_t triangle,
cuDoubleComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZhbmv(handle, triangle,
static_cast<int>(n), static_cast<int>(k),
&alpha_cuda,
@@ -1023,12 +915,12 @@ cublasStatus_t cublasXhbmv(const Layout layout, const cublasFillMode_t triangle,
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
&beta_cuda,
reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for CHPMV/ZHPMV
-cublasStatus_t cublasXhpmv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float2 alpha,
const float2* ap_buffer, const size_t ap_offset,
@@ -1042,8 +934,6 @@ cublasStatus_t cublasXhpmv(const Layout layout, const cublasFillMode_t triangle,
cuComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasChpmv(handle, triangle,
static_cast<int>(n),
&alpha_cuda,
@@ -1051,10 +941,10 @@ cublasStatus_t cublasXhpmv(const Layout layout, const cublasFillMode_t triangle,
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
&beta_cuda,
reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXhpmv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double2 alpha,
const double2* ap_buffer, const size_t ap_offset,
@@ -1068,8 +958,6 @@ cublasStatus_t cublasXhpmv(const Layout layout, const cublasFillMode_t triangle,
cuDoubleComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZhpmv(handle, triangle,
static_cast<int>(n),
&alpha_cuda,
@@ -1077,12 +965,12 @@ cublasStatus_t cublasXhpmv(const Layout layout, const cublasFillMode_t triangle,
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
&beta_cuda,
reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for SSYMV/DSYMV
-cublasStatus_t cublasXsymv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float alpha,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -1090,8 +978,6 @@ cublasStatus_t cublasXsymv(const Layout layout, const cublasFillMode_t triangle,
const float beta,
float* y_buffer, const size_t y_offset, const size_t y_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSsymv(handle, triangle,
static_cast<int>(n),
&alpha,
@@ -1099,10 +985,10 @@ cublasStatus_t cublasXsymv(const Layout layout, const cublasFillMode_t triangle,
&x_buffer[x_offset], static_cast<int>(x_inc),
&beta,
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsymv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double alpha,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -1110,8 +996,6 @@ cublasStatus_t cublasXsymv(const Layout layout, const cublasFillMode_t triangle,
const double beta,
double* y_buffer, const size_t y_offset, const size_t y_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDsymv(handle, triangle,
static_cast<int>(n),
&alpha,
@@ -1119,10 +1003,10 @@ cublasStatus_t cublasXsymv(const Layout layout, const cublasFillMode_t triangle,
&x_buffer[x_offset], static_cast<int>(x_inc),
&beta,
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsymv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const half alpha,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -1133,7 +1017,7 @@ cublasStatus_t cublasXsymv(const Layout layout, const cublasFillMode_t triangle,
}
// Forwards the cuBLAS calls for SSBMV/DSBMV
-cublasStatus_t cublasXsbmv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n, const size_t k,
const float alpha,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -1141,8 +1025,6 @@ cublasStatus_t cublasXsbmv(const Layout layout, const cublasFillMode_t triangle,
const float beta,
float* y_buffer, const size_t y_offset, const size_t y_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSsbmv(handle, triangle,
static_cast<int>(n), static_cast<int>(k),
&alpha,
@@ -1150,10 +1032,10 @@ cublasStatus_t cublasXsbmv(const Layout layout, const cublasFillMode_t triangle,
&x_buffer[x_offset], static_cast<int>(x_inc),
&beta,
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsbmv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n, const size_t k,
const double alpha,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -1161,8 +1043,6 @@ cublasStatus_t cublasXsbmv(const Layout layout, const cublasFillMode_t triangle,
const double beta,
double* y_buffer, const size_t y_offset, const size_t y_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDsbmv(handle, triangle,
static_cast<int>(n), static_cast<int>(k),
&alpha,
@@ -1170,10 +1050,10 @@ cublasStatus_t cublasXsbmv(const Layout layout, const cublasFillMode_t triangle,
&x_buffer[x_offset], static_cast<int>(x_inc),
&beta,
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsbmv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n, const size_t k,
const half alpha,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -1184,7 +1064,7 @@ cublasStatus_t cublasXsbmv(const Layout layout, const cublasFillMode_t triangle,
}
// Forwards the cuBLAS calls for SSPMV/DSPMV
-cublasStatus_t cublasXspmv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float alpha,
const float* ap_buffer, const size_t ap_offset,
@@ -1192,8 +1072,6 @@ cublasStatus_t cublasXspmv(const Layout layout, const cublasFillMode_t triangle,
const float beta,
float* y_buffer, const size_t y_offset, const size_t y_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSspmv(handle, triangle,
static_cast<int>(n),
&alpha,
@@ -1201,10 +1079,10 @@ cublasStatus_t cublasXspmv(const Layout layout, const cublasFillMode_t triangle,
&x_buffer[x_offset], static_cast<int>(x_inc),
&beta,
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXspmv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double alpha,
const double* ap_buffer, const size_t ap_offset,
@@ -1212,8 +1090,6 @@ cublasStatus_t cublasXspmv(const Layout layout, const cublasFillMode_t triangle,
const double beta,
double* y_buffer, const size_t y_offset, const size_t y_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDspmv(handle, triangle,
static_cast<int>(n),
&alpha,
@@ -1221,10 +1097,10 @@ cublasStatus_t cublasXspmv(const Layout layout, const cublasFillMode_t triangle,
&x_buffer[x_offset], static_cast<int>(x_inc),
&beta,
&y_buffer[y_offset], static_cast<int>(y_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXspmv(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const half alpha,
const half* ap_buffer, const size_t ap_offset,
@@ -1236,72 +1112,64 @@ cublasStatus_t cublasXspmv(const Layout layout, const cublasFillMode_t triangle,
// Forwards the cuBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV
template <typename T>
-cublasStatus_t cublasXtrmv(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const T* a_buffer, const size_t a_offset, const size_t a_ld,
T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
-cublasStatus_t cublasXtrmv<float>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
float* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasStrmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
&a_buffer[a_offset], a_ld,
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtrmv<double>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
double* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDtrmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
&a_buffer[a_offset], a_ld,
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtrmv<float2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
float2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCtrmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtrmv<double2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
double2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZtrmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtrmv<half>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmv<half>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
half* x_buffer, const size_t x_offset, const size_t x_inc) {
@@ -1310,72 +1178,64 @@ cublasStatus_t cublasXtrmv<half>(const Layout layout, const cublasFillMode_t tri
// Forwards the cuBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV
template <typename T>
-cublasStatus_t cublasXtbmv(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const T* a_buffer, const size_t a_offset, const size_t a_ld,
T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
-cublasStatus_t cublasXtbmv<float>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbmv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
float* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasStbmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n), static_cast<int>(k),
&a_buffer[a_offset], a_ld,
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtbmv<double>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbmv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
double* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDtbmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n), static_cast<int>(k),
&a_buffer[a_offset], a_ld,
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtbmv<float2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbmv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
float2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCtbmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n), static_cast<int>(k),
reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtbmv<double2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbmv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
double2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZtbmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n), static_cast<int>(k),
reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtbmv<half>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbmv<half>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
half* x_buffer, const size_t x_offset, const size_t x_inc) {
@@ -1384,72 +1244,64 @@ cublasStatus_t cublasXtbmv<half>(const Layout layout, const cublasFillMode_t tri
// Forwards the cuBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV
template <typename T>
-cublasStatus_t cublasXtpmv(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const T* ap_buffer, const size_t ap_offset,
T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
-cublasStatus_t cublasXtpmv<float>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpmv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const float* ap_buffer, const size_t ap_offset,
float* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasStpmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
&ap_buffer[ap_offset],
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtpmv<double>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpmv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const double* ap_buffer, const size_t ap_offset,
double* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDtpmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
&ap_buffer[ap_offset],
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtpmv<float2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpmv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const float2* ap_buffer, const size_t ap_offset,
float2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCtpmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&ap_buffer[ap_offset]),
reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtpmv<double2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpmv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const double2* ap_buffer, const size_t ap_offset,
double2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZtpmv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&ap_buffer[ap_offset]),
reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtpmv<half>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpmv<half>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const half* ap_buffer, const size_t ap_offset,
half* x_buffer, const size_t x_offset, const size_t x_inc) {
@@ -1458,241 +1310,213 @@ cublasStatus_t cublasXtpmv<half>(const Layout layout, const cublasFillMode_t tri
// Forwards the cuBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
-cublasStatus_t cublasXtrsv(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const T* a_buffer, const size_t a_offset, const size_t a_ld,
T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
-cublasStatus_t cublasXtrsv<float>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrsv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
float* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasStrsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
&a_buffer[a_offset], a_ld,
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtrsv<double>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrsv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
double* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDtrsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
&a_buffer[a_offset], a_ld,
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtrsv<float2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrsv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
float2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCtrsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtrsv<double2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrsv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
double2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZtrsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for STBSV/DTBSV/CTBSV/ZTBSV
template <typename T>
-cublasStatus_t cublasXtbsv(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const T* a_buffer, const size_t a_offset, const size_t a_ld,
T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
-cublasStatus_t cublasXtbsv<float>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbsv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
float* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasStbsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n), static_cast<int>(k),
&a_buffer[a_offset], a_ld,
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtbsv<double>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbsv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
double* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDtbsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n), static_cast<int>(k),
&a_buffer[a_offset], a_ld,
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtbsv<float2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbsv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
float2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCtbsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n), static_cast<int>(k),
reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtbsv<double2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtbsv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n, const size_t k,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
double2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZtbsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n), static_cast<int>(k),
reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for STPSV/DTPSV/CTPSV/ZTPSV
template <typename T>
-cublasStatus_t cublasXtpsv(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const T* ap_buffer, const size_t ap_offset,
T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
-cublasStatus_t cublasXtpsv<float>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpsv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const float* ap_buffer, const size_t ap_offset,
float* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasStpsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
&ap_buffer[ap_offset],
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtpsv<double>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpsv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const double* ap_buffer, const size_t ap_offset,
double* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDtpsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
&ap_buffer[ap_offset],
&x_buffer[x_offset], static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtpsv<float2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpsv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const float2* ap_buffer, const size_t ap_offset,
float2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCtpsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&ap_buffer[ap_offset]),
reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
template <>
-cublasStatus_t cublasXtpsv<double2>(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtpsv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t n,
const double2* ap_buffer, const size_t ap_offset,
double2* x_buffer, const size_t x_offset, const size_t x_inc) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZtpsv(handle, triangle, a_transpose, diagonal,
static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&ap_buffer[ap_offset]),
reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for SGER/DGER
-cublasStatus_t cublasXger(const Layout layout,
+cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout,
const size_t m, const size_t n,
const float alpha,
const float* x_buffer, const size_t x_offset, const size_t x_inc,
const float* y_buffer, const size_t y_offset, const size_t y_inc,
float* a_buffer, const size_t a_offset, const size_t a_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSger(handle, static_cast<int>(m), static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&a_buffer[a_offset], a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXger(const Layout layout,
+cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout,
const size_t m, const size_t n,
const double alpha,
const double* x_buffer, const size_t x_offset, const size_t x_inc,
const double* y_buffer, const size_t y_offset, const size_t y_inc,
double* a_buffer, const size_t a_offset, const size_t a_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDger(handle, static_cast<int>(m), static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&a_buffer[a_offset], a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXger(const Layout layout,
+cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout,
const size_t m, const size_t n,
const half alpha,
const half* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -1702,7 +1526,7 @@ cublasStatus_t cublasXger(const Layout layout,
}
// Forwards the cuBLAS calls for CGERU/ZGERU
-cublasStatus_t cublasXgeru(const Layout layout,
+cublasStatus_t cublasXgeru(cublasHandle_t handle, const Layout layout,
const size_t m, const size_t n,
const float2 alpha,
const float2* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -1712,17 +1536,15 @@ cublasStatus_t cublasXgeru(const Layout layout,
cuComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCgeru(handle, static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgeru(const Layout layout,
+cublasStatus_t cublasXgeru(cublasHandle_t handle, const Layout layout,
const size_t m, const size_t n,
const double2 alpha,
const double2* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -1732,19 +1554,17 @@ cublasStatus_t cublasXgeru(const Layout layout,
cuDoubleComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZgeru(handle, static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for CGERC/ZGERC
-cublasStatus_t cublasXgerc(const Layout layout,
+cublasStatus_t cublasXgerc(cublasHandle_t handle, const Layout layout,
const size_t m, const size_t n,
const float2 alpha,
const float2* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -1754,17 +1574,15 @@ cublasStatus_t cublasXgerc(const Layout layout,
cuComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCgerc(handle, static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgerc(const Layout layout,
+cublasStatus_t cublasXgerc(cublasHandle_t handle, const Layout layout,
const size_t m, const size_t n,
const double2 alpha,
const double2* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -1774,87 +1592,77 @@ cublasStatus_t cublasXgerc(const Layout layout,
cuDoubleComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZgerc(handle, static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for CHER/ZHER
-cublasStatus_t cublasXher(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXher(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float alpha,
const float2* x_buffer, const size_t x_offset, const size_t x_inc,
float2* a_buffer, const size_t a_offset, const size_t a_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCher(handle, triangle,
static_cast<int>(n),
&alpha,
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXher(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXher(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double alpha,
const double2* x_buffer, const size_t x_offset, const size_t x_inc,
double2* a_buffer, const size_t a_offset, const size_t a_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZher(handle, triangle,
static_cast<int>(n),
&alpha,
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for CHPR/ZHPR
-cublasStatus_t cublasXhpr(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhpr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float alpha,
const float2* x_buffer, const size_t x_offset, const size_t x_inc,
float2* ap_buffer, const size_t ap_offset) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasChpr(handle, triangle,
static_cast<int>(n),
&alpha,
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<cuComplex*>(&ap_buffer[ap_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXhpr(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhpr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double alpha,
const double2* x_buffer, const size_t x_offset, const size_t x_inc,
double2* ap_buffer, const size_t ap_offset) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZhpr(handle, triangle,
static_cast<int>(n),
&alpha,
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<cuDoubleComplex*>(&ap_buffer[ap_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for CHER2/ZHER2
-cublasStatus_t cublasXher2(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXher2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float2 alpha,
const float2* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -1864,18 +1672,16 @@ cublasStatus_t cublasXher2(const Layout layout, const cublasFillMode_t triangle,
cuComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCher2(handle, triangle,
static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXher2(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXher2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double2 alpha,
const double2* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -1885,20 +1691,18 @@ cublasStatus_t cublasXher2(const Layout layout, const cublasFillMode_t triangle,
cuDoubleComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZher2(handle, triangle,
static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for CHPR2/ZHPR2
-cublasStatus_t cublasXhpr2(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhpr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float2 alpha,
const float2* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -1908,18 +1712,16 @@ cublasStatus_t cublasXhpr2(const Layout layout, const cublasFillMode_t triangle,
cuComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasChpr2(handle, triangle,
static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuComplex*>(&ap_buffer[ap_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXhpr2(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhpr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double2 alpha,
const double2* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -1929,52 +1731,46 @@ cublasStatus_t cublasXhpr2(const Layout layout, const cublasFillMode_t triangle,
cuDoubleComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZhpr2(handle, triangle,
static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
reinterpret_cast<cuDoubleComplex*>(&ap_buffer[ap_offset]));
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for SSYR/DSYR
-cublasStatus_t cublasXsyr(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float alpha,
const float* x_buffer, const size_t x_offset, const size_t x_inc,
float* a_buffer, const size_t a_offset, const size_t a_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSsyr(handle, triangle,
static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&a_buffer[a_offset], a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsyr(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double alpha,
const double* x_buffer, const size_t x_offset, const size_t x_inc,
double* a_buffer, const size_t a_offset, const size_t a_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDsyr(handle, triangle,
static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&a_buffer[a_offset], a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsyr(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const half alpha,
const half* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -1983,39 +1779,35 @@ cublasStatus_t cublasXsyr(const Layout layout, const cublasFillMode_t triangle,
}
// Forwards the cuBLAS calls for SSPR/DSPR
-cublasStatus_t cublasXspr(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float alpha,
const float* x_buffer, const size_t x_offset, const size_t x_inc,
float* ap_buffer, const size_t ap_offset) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSspr(handle, triangle,
static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&ap_buffer[ap_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXspr(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double alpha,
const double* x_buffer, const size_t x_offset, const size_t x_inc,
double* ap_buffer, const size_t ap_offset) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDspr(handle, triangle,
static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&ap_buffer[ap_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXspr(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const half alpha,
const half* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -2024,43 +1816,39 @@ cublasStatus_t cublasXspr(const Layout layout, const cublasFillMode_t triangle,
}
// Forwards the cuBLAS calls for SSYR2/DSYR2
-cublasStatus_t cublasXsyr2(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float alpha,
const float* x_buffer, const size_t x_offset, const size_t x_inc,
const float* y_buffer, const size_t y_offset, const size_t y_inc,
float* a_buffer, const size_t a_offset, const size_t a_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSsyr2(handle, triangle,
static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&a_buffer[a_offset], a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsyr2(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double alpha,
const double* x_buffer, const size_t x_offset, const size_t x_inc,
const double* y_buffer, const size_t y_offset, const size_t y_inc,
double* a_buffer, const size_t a_offset, const size_t a_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDsyr2(handle, triangle,
static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&a_buffer[a_offset], a_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsyr2(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const half alpha,
const half* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -2070,43 +1858,39 @@ cublasStatus_t cublasXsyr2(const Layout layout, const cublasFillMode_t triangle,
}
// Forwards the cuBLAS calls for SSPR2/DSPR2
-cublasStatus_t cublasXspr2(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const float alpha,
const float* x_buffer, const size_t x_offset, const size_t x_inc,
const float* y_buffer, const size_t y_offset, const size_t y_inc,
float* ap_buffer, const size_t ap_offset) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSspr2(handle, triangle,
static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&ap_buffer[ap_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXspr2(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const double alpha,
const double* x_buffer, const size_t x_offset, const size_t x_inc,
const double* y_buffer, const size_t y_offset, const size_t y_inc,
double* ap_buffer, const size_t ap_offset) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDspr2(handle, triangle,
static_cast<int>(n),
&alpha,
&x_buffer[x_offset], static_cast<int>(x_inc),
&y_buffer[y_offset], static_cast<int>(y_inc),
&ap_buffer[ap_offset]);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXspr2(const Layout layout, const cublasFillMode_t triangle,
+cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle,
const size_t n,
const half alpha,
const half* x_buffer, const size_t x_offset, const size_t x_inc,
@@ -2120,7 +1904,7 @@ cublasStatus_t cublasXspr2(const Layout layout, const cublasFillMode_t triangle,
// =================================================================================================
// Forwards the cuBLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM
-cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose,
+cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose,
const size_t m, const size_t n, const size_t k,
const float alpha,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2128,8 +1912,6 @@ cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transp
const float beta,
float* c_buffer, const size_t c_offset, const size_t c_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSgemm(handle, a_transpose, b_transpose,
static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
&alpha,
@@ -2137,10 +1919,10 @@ cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transp
&b_buffer[b_offset], b_ld,
&beta,
&c_buffer[c_offset], c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose,
+cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose,
const size_t m, const size_t n, const size_t k,
const double alpha,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2148,8 +1930,6 @@ cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transp
const double beta,
double* c_buffer, const size_t c_offset, const size_t c_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDgemm(handle, a_transpose, b_transpose,
static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
&alpha,
@@ -2157,10 +1937,10 @@ cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transp
&b_buffer[b_offset], b_ld,
&beta,
&c_buffer[c_offset], c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose,
+cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose,
const size_t m, const size_t n, const size_t k,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2174,8 +1954,6 @@ cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transp
cuComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCgemm(handle, a_transpose, b_transpose,
static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
&alpha_cuda,
@@ -2183,10 +1961,10 @@ cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transp
reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld,
&beta_cuda,
reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose,
+cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose,
const size_t m, const size_t n, const size_t k,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2200,8 +1978,6 @@ cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transp
cuDoubleComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZgemm(handle, a_transpose, b_transpose,
static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
&alpha_cuda,
@@ -2209,10 +1985,10 @@ cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transp
reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld,
&beta_cuda,
reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose,
+cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose,
const size_t m, const size_t n, const size_t k,
const half alpha,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2223,7 +1999,7 @@ cublasStatus_t cublasXgemm(const Layout layout, const cublasOperation_t a_transp
}
// Forwards the cuBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM
-cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
const size_t m, const size_t n,
const float alpha,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2231,8 +2007,6 @@ cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, con
const float beta,
float* c_buffer, const size_t c_offset, const size_t c_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSsymm(handle, side, triangle,
static_cast<int>(m), static_cast<int>(n),
&alpha,
@@ -2240,10 +2014,10 @@ cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, con
&b_buffer[b_offset], b_ld,
&beta,
&c_buffer[c_offset], c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
const size_t m, const size_t n,
const double alpha,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2251,8 +2025,6 @@ cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, con
const double beta,
double* c_buffer, const size_t c_offset, const size_t c_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDsymm(handle, side, triangle,
static_cast<int>(m), static_cast<int>(n),
&alpha,
@@ -2260,10 +2032,10 @@ cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, con
&b_buffer[b_offset], b_ld,
&beta,
&c_buffer[c_offset], c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
const size_t m, const size_t n,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2277,8 +2049,6 @@ cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, con
cuComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCsymm(handle, side, triangle,
static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
@@ -2286,10 +2056,10 @@ cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, con
reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld,
&beta_cuda,
reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
const size_t m, const size_t n,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2303,8 +2073,6 @@ cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, con
cuDoubleComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZsymm(handle, side, triangle,
static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
@@ -2312,10 +2080,10 @@ cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, con
reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld,
&beta_cuda,
reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
+cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
const size_t m, const size_t n,
const half alpha,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2326,7 +2094,7 @@ cublasStatus_t cublasXsymm(const Layout layout, const cublasSideMode_t side, con
}
// Forwards the cuBLAS calls for CHEMM/ZHEMM
-cublasStatus_t cublasXhemm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhemm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
const size_t m, const size_t n,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2340,8 +2108,6 @@ cublasStatus_t cublasXhemm(const Layout layout, const cublasSideMode_t side, con
cuComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasChemm(handle, side, triangle,
static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
@@ -2349,10 +2115,10 @@ cublasStatus_t cublasXhemm(const Layout layout, const cublasSideMode_t side, con
reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld,
&beta_cuda,
reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXhemm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
+cublasStatus_t cublasXhemm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle,
const size_t m, const size_t n,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2366,8 +2132,6 @@ cublasStatus_t cublasXhemm(const Layout layout, const cublasSideMode_t side, con
cuDoubleComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZhemm(handle, side, triangle,
static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
@@ -2375,48 +2139,44 @@ cublasStatus_t cublasXhemm(const Layout layout, const cublasSideMode_t side, con
reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld,
&beta_cuda,
reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK
-cublasStatus_t cublasXsyrk(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
const size_t n, const size_t k,
const float alpha,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
const float beta,
float* c_buffer, const size_t c_offset, const size_t c_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSsyrk(handle, triangle, a_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha,
&a_buffer[a_offset], a_ld,
&beta,
&c_buffer[c_offset], c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsyrk(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
const size_t n, const size_t k,
const double alpha,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
const double beta,
double* c_buffer, const size_t c_offset, const size_t c_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDsyrk(handle, triangle, a_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha,
&a_buffer[a_offset], a_ld,
&beta,
&c_buffer[c_offset], c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsyrk(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
const size_t n, const size_t k,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2429,18 +2189,16 @@ cublasStatus_t cublasXsyrk(const Layout layout, const cublasFillMode_t triangle,
cuComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCsyrk(handle, triangle, a_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha_cuda,
reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
&beta_cuda,
reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsyrk(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
const size_t n, const size_t k,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2453,18 +2211,16 @@ cublasStatus_t cublasXsyrk(const Layout layout, const cublasFillMode_t triangle,
cuDoubleComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZsyrk(handle, triangle, a_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha_cuda,
reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
&beta_cuda,
reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXsyrk(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
const size_t n, const size_t k,
const half alpha,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2474,45 +2230,41 @@ cublasStatus_t cublasXsyrk(const Layout layout, const cublasFillMode_t triangle,
}
// Forwards the cuBLAS calls for CHERK/ZHERK
-cublasStatus_t cublasXherk(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXherk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
const size_t n, const size_t k,
const float alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
const float beta,
float2* c_buffer, const size_t c_offset, const size_t c_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCherk(handle, triangle, a_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha,
reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
&beta,
reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXherk(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
+cublasStatus_t cublasXherk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose,
const size_t n, const size_t k,
const double alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
const double beta,
double2* c_buffer, const size_t c_offset, const size_t c_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZherk(handle, triangle, a_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha,
reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
&beta,
reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K
-cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
+cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
const size_t n, const size_t k,
const float alpha,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2520,8 +2272,6 @@ cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle
const float beta,
float* c_buffer, const size_t c_offset, const size_t c_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasSsyr2k(handle, triangle, ab_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha,
@@ -2529,10 +2279,10 @@ cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle
&b_buffer[b_offset], b_ld,
&beta,
&c_buffer[c_offset], c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// DSYR2K (double) overload. The handle is now a caller-supplied parameter (see the removed
// cublasCreate/cublasDestroy lines). Row-major layout returns CUBLAS_STATUS_NOT_SUPPORTED;
// otherwise the call is forwarded to cublasDsyr2k with n and k narrowed to int, followed by an
// unchecked cudaDeviceSynchronize(), and the cuBLAS status is returned.
// NOTE(review): lines elided at the '@@' hunk markers are not visible here -- confirm the
// b_buffer parameters and the a_buffer argument against the full file.
-cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
+cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
const size_t n, const size_t k,
const double alpha,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2540,8 +2290,6 @@ cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle
const double beta,
double* c_buffer, const size_t c_offset, const size_t c_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDsyr2k(handle, triangle, ab_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha,
@@ -2549,10 +2297,10 @@ cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle
&b_buffer[b_offset], b_ld,
&beta,
&c_buffer[c_offset], c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// CSYR2K (float2) overload. The visible lines copy beta's real/imaginary parts into a cuComplex,
// forward to cublasCsyr2k using the caller-supplied handle with the b and c buffers reinterpreted
// as cuComplex, call cudaDeviceSynchronize() without checking its result, and return the cuBLAS
// status.
// NOTE(review): the rest of the parameter list, the alpha_cuda setup and any layout check sit in
// lines elided at the '@@' hunk markers -- confirm against the full file.
-cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
+cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
const size_t n, const size_t k,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2566,8 +2314,6 @@ cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle
cuComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCsyr2k(handle, triangle, ab_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha_cuda,
@@ -2575,10 +2321,10 @@ cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle
reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld,
&beta_cuda,
reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// ZSYR2K (double2) overload. The visible lines copy beta's real/imaginary parts into a
// cuDoubleComplex, forward to cublasZsyr2k using the caller-supplied handle with the b and c
// buffers reinterpreted as cuDoubleComplex, call cudaDeviceSynchronize() without checking its
// result, and return the cuBLAS status.
// NOTE(review): the rest of the parameter list, the alpha_cuda setup and any layout check sit in
// lines elided at the '@@' hunk markers -- confirm against the full file.
-cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
+cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
const size_t n, const size_t k,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2592,8 +2338,6 @@ cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle
cuDoubleComplex beta_cuda;
beta_cuda.x = beta.real();
beta_cuda.y = beta.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZsyr2k(handle, triangle, ab_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha_cuda,
@@ -2601,10 +2345,10 @@ cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle
reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld,
&beta_cuda,
reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Half-precision overload of cublasXsyr2k. Only the signature change (a new leading
// cublasHandle_t parameter) is visible; the remaining parameters and the body are elided at the
// '@@' hunk marker below. NOTE(review): confirm the body against the full file.
-cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
+cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
const size_t n, const size_t k,
const half alpha,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2615,7 +2359,7 @@ cublasStatus_t cublasXsyr2k(const Layout layout, const cublasFillMode_t triangle
}
// Forwards the cuBLAS calls for CHER2K/ZHER2K
// CHER2K (float2) overload. The visible lines copy alpha's real/imaginary parts into a cuComplex,
// forward to cublasCher2k using the caller-supplied handle with the b and c buffers reinterpreted
// as cuComplex, call cudaDeviceSynchronize() without checking its result, and return the cuBLAS
// status. beta is passed by address without conversion.
// NOTE(review): beta's declaration, the b/c buffer parameters and any layout check sit in lines
// elided at the '@@' hunk markers -- confirm against the full file.
-cublasStatus_t cublasXher2k(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
+cublasStatus_t cublasXher2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
const size_t n, const size_t k,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2626,8 +2370,6 @@ cublasStatus_t cublasXher2k(const Layout layout, const cublasFillMode_t triangle
cuComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCher2k(handle, triangle, ab_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha_cuda,
@@ -2635,10 +2377,10 @@ cublasStatus_t cublasXher2k(const Layout layout, const cublasFillMode_t triangle
reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld,
&beta,
reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// ZHER2K (double2) overload. The visible lines copy alpha's real/imaginary parts into a
// cuDoubleComplex, forward to cublasZher2k using the caller-supplied handle with the b and c
// buffers reinterpreted as cuDoubleComplex, call cudaDeviceSynchronize() without checking its
// result, and return the cuBLAS status. beta is passed by address without conversion.
// NOTE(review): beta's declaration, the b/c buffer parameters and any layout check sit in lines
// elided at the '@@' hunk markers -- confirm against the full file.
-cublasStatus_t cublasXher2k(const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
+cublasStatus_t cublasXher2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose,
const size_t n, const size_t k,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2649,8 +2391,6 @@ cublasStatus_t cublasXher2k(const Layout layout, const cublasFillMode_t triangle
cuDoubleComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZher2k(handle, triangle, ab_transpose,
static_cast<int>(n), static_cast<int>(k),
&alpha_cuda,
@@ -2658,46 +2398,42 @@ cublasStatus_t cublasXher2k(const Layout layout, const cublasFillMode_t triangle
reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld,
&beta,
reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Forwards the cuBLAS calls for STRMM/DTRMM/CTRMM/ZTRMM
// STRMM (float) overload. The cuBLAS handle is supplied by the caller (see the removed
// cublasCreate/cublasDestroy lines). Row-major layout returns CUBLAS_STATUS_NOT_SUPPORTED;
// otherwise the call is forwarded to cublasStrmm with m and n narrowed to int, followed by an
// unchecked cudaDeviceSynchronize(), and the cuBLAS status is returned.
// cuBLAS trmm is out-of-place (C = alpha * op(A) * B). To get the in-place BLAS behaviour the
// b_buffer is passed as both the B input and the C output; the A operand is passed only once.
-cublasStatus_t cublasXtrmm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t m, const size_t n,
const float alpha,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
float* b_buffer, const size_t b_offset, const size_t b_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasStrmm(handle, side, triangle, a_transpose, diagonal,
static_cast<int>(m), static_cast<int>(n),
&alpha,
&a_buffer[a_offset], a_ld,
&b_buffer[b_offset], b_ld,
&b_buffer[b_offset], b_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// DTRMM (double) overload. The cuBLAS handle is supplied by the caller (see the removed
// cublasCreate/cublasDestroy lines). Row-major layout returns CUBLAS_STATUS_NOT_SUPPORTED;
// otherwise the call is forwarded to cublasDtrmm with m and n narrowed to int, followed by an
// unchecked cudaDeviceSynchronize(), and the cuBLAS status is returned.
// cuBLAS trmm is out-of-place (C = alpha * op(A) * B). To get the in-place BLAS behaviour the
// b_buffer is passed as both the B input and the C output; the A operand is passed only once.
-cublasStatus_t cublasXtrmm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t m, const size_t n,
const double alpha,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
double* b_buffer, const size_t b_offset, const size_t b_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDtrmm(handle, side, triangle, a_transpose, diagonal,
static_cast<int>(m), static_cast<int>(n),
&alpha,
&a_buffer[a_offset], a_ld,
&b_buffer[b_offset], b_ld,
&b_buffer[b_offset], b_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// CTRMM (float2) overload. The visible lines copy alpha's real/imaginary parts into a cuComplex,
// forward to cublasCtrmm using the caller-supplied handle with the buffers reinterpreted as
// cuComplex, call cudaDeviceSynchronize() without checking its result, and return the cuBLAS
// status.
// cuBLAS trmm is out-of-place (C = alpha * op(A) * B). To get the in-place BLAS behaviour the
// b_buffer is passed as both the B input and the C output; the A operand is passed only once.
// NOTE(review): the b_buffer parameters and any layout check sit in lines elided at the '@@'
// hunk marker -- confirm against the full file.
-cublasStatus_t cublasXtrmm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t m, const size_t n,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2706,18 +2442,16 @@ cublasStatus_t cublasXtrmm(const Layout layout, const cublasSideMode_t side, con
cuComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCtrmm(handle, side, triangle, a_transpose, diagonal,
static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld,
reinterpret_cast<cuComplex*>(&b_buffer[b_offset]), b_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
-cublasStatus_t cublasXtrmm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t m, const size_t n,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2726,18 +2460,16 @@ cublasStatus_t cublasXtrmm(const Layout layout, const cublasSideMode_t side, con
cuDoubleComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZtrmm(handle, side, triangle, a_transpose, diagonal,
static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuDoubleComplex*>(&b_buffer[b_offset]), b_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// Half-precision overload of cublasXtrmm. Only the signature change (a new leading
// cublasHandle_t parameter) is visible; the remaining parameters and the body are elided at the
// '@@' hunk marker below. NOTE(review): confirm the body against the full file.
-cublasStatus_t cublasXtrmm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t m, const size_t n,
const half alpha,
const half* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2746,39 +2478,35 @@ cublasStatus_t cublasXtrmm(const Layout layout, const cublasSideMode_t side, con
}
// Forwards the cuBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM
// STRSM (float) overload. The cuBLAS handle is supplied by the caller (see the removed
// cublasCreate/cublasDestroy lines). Row-major layout returns CUBLAS_STATUS_NOT_SUPPORTED;
// otherwise the call is forwarded to cublasStrsm with m and n narrowed to int and the a/b
// pointers advanced by their offsets. cudaDeviceSynchronize() is then called (its result is not
// checked) and the cuBLAS status is returned.
-cublasStatus_t cublasXtrsm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t m, const size_t n,
const float alpha,
const float* a_buffer, const size_t a_offset, const size_t a_ld,
float* b_buffer, const size_t b_offset, const size_t b_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasStrsm(handle, side, triangle, a_transpose, diagonal,
static_cast<int>(m), static_cast<int>(n),
&alpha,
&a_buffer[a_offset], a_ld,
&b_buffer[b_offset], b_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// DTRSM (double) overload. The cuBLAS handle is supplied by the caller (see the removed
// cublasCreate/cublasDestroy lines). Row-major layout returns CUBLAS_STATUS_NOT_SUPPORTED;
// otherwise the call is forwarded to cublasDtrsm with m and n narrowed to int and the a/b
// pointers advanced by their offsets. cudaDeviceSynchronize() is then called (its result is not
// checked) and the cuBLAS status is returned.
-cublasStatus_t cublasXtrsm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t m, const size_t n,
const double alpha,
const double* a_buffer, const size_t a_offset, const size_t a_ld,
double* b_buffer, const size_t b_offset, const size_t b_ld) {
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasDtrsm(handle, side, triangle, a_transpose, diagonal,
static_cast<int>(m), static_cast<int>(n),
&alpha,
&a_buffer[a_offset], a_ld,
&b_buffer[b_offset], b_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// CTRSM (float2) overload. The visible lines copy alpha's real/imaginary parts into a cuComplex,
// forward to cublasCtrsm using the caller-supplied handle with the a and b buffers reinterpreted
// as cuComplex, call cudaDeviceSynchronize() without checking its result, and return the cuBLAS
// status.
// NOTE(review): the b_buffer parameters and any layout check sit in lines elided at the '@@'
// hunk marker -- confirm against the full file.
-cublasStatus_t cublasXtrsm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t m, const size_t n,
const float2 alpha,
const float2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2787,17 +2515,15 @@ cublasStatus_t cublasXtrsm(const Layout layout, const cublasSideMode_t side, con
cuComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasCtrsm(handle, side, triangle, a_transpose, diagonal,
static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuComplex*>(&b_buffer[b_offset]), b_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}
// ZTRSM (double2) overload. The visible lines copy alpha's real/imaginary parts into a
// cuDoubleComplex, forward to cublasZtrsm using the caller-supplied handle with the a and b
// buffers reinterpreted as cuDoubleComplex, call cudaDeviceSynchronize() without checking its
// result, and return the cuBLAS status.
// NOTE(review): the b_buffer parameters and any layout check sit in lines elided at the '@@'
// hunk marker -- confirm against the full file.
-cublasStatus_t cublasXtrsm(const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
const size_t m, const size_t n,
const double2 alpha,
const double2* a_buffer, const size_t a_offset, const size_t a_ld,
@@ -2806,14 +2532,12 @@ cublasStatus_t cublasXtrsm(const Layout layout, const cublasSideMode_t side, con
cuDoubleComplex alpha_cuda;
alpha_cuda.x = alpha.real();
alpha_cuda.y = alpha.imag();
- cublasHandle_t handle;
- if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { return CUBLAS_STATUS_NOT_INITIALIZED; }
auto status = cublasZtrsm(handle, side, triangle, a_transpose, diagonal,
static_cast<int>(m), static_cast<int>(n),
&alpha_cuda,
reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<cuDoubleComplex*>(&b_buffer[b_offset]), b_ld);
- cublasDestroy(handle);
+ cudaDeviceSynchronize();
return status;
}