summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG3
-rw-r--r--CMakeLists.txt41
-rw-r--r--CONTRIBUTING.md20
-rw-r--r--LICENSE222
-rw-r--r--README.md12
-rw-r--r--clblast.pc.in10
-rw-r--r--doc/clblast.md1910
-rw-r--r--include/clblast.h29
-rw-r--r--include/clblast_c.h2292
-rw-r--r--samples/cache.c10
-rw-r--r--samples/dgemv.c18
-rw-r--r--samples/haxpy.c10
-rw-r--r--samples/sasum.c10
-rw-r--r--samples/sgemm.c19
-rwxr-xr-xscripts/generator/generator.py37
-rw-r--r--scripts/generator/generator/cpp.py29
-rw-r--r--scripts/generator/generator/doc.py2
-rw-r--r--scripts/generator/generator/routine.py22
-rw-r--r--src/buffer_test.hpp121
-rw-r--r--src/cache.cpp7
-rw-r--r--src/cache.hpp4
-rw-r--r--src/clblast.cpp955
-rw-r--r--src/clblast_c.cpp6007
-rw-r--r--src/clpp11.hpp134
-rw-r--r--src/cxpp11_common.hpp109
-rw-r--r--src/database/database.cpp4
-rw-r--r--src/database/database.hpp2
-rw-r--r--src/database/kernels/copy.hpp12
-rw-r--r--src/database/kernels/pad.hpp16
-rw-r--r--src/database/kernels/padtranspose.hpp8
-rw-r--r--src/database/kernels/transpose.hpp14
-rw-r--r--src/database/kernels/xaxpy.hpp16
-rw-r--r--src/database/kernels/xdot.hpp14
-rw-r--r--src/database/kernels/xgemm.hpp8
-rw-r--r--src/database/kernels/xgemm_direct.hpp32
-rw-r--r--src/database/kernels/xgemv.hpp7
-rw-r--r--src/database/kernels/xgemv_fast.hpp6
-rw-r--r--src/database/kernels/xgemv_fast_rot.hpp12
-rw-r--r--src/database/kernels/xger.hpp18
-rw-r--r--src/kernels/level3/xgemm_part3.opencl4
-rw-r--r--src/routine.cpp85
-rw-r--r--src/routine.hpp14
-rw-r--r--src/routines/common.cpp27
-rw-r--r--src/routines/common.hpp162
-rw-r--r--src/routines/level1/xamax.cpp100
-rw-r--r--src/routines/level1/xamax.hpp6
-rw-r--r--src/routines/level1/xasum.cpp94
-rw-r--r--src/routines/level1/xasum.hpp6
-rw-r--r--src/routines/level1/xaxpy.cpp91
-rw-r--r--src/routines/level1/xaxpy.hpp6
-rw-r--r--src/routines/level1/xcopy.cpp87
-rw-r--r--src/routines/level1/xcopy.hpp6
-rw-r--r--src/routines/level1/xdot.cpp109
-rw-r--r--src/routines/level1/xdot.hpp10
-rw-r--r--src/routines/level1/xdotc.cpp16
-rw-r--r--src/routines/level1/xdotc.hpp8
-rw-r--r--src/routines/level1/xdotu.cpp16
-rw-r--r--src/routines/level1/xdotu.hpp8
-rw-r--r--src/routines/level1/xmax.hpp8
-rw-r--r--src/routines/level1/xmin.hpp8
-rw-r--r--src/routines/level1/xnrm2.cpp94
-rw-r--r--src/routines/level1/xnrm2.hpp6
-rw-r--r--src/routines/level1/xscal.cpp78
-rw-r--r--src/routines/level1/xscal.hpp4
-rw-r--r--src/routines/level1/xsum.hpp8
-rw-r--r--src/routines/level1/xswap.cpp87
-rw-r--r--src/routines/level1/xswap.hpp6
-rw-r--r--src/routines/level2/xgbmv.cpp28
-rw-r--r--src/routines/level2/xgbmv.hpp14
-rw-r--r--src/routines/level2/xgemv.cpp127
-rw-r--r--src/routines/level2/xgemv.hpp34
-rw-r--r--src/routines/level2/xger.cpp82
-rw-r--r--src/routines/level2/xger.hpp12
-rw-r--r--src/routines/level2/xgerc.cpp20
-rw-r--r--src/routines/level2/xgerc.hpp12
-rw-r--r--src/routines/level2/xgeru.cpp20
-rw-r--r--src/routines/level2/xgeru.hpp12
-rw-r--r--src/routines/level2/xhbmv.cpp28
-rw-r--r--src/routines/level2/xhbmv.hpp14
-rw-r--r--src/routines/level2/xhemv.cpp28
-rw-r--r--src/routines/level2/xhemv.hpp14
-rw-r--r--src/routines/level2/xher.cpp78
-rw-r--r--src/routines/level2/xher.hpp12
-rw-r--r--src/routines/level2/xher2.cpp87
-rw-r--r--src/routines/level2/xher2.hpp14
-rw-r--r--src/routines/level2/xhpmv.cpp28
-rw-r--r--src/routines/level2/xhpmv.hpp14
-rw-r--r--src/routines/level2/xhpr.cpp18
-rw-r--r--src/routines/level2/xhpr.hpp10
-rw-r--r--src/routines/level2/xhpr2.cpp22
-rw-r--r--src/routines/level2/xhpr2.hpp12
-rw-r--r--src/routines/level2/xsbmv.cpp28
-rw-r--r--src/routines/level2/xsbmv.hpp14
-rw-r--r--src/routines/level2/xspmv.cpp28
-rw-r--r--src/routines/level2/xspmv.hpp14
-rw-r--r--src/routines/level2/xspr.cpp18
-rw-r--r--src/routines/level2/xspr.hpp10
-rw-r--r--src/routines/level2/xspr2.cpp22
-rw-r--r--src/routines/level2/xspr2.hpp12
-rw-r--r--src/routines/level2/xsymv.cpp28
-rw-r--r--src/routines/level2/xsymv.hpp14
-rw-r--r--src/routines/level2/xsyr.cpp16
-rw-r--r--src/routines/level2/xsyr.hpp10
-rw-r--r--src/routines/level2/xsyr2.cpp20
-rw-r--r--src/routines/level2/xsyr2.hpp12
-rw-r--r--src/routines/level2/xtbmv.cpp44
-rw-r--r--src/routines/level2/xtbmv.hpp10
-rw-r--r--src/routines/level2/xtpmv.cpp44
-rw-r--r--src/routines/level2/xtpmv.hpp10
-rw-r--r--src/routines/level2/xtrmv.cpp44
-rw-r--r--src/routines/level2/xtrmv.hpp10
-rw-r--r--src/routines/level3/xgemm.cpp308
-rw-r--r--src/routines/level3/xgemm.hpp48
-rw-r--r--src/routines/level3/xhemm.cpp132
-rw-r--r--src/routines/level3/xhemm.hpp14
-rw-r--r--src/routines/level3/xher2k.cpp293
-rw-r--r--src/routines/level3/xher2k.hpp14
-rw-r--r--src/routines/level3/xherk.cpp203
-rw-r--r--src/routines/level3/xherk.hpp12
-rw-r--r--src/routines/level3/xsymm.cpp132
-rw-r--r--src/routines/level3/xsymm.hpp14
-rw-r--r--src/routines/level3/xsyr2k.cpp221
-rw-r--r--src/routines/level3/xsyr2k.hpp14
-rw-r--r--src/routines/level3/xsyrk.cpp171
-rw-r--r--src/routines/level3/xsyrk.hpp12
-rw-r--r--src/routines/level3/xtrmm.cpp134
-rw-r--r--src/routines/level3/xtrmm.hpp12
-rw-r--r--src/routines/levelx/xomatcopy.cpp32
-rw-r--r--src/routines/levelx/xomatcopy.hpp8
-rw-r--r--src/tuning/kernels/copy_fast.cpp2
-rw-r--r--src/tuning/kernels/copy_pad.cpp2
-rw-r--r--src/tuning/kernels/transpose_fast.cpp2
-rw-r--r--src/tuning/kernels/transpose_pad.cpp2
-rw-r--r--src/tuning/kernels/xaxpy.cpp2
-rw-r--r--src/tuning/kernels/xdot.cpp2
-rw-r--r--src/tuning/kernels/xgemm.cpp2
-rw-r--r--src/tuning/kernels/xgemm_direct.cpp2
-rw-r--r--src/tuning/kernels/xgemv.cpp2
-rw-r--r--src/tuning/kernels/xger.cpp2
-rw-r--r--src/tuning/tuning.hpp2
-rw-r--r--src/utilities/buffer_test.hpp113
-rw-r--r--src/utilities/clblast_exceptions.cpp95
-rw-r--r--src/utilities/clblast_exceptions.hpp50
-rw-r--r--src/utilities/msvc.hpp (renamed from src/msvc.hpp)0
-rw-r--r--src/utilities/utilities.cpp (renamed from src/utilities.cpp)2
-rw-r--r--src/utilities/utilities.hpp (renamed from src/utilities.hpp)9
-rw-r--r--test/correctness/testblas.cpp22
-rw-r--r--test/correctness/testblas.hpp32
-rw-r--r--test/correctness/tester.cpp30
-rw-r--r--test/correctness/tester.hpp30
-rw-r--r--test/performance/client.hpp2
-rw-r--r--test/wrapper_cblas.hpp2
-rw-r--r--test/wrapper_clblas.hpp2
153 files changed, 8873 insertions, 7816 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 2affaadd..48305f03 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,8 +1,11 @@
Development version (next release)
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
+- Changed the enums in the C API to avoid potential name clashes with external code
+- Greatly improved the way exceptions are handled in the library (thanks to 'intelfx')
- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
- Fixed a bug in the tests and samples related to waiting for an invalid event
+- Fixed a bug in the SYRK/SYR2K/HERK/HER2K routines that would occur with specific tuning parameters
- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
- Added an option to run tuned kernels multiple times to average execution times
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf2a36dd..f5edbd75 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,9 +69,7 @@ endif()
if(MSVC)
if(BUILD_SHARED_LIBS)
add_definitions(" /DCLBLAST_DLL")
- else(BUILD_SHARED_LIBS)
- add_definitions(" /DCLBLAST_STATIC")
- endif(BUILD_SHARED_LIBS)
+ endif()
endif(MSVC)
# C++ compiler settings
@@ -167,11 +165,12 @@ set(PRECISIONS 32 64 3232 6464 16)
set(SOURCES
src/database/database.cpp
src/routines/common.cpp
+ src/utilities/clblast_exceptions.cpp
+ src/utilities/utilities.cpp
src/cache.cpp
src/clblast.cpp
src/clblast_c.cpp
src/routine.cpp
- src/utilities.cpp
)
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
@@ -191,7 +190,7 @@ if(BUILD_SHARED_LIBS)
add_library(clblast SHARED ${SOURCES})
else(BUILD_SHARED_LIBS)
add_library(clblast STATIC ${SOURCES})
-endif(BUILD_SHARED_LIBS)
+endif()
target_link_libraries(clblast ${OPENCL_LIBRARIES})
@@ -206,7 +205,7 @@ target_include_directories(clblast PUBLIC
if(MSVC)
if(BUILD_SHARED_LIBS)
target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11
- endif(BUILD_SHARED_LIBS)
+ endif()
endif()
# Installs the library
@@ -218,9 +217,17 @@ install(FILES include/clblast_half.h DESTINATION include)
# Installs the config for find_package in dependent projects
install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake)
+# Install pkg-config file on Linux
+if(UNIX)
+ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/clblast.pc.in"
+ "${CMAKE_CURRENT_BINARY_DIR}/clblast.pc" @ONLY IMMEDIATE)
+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/clblast.pc
+ DESTINATION lib/pkgconfig)
+endif()
+
# ==================================================================================================
-# Sets a default platform ($DEVICEPLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests on
+# Sets a default platform ($CLBLAST_PLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests
set(DEVICEPLATFORM )
if(DEFINED ENV{CLBLAST_DEVICE})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{CLBLAST_DEVICE})
@@ -229,6 +236,12 @@ if(DEFINED ENV{CLBLAST_PLATFORM})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{CLBLAST_PLATFORM})
endif()
+# Optionally also provides other options to the tests such as -full_test ($CLBLAST_TEST_ARGUMENTS)
+set(TEST_ARGUMENTS )
+if(DEFINED ENV{CLBLAST_TEST_ARGUMENTS})
+ set(TEST_ARGUMENTS $ENV{CLBLAST_TEST_ARGUMENTS})
+endif()
+
# ==================================================================================================
# This section contains all the code related to the examples
@@ -262,7 +275,7 @@ if(TUNERS)
# Visual Studio requires the sources of non-exported objects/libraries
set(TUNERS_COMMON )
if(MSVC)
- set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities.cpp)
+ set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
endif()
# Adds tuning executables
@@ -298,7 +311,7 @@ if(CLIENTS OR TESTS)
find_package(Threads)
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
- if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+ if(MSVC)
add_definitions(" /DCLBLAST_REF_CLBLAS")
else()
add_definitions(" -DCLBLAST_REF_CLBLAS")
@@ -307,7 +320,7 @@ if(CLIENTS OR TESTS)
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
- if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+ if(MSVC)
add_definitions(" /DCLBLAST_REF_CBLAS")
else()
add_definitions(" -DCLBLAST_REF_CBLAS")
@@ -325,7 +338,7 @@ if(CLIENTS)
# Visual Studio requires the sources of non-exported objects/libraries
set(CLIENTS_COMMON )
if(MSVC)
- set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities.cpp test/performance/client.cpp)
+ set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities/utilities.cpp test/performance/client.cpp)
else()
# Creates the common performance-tests objects (requires CMake 2.8.8)
add_library(test_performance_common OBJECT test/performance/client.cpp)
@@ -372,7 +385,7 @@ if(TESTS)
# Visual Studio requires the sources of non-exported objects/libraries
set(TESTS_COMMON )
if(MSVC)
- set(TESTS_COMMON ${TESTS_COMMON} src/utilities.cpp
+ set(TESTS_COMMON ${TESTS_COMMON} src/utilities/utilities.cpp
test/correctness/tester.cpp test/correctness/testblas.cpp)
else()
# Creates the common correctness-tests objects (requires CMake 2.8.8)
@@ -405,14 +418,14 @@ if(TESTS)
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
- add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM})
+ add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM} ${TEST_ARGUMENTS})
endforeach()
# Adds 'alltests' target: runs all tests
set(ALLTESTS )
set(ALLTESTSDEPENDS )
foreach(ROUTINE ${ROUTINES})
- set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM})
+ set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM} ${TEST_ARGUMENTS})
set(ALLTESTSDEPENDS clblast_test_${ROUTINE})
endforeach()
add_custom_target(alltests ${ALLTESTS} DEPENDS ${ALLTESTSDEPENDS})
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000..9f698d32
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,20 @@
+
+CLBlast: Contributing guidelines
+================
+
+For information about the CLBlast library, see the [README](README.md) file instead.
+
+Tuning results
+-------------
+
+A [dedicated GitHub issue](https://github.com/CNugteren/CLBlast/issues/1) is available to post new tuning results. If you compiled with the tuners (see the [README](README.md) for instructions), ran one of the tuners on your device (or perhaps all of them), and feel that these results should be included in the next release of CLBlast, please post them there. You can do this by attaching the JSON files to the issue (archived in a .ZIP file).
+
+
+Code improvements and additions
+-------------
+
+Pull requests are welcome as long as they:
+
+* Contain unit additions or modifications
+* Follow the CLBlast coding style, which is loosely based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers. We use a tab-size of 2 spaces and a max-width of 100 characters.
+* Are made against the `development` branch.
diff --git a/LICENSE b/LICENSE
index 75f63024..0df827ea 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,201 @@
-MIT License
-
-Copyright (c) 2016 Cedric Nugteren
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2015 Cedric Nugteren
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
index a88f5ce1..9b289448 100644
--- a/README.md
+++ b/README.md
@@ -111,8 +111,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- GeForce GTX 750 Ti
- GeForce GTX 980
- GeForce GTX 1070
- - GeForce GTX Titan
- - GeForce GTX Titan X
+ - GeForce GTX TITAN
+ - GeForce GTX TITAN Black
+ - GeForce GTX TITAN X
- Tesla K20m
- Tesla K40m
* AMD GPUs:
@@ -121,6 +122,7 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Oland
- Pitcairn
- Tahiti
+ - Tonga
* Intel GPUs:
- HD Graphics 530
- HD Graphics 5500 BroadWell U-Processor GT2
@@ -175,7 +177,7 @@ To build these tests, another BLAS library is needed to serve as a reference. Th
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.
-All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
+All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables before running CMake. Further options (e.g. `-full_test`) can be supplied through the `CLBLAST_TEST_ARGUMENTS` environment variable.
Compiling the performance tests/clients (optional)
@@ -284,7 +286,7 @@ The `samples/haxpy.c` example shows how to use these convencience functions when
Contributing
-------------
-Contributions are welcome in the form of tuning results for OpenCL devices previously untested. Furthermore, merge requests are welcome as long as they contain unit additions or modifications. Furthermore, they should follow the CLBlast coding style, which is based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers.
+Contributions are welcome in the form of tuning results for OpenCL devices previously untested or pull requests. See [the contributing guidelines](CONTRIBUTING.md) for more details.
The contributing authors (code, pull requests, testing) so far are:
@@ -296,6 +298,7 @@ The contributing authors (code, pull requests, testing) so far are:
* [Gian-Carlo Pascutto](https://github.com/gcp)
* [Ivan Shapovalov](https://github.com/intelfx)
* [Dimitri Van Assche](https://github.com/dvasschemacq)
+* [Shehzan Mohammed](https://shehzan10.github.io)
Tuning and testing on a variety of OpenCL devices was made possible by:
@@ -303,6 +306,7 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
* [dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
+* [ArrayFire](http://arrayfire.org)
Support us
diff --git a/clblast.pc.in b/clblast.pc.in
new file mode 100644
index 00000000..2538add8
--- /dev/null
+++ b/clblast.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+includedir=${prefix}/include
+libdir=${exec_prefix}/lib
+
+Name: CLBlast
+Description: CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11
+Version: @clblast_VERSION_MAJOR@.@clblast_VERSION_MINOR@.@clblast_VERSION_PATCH@
+Libs: -L${libdir} -lclblast
+Cflags: -I${includedir}
diff --git a/doc/clblast.md b/doc/clblast.md
index 5105d023..37b99f3d 100644
--- a/doc/clblast.md
+++ b/doc/clblast.md
@@ -18,26 +18,26 @@ StatusCode Swap(const size_t n,
C API:
```
-StatusCode CLBlastSswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SWAP:
@@ -70,26 +70,26 @@ StatusCode Scal(const size_t n,
C API:
```
-StatusCode CLBlastSscal(const size_t n,
- const float alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDscal(const size_t n,
- const double alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCscal(const size_t n,
- const cl_float2 alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZscal(const size_t n,
- const cl_double2 alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHscal(const size_t n,
- const cl_half alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSscal(const size_t n,
+ const float alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDscal(const size_t n,
+ const double alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCscal(const size_t n,
+ const cl_float2 alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZscal(const size_t n,
+ const cl_double2 alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHscal(const size_t n,
+ const cl_half alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SCAL:
@@ -120,26 +120,26 @@ StatusCode Copy(const size_t n,
C API:
```
-StatusCode CLBlastScopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastScopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to COPY:
@@ -173,31 +173,31 @@ StatusCode Axpy(const size_t n,
C API:
```
-StatusCode CLBlastSaxpy(const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDaxpy(const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCaxpy(const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZaxpy(const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHaxpy(const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSaxpy(const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDaxpy(const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCaxpy(const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZaxpy(const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHaxpy(const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to AXPY:
@@ -232,21 +232,21 @@ StatusCode Dot(const size_t n,
C API:
```
-StatusCode CLBlastSdot(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDdot(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHdot(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSdot(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDdot(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHdot(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to DOT:
@@ -284,16 +284,16 @@ StatusCode Dotu(const size_t n,
C API:
```
-StatusCode CLBlastCdotu(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZdotu(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCdotu(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZdotu(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to DOTU:
@@ -331,16 +331,16 @@ StatusCode Dotc(const size_t n,
C API:
```
-StatusCode CLBlastCdotc(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZdotc(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCdotc(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZdotc(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to DOTC:
@@ -377,26 +377,26 @@ StatusCode Nrm2(const size_t n,
C API:
```
-StatusCode CLBlastSnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastScnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDznrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastScnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDznrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to NRM2:
@@ -430,26 +430,26 @@ StatusCode Asum(const size_t n,
C API:
```
-StatusCode CLBlastSasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastScasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDzasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastScasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDzasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to ASUM:
@@ -483,26 +483,26 @@ StatusCode Sum(const size_t n,
C API:
```
-StatusCode CLBlastSsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastScsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDzsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastScsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDzsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SUM:
@@ -536,26 +536,26 @@ StatusCode Amax(const size_t n,
C API:
```
-StatusCode CLBlastiSamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiDamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiCamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiZamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiHamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiSamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiDamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiCamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiZamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiHamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to AMAX:
@@ -589,26 +589,26 @@ StatusCode Max(const size_t n,
C API:
```
-StatusCode CLBlastiSmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiDmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiCmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiZmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiHmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiSmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiDmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiCmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiZmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiHmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to MAX:
@@ -642,26 +642,26 @@ StatusCode Min(const size_t n,
C API:
```
-StatusCode CLBlastiSmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiDmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiCmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiZmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastiHmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiSmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiDmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiCmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiZmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiHmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to MIN:
@@ -699,46 +699,46 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
C API:
```
-StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to GEMV:
@@ -787,46 +787,46 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
C API:
```
-StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to GBMV:
@@ -877,22 +877,22 @@ StatusCode Hemv(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastChemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to HEMV:
@@ -940,22 +940,22 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to HBMV:
@@ -1004,22 +1004,22 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to HPMV:
@@ -1062,30 +1062,30 @@ StatusCode Symv(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SYMV:
@@ -1133,30 +1133,30 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SBMV:
@@ -1205,30 +1205,30 @@ StatusCode Spmv(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SPMV:
@@ -1268,31 +1268,31 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_
C API:
```
-StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to TRMV:
@@ -1334,31 +1334,31 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_
C API:
```
-StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to TBMV:
@@ -1401,31 +1401,31 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_
C API:
```
-StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to TPMV:
@@ -1464,27 +1464,27 @@ StatusCode Ger(const Layout layout,
C API:
```
-StatusCode CLBlastSger(const Layout layout,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDger(const Layout layout,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHger(const Layout layout,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSger(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDger(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHger(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to GER:
@@ -1530,20 +1530,20 @@ StatusCode Geru(const Layout layout,
C API:
```
-StatusCode CLBlastCgeru(const Layout layout,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZgeru(const Layout layout,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCgeru(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgeru(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to GERU:
@@ -1589,20 +1589,20 @@ StatusCode Gerc(const Layout layout,
C API:
```
-StatusCode CLBlastCgerc(const Layout layout,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZgerc(const Layout layout,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCgerc(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgerc(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to GERC:
@@ -1647,18 +1647,18 @@ StatusCode Her(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastCher(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZher(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to HER:
@@ -1700,18 +1700,18 @@ StatusCode Hpr(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastChpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to HPR:
@@ -1749,20 +1749,20 @@ StatusCode Her2(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastCher2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZher2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to HER2:
@@ -1808,20 +1808,20 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to HPR2:
@@ -1861,24 +1861,24 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SYR:
@@ -1920,24 +1920,24 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastSspr(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDspr(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHspr(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SPR:
@@ -1975,27 +1975,27 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SYR2:
@@ -2041,27 +2041,27 @@ StatusCode Spr2(const Layout layout, const Triangle triangle,
C API:
```
-StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SPR2:
@@ -2103,46 +2103,46 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
C API:
```
-StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to GEMM:
@@ -2195,46 +2195,46 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
C API:
```
-StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SYMM:
@@ -2286,22 +2286,22 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
C API:
```
-StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to HEMM:
@@ -2352,41 +2352,41 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_
C API:
```
-StatusCode CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SYRK:
@@ -2433,20 +2433,20 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
C API:
```
-StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to HERK:
@@ -2494,46 +2494,46 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a
C API:
```
-StatusCode CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SYR2K:
@@ -2585,22 +2585,22 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
C API:
```
-StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to HER2K:
@@ -2650,36 +2650,36 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
C API:
```
-StatusCode CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to TRMM:
@@ -2726,36 +2726,36 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
C API:
```
-StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event)
-StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to OMATCOPY:
diff --git a/include/clblast.h b/include/clblast.h
index 0f52b2f9..7b2021d8 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -46,14 +46,34 @@ enum class StatusCode {
// Status codes in common with the OpenCL standard
kSuccess = 0, // CL_SUCCESS
+ kOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE
kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
- kBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
+ kOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES
+ kOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY
+ kOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
+ kInvalidValue = -30, // CL_INVALID_VALUE
+ kInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE
+ kInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT
kInvalidBinary = -42, // CL_INVALID_BINARY
+ kInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS
+ kInvalidProgram = -44, // CL_INVALID_PROGRAM
+ kInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE
+ kInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME
+ kInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION
kInvalidKernel = -48, // CL_INVALID_KERNEL
+ kInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX
+ kInvalidArgValue = -50, // CL_INVALID_ARG_VALUE
+ kInvalidArgSize = -51, // CL_INVALID_ARG_SIZE
+ kInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS
kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
- kInvalidTempBufferSize = -61, // CL_INVALID_BUFFER_SIZE
+ kInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET
+ kInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST
+ kInvalidEvent = -58, // CL_INVALID_EVENT
+ kInvalidOperation = -59, // CL_INVALID_OPERATION
+ kInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE
+ kInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE
// Status codes in common with the clBLAS library
kNotImplemented = -1024, // Routine or functionality not implemented yet
@@ -75,13 +95,14 @@ enum class StatusCode {
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
- kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel
- kKernelRunError = -2047, // Problem occurred while running the kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
+ kDatabaseError = -2041, // Entry for the device was not found in the database
+ kUnknownError = -2040, // A catch-all error code representing an unspecified error
+ kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
};
// Matrix layout and transpose types
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 33fb4acf..81f093cd 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -43,1277 +43,1303 @@ extern "C" {
// Status codes. These codes can be returned by functions declared in this header file. The error
// codes match either the standard OpenCL error codes or the clBLAS error codes.
-typedef enum StatusCode_ {
+typedef enum CLBlastStatusCode_ {
// Status codes in common with the OpenCL standard
- kSuccess = 0, // CL_SUCCESS
- kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
- kBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
- kInvalidBinary = -42, // CL_INVALID_BINARY
- kInvalidKernel = -48, // CL_INVALID_KERNEL
- kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
- kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
- kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
- kInvalidTempBufferSize = -61, // CL_INVALID_BUFFER_SIZE
+ CLBlastSuccess = 0, // CL_SUCCESS
+ CLBlastOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE
+ CLBlastTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
+ CLBlastOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES
+ CLBlastOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY
+ CLBlastOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
+ CLBlastInvalidValue = -30, // CL_INVALID_VALUE
+ CLBlastInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE
+ CLBlastInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT
+ CLBlastInvalidBinary = -42, // CL_INVALID_BINARY
+ CLBlastInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS
+ CLBlastInvalidProgram = -44, // CL_INVALID_PROGRAM
+ CLBlastInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE
+ CLBlastInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME
+ CLBlastInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION
+ CLBlastInvalidKernel = -48, // CL_INVALID_KERNEL
+ CLBlastInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX
+ CLBlastInvalidArgValue = -50, // CL_INVALID_ARG_VALUE
+ CLBlastInvalidArgSize = -51, // CL_INVALID_ARG_SIZE
+ CLBlastInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS
+ CLBlastInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
+ CLBlastInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
+ CLBlastInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
+ CLBlastInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET
+ CLBlastInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST
+ CLBlastInvalidEvent = -58, // CL_INVALID_EVENT
+ CLBlastInvalidOperation = -59, // CL_INVALID_OPERATION
+ CLBlastInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE
+ CLBlastInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE
// Status codes in common with the clBLAS library
- kNotImplemented = -1024, // Routine or functionality not implemented yet
- kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer
- kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer
- kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer
- kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer
- kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer
- kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero
- kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension
- kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension
- kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension
- kInvalidIncrementX = -1013, // Increment of vector X cannot be zero
- kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero
- kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small
- kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small
- kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small
- kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small
- kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
+ CLBlastNotImplemented = -1024, // Routine or functionality not implemented yet
+ CLBlastInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer
+ CLBlastInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer
+ CLBlastInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer
+ CLBlastInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer
+ CLBlastInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer
+ CLBlastInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero
+ CLBlastInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension
+ CLBlastInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension
+ CLBlastInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension
+ CLBlastInvalidIncrementX = -1013, // Increment of vector X cannot be zero
+ CLBlastInvalidIncrementY = -1012, // Increment of vector Y cannot be zero
+ CLBlastInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small
+ CLBlastInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small
+ CLBlastInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small
+ CLBlastInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small
+ CLBlastInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
- kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel
- kKernelRunError = -2047, // Problem occurred while running the kernel
- kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
- kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
- kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
- kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
- kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
-} StatusCode;
+ CLBlastInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
+ CLBlastNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
+ CLBlastNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
+ CLBlastInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
+ CLBlastInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
+ CLBlastDatabaseError = -2041, // Entry for the device was not found in the database
+ CLBlastUnknownError = -2040, // A catch-all error code representing an unspecified error
+ CLBlastUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
+} CLBlastStatusCode;
// Matrix layout and transpose types
-typedef enum Layout_ { kRowMajor = 101, kColMajor = 102 } Layout;
-typedef enum Transpose_ { kNo = 111, kYes = 112, kConjugate = 113 } Transpose;
-typedef enum Triangle_ { kUpper = 121, kLower = 122 } Triangle;
-typedef enum Diagonal_ { kNonUnit = 131, kUnit = 132 } Diagonal;
-typedef enum Side_ { kLeft = 141, kRight = 142 } Side;
+typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101,
+ CLBlastLayoutColMajor = 102 } CLBlastLayout;
+typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112,
+ CLBlastTransposeConjugate = 113 } CLBlastTranspose;
+typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121,
+ CLBlastTriangleLower = 122 } CLBlastTriangle;
+typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
+ CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
+typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
// Precision scoped enum (values in bits)
-typedef enum Precision_ { kHalf = 16, kSingle = 32, kDouble = 64,
- kComplexSingle = 3232, kComplexDouble = 6464 } Precision;
+typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32,
+ CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232,
+ CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision;
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================
// Generate givens plane rotation: SROTG/DROTG
-StatusCode PUBLIC_API CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset,
- cl_mem sb_buffer, const size_t sb_offset,
- cl_mem sc_buffer, const size_t sc_offset,
- cl_mem ss_buffer, const size_t ss_offset,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
- cl_mem sb_buffer, const size_t sb_offset,
- cl_mem sc_buffer, const size_t sc_offset,
- cl_mem ss_buffer, const size_t ss_offset,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset,
+ cl_mem sb_buffer, const size_t sb_offset,
+ cl_mem sc_buffer, const size_t sc_offset,
+ cl_mem ss_buffer, const size_t ss_offset,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
+ cl_mem sb_buffer, const size_t sb_offset,
+ cl_mem sc_buffer, const size_t sc_offset,
+ cl_mem ss_buffer, const size_t ss_offset,
+ cl_command_queue* queue, cl_event* event);
// Generate modified givens plane rotation: SROTMG/DROTMG
-StatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
- cl_mem sd2_buffer, const size_t sd2_offset,
- cl_mem sx1_buffer, const size_t sx1_offset,
- const cl_mem sy1_buffer, const size_t sy1_offset,
- cl_mem sparam_buffer, const size_t sparam_offset,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
- cl_mem sd2_buffer, const size_t sd2_offset,
- cl_mem sx1_buffer, const size_t sx1_offset,
- const cl_mem sy1_buffer, const size_t sy1_offset,
- cl_mem sparam_buffer, const size_t sparam_offset,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
+ cl_mem sd2_buffer, const size_t sd2_offset,
+ cl_mem sx1_buffer, const size_t sx1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
+ cl_mem sparam_buffer, const size_t sparam_offset,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
+ cl_mem sd2_buffer, const size_t sd2_offset,
+ cl_mem sx1_buffer, const size_t sx1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
+ cl_mem sparam_buffer, const size_t sparam_offset,
+ cl_command_queue* queue, cl_event* event);
// Apply givens plane rotation: SROT/DROT
-StatusCode PUBLIC_API CLBlastSrot(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- const float cos,
- const float sin,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDrot(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- const double cos,
- const double sin,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSrot(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ const float cos,
+ const float sin,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDrot(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ const double cos,
+ const double sin,
+ cl_command_queue* queue, cl_event* event);
// Apply modified givens plane rotation: SROTM/DROTM
-StatusCode PUBLIC_API CLBlastSrotm(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem sparam_buffer, const size_t sparam_offset,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDrotm(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem sparam_buffer, const size_t sparam_offset,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSrotm(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem sparam_buffer, const size_t sparam_offset,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDrotm(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem sparam_buffer, const size_t sparam_offset,
+ cl_command_queue* queue, cl_event* event);
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
-StatusCode PUBLIC_API CLBlastSswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
-StatusCode PUBLIC_API CLBlastSscal(const size_t n,
- const float alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDscal(const size_t n,
- const double alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCscal(const size_t n,
- const cl_float2 alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZscal(const size_t n,
- const cl_double2 alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHscal(const size_t n,
- const cl_half alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSscal(const size_t n,
+ const float alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDscal(const size_t n,
+ const double alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCscal(const size_t n,
+ const cl_float2 alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZscal(const size_t n,
+ const cl_double2 alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHscal(const size_t n,
+ const cl_half alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
-StatusCode PUBLIC_API CLBlastScopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastScopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
-StatusCode PUBLIC_API CLBlastSaxpy(const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDaxpy(const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCaxpy(const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZaxpy(const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHaxpy(const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSaxpy(const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDaxpy(const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCaxpy(const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZaxpy(const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHaxpy(const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Dot product of two vectors: SDOT/DDOT/HDOT
-StatusCode PUBLIC_API CLBlastSdot(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDdot(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHdot(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSdot(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDdot(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHdot(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Dot product of two complex vectors: CDOTU/ZDOTU
-StatusCode PUBLIC_API CLBlastCdotu(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZdotu(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCdotu(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZdotu(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
-StatusCode PUBLIC_API CLBlastCdotc(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZdotc(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCdotc(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZdotc(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
-StatusCode PUBLIC_API CLBlastSnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastScnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDznrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastScnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDznrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
-StatusCode PUBLIC_API CLBlastSasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastScasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDzasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastScasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDzasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
-StatusCode PUBLIC_API CLBlastSsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastScsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDzsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastScsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDzsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
-StatusCode PUBLIC_API CLBlastiSamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiDamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiCamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiZamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiHamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiSamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiDamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiCamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiZamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiHamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
-StatusCode PUBLIC_API CLBlastiSmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiDmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiCmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiZmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiHmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiSmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiDmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiCmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiZmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiHmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
-StatusCode PUBLIC_API CLBlastiSmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiDmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiCmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiZmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastiHmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiSmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiDmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiCmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiZmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastiHmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
-StatusCode PUBLIC_API CLBlastSgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
-StatusCode PUBLIC_API CLBlastSgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
-StatusCode PUBLIC_API CLBlastChemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZhemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
-StatusCode PUBLIC_API CLBlastChbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZhbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
-StatusCode PUBLIC_API CLBlastChpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZhpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
-StatusCode PUBLIC_API CLBlastSsymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDsymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHsymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
-StatusCode PUBLIC_API CLBlastSsbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDsbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHsbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
-StatusCode PUBLIC_API CLBlastSspmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDspmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHspmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
-StatusCode PUBLIC_API CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
-StatusCode PUBLIC_API CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
-StatusCode PUBLIC_API CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
-StatusCode PUBLIC_API CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
-StatusCode PUBLIC_API CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastStbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
-StatusCode PUBLIC_API CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastStpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
// General rank-1 matrix update: SGER/DGER/HGER
-StatusCode PUBLIC_API CLBlastSger(const Layout layout,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDger(const Layout layout,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHger(const Layout layout,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSger(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDger(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHger(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
// General rank-1 complex matrix update: CGERU/ZGERU
-StatusCode PUBLIC_API CLBlastCgeru(const Layout layout,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZgeru(const Layout layout,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCgeru(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZgeru(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
-StatusCode PUBLIC_API CLBlastCgerc(const Layout layout,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZgerc(const Layout layout,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCgerc(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZgerc(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
// Hermitian rank-1 matrix update: CHER/ZHER
-StatusCode PUBLIC_API CLBlastCher(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZher(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
-StatusCode PUBLIC_API CLBlastChpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZhpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
// Hermitian rank-2 matrix update: CHER2/ZHER2
-StatusCode PUBLIC_API CLBlastCher2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZher2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
-StatusCode PUBLIC_API CLBlastChpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
-StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHsyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
-StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHspr(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
-StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHsyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
-StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDspr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHspr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
-StatusCode PUBLIC_API CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
-StatusCode PUBLIC_API CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
-StatusCode PUBLIC_API CLBlastChemm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
-StatusCode PUBLIC_API CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
// Rank-K update of a Hermitian matrix: CHERK/ZHERK
-StatusCode PUBLIC_API CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
-StatusCode PUBLIC_API CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
-StatusCode PUBLIC_API CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
-StatusCode PUBLIC_API CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
-StatusCode PUBLIC_API CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
-StatusCode PUBLIC_API CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
-StatusCode PUBLIC_API CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
-StatusCode PUBLIC_API CLBlastClearCache();
+CLBlastStatusCode PUBLIC_API CLBlastClearCache();
// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
// Further CLBlast routine calls will then run at maximum speed.
-StatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device);
+CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device);
// =================================================================================================
diff --git a/samples/cache.c b/samples/cache.c
index abc8ad4b..40f2163f 100644
--- a/samples/cache.c
+++ b/samples/cache.c
@@ -106,13 +106,13 @@ void run_example_routine(const cl_device_id device) {
clock_t start = clock();
// Calls an example routine
- StatusCode status = CLBlastSasum(n,
- device_output, 0,
- device_input, 0, 1,
- &queue, &event);
+ CLBlastStatusCode status = CLBlastSasum(n,
+ device_output, 0,
+ device_input, 0, 1,
+ &queue, &event);
// Wait for completion
- if (status == kSuccess) {
+ if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
diff --git a/samples/dgemv.c b/samples/dgemv.c
index a15d649a..dc2fe7db 100644
--- a/samples/dgemv.c
+++ b/samples/dgemv.c
@@ -74,17 +74,17 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL);
// Call the DGEMV routine.
- StatusCode status = CLBlastDgemv(kRowMajor, kNo,
- m, n,
- alpha,
- device_a, 0, a_ld,
- device_x, 0, 1,
- beta,
- device_y, 0, 1,
- &queue, &event);
+ CLBlastStatusCode status = CLBlastDgemv(CLBlastLayoutRowMajor, CLBlastTransposeNo,
+ m, n,
+ alpha,
+ device_a, 0, a_ld,
+ device_x, 0, 1,
+ beta,
+ device_y, 0, 1,
+ &queue, &event);
// Wait for completion
- if (status == kSuccess) {
+ if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
diff --git a/samples/haxpy.c b/samples/haxpy.c
index 5bab3d42..8e0833f8 100644
--- a/samples/haxpy.c
+++ b/samples/haxpy.c
@@ -71,13 +71,13 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
// Call the HAXPY routine.
- StatusCode status = CLBlastHaxpy(n, alpha,
- device_a, 0, 1,
- device_b, 0, 1,
- &queue, &event);
+ CLBlastStatusCode status = CLBlastHaxpy(n, alpha,
+ device_a, 0, 1,
+ device_b, 0, 1,
+ &queue, &event);
// Wait for completion
- if (status == kSuccess) {
+ if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
diff --git a/samples/sasum.c b/samples/sasum.c
index 02f924b0..c285dd14 100644
--- a/samples/sasum.c
+++ b/samples/sasum.c
@@ -67,13 +67,13 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
// Call the SASUM routine.
- StatusCode status = CLBlastSasum(n,
- device_output, 0,
- device_input, 0, 1,
- &queue, &event);
+ CLBlastStatusCode status = CLBlastSasum(n,
+ device_output, 0,
+ device_input, 0, 1,
+ &queue, &event);
// Wait for completion
- if (status == kSuccess) {
+ if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
diff --git a/samples/sgemm.c b/samples/sgemm.c
index 583fc261..132dad81 100644
--- a/samples/sgemm.c
+++ b/samples/sgemm.c
@@ -77,17 +77,18 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL);
// Call the SGEMM routine.
- StatusCode status = CLBlastSgemm(kRowMajor, kNo, kNo,
- m, n, k,
- alpha,
- device_a, 0, a_ld,
- device_b, 0, b_ld,
- beta,
- device_c, 0, c_ld,
- &queue, &event);
+ CLBlastStatusCode status = CLBlastSgemm(CLBlastLayoutRowMajor,
+ CLBlastTransposeNo, CLBlastTransposeNo,
+ m, n, k,
+ alpha,
+ device_a, 0, a_ld,
+ device_b, 0, b_ld,
+ beta,
+ device_c, 0, c_ld,
+ &queue, &event);
// Wait for completion
- if (status == kSuccess) {
+ if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 68ae9cbe..220b314d 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -31,9 +31,18 @@ import generator.doc as doc
from generator.routine import Routine
from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU
-
-HEADER_LINES = [96, 73, 97, 22, 29, 41, 43, 1]
-FOOTER_LINES = [17, 75, 19, 14, 6, 6, 10, 1]
+FILES = [
+ "/include/clblast.h",
+ "/src/clblast.cpp",
+ "/include/clblast_c.h",
+ "/src/clblast_c.cpp",
+ "/test/wrapper_clblas.hpp",
+ "/test/wrapper_cblas.hpp",
+ "/include/clblast_blas.h",
+ "/src/clblast_blas.cpp",
+]
+HEADER_LINES = [117, 73, 118, 22, 29, 41, 43, 1]
+FOOTER_LINES = [17, 80, 19, 18, 6, 6, 10, 1]
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
@@ -126,35 +135,23 @@ def main(argv):
cl_args = parser.parse_args(argv)
library_root = cl_args.clblast_root
- # Sets all the files the output
- files = [
- library_root + "/include/clblast.h",
- library_root + "/src/clblast.cpp",
- library_root + "/include/clblast_c.h",
- library_root + "/src/clblast_c.cpp",
- library_root + "/test/wrapper_clblas.hpp",
- library_root + "/test/wrapper_cblas.hpp",
- library_root + "/include/clblast_blas.h",
- library_root + "/src/clblast_blas.cpp",
- ]
-
# Checks whether the command-line arguments are valid; exits otherwise
- for f in files:
- if not os.path.isfile(f):
+ for f in FILES:
+ if not os.path.isfile(library_root + f):
print("[ERROR] The path '" + library_root + "' does not point to the root of the CLBlast library")
sys.exit()
# Iterates over all regular files to output
- for i in range(0, len(files)):
+ for i in range(0, len(FILES)):
# Stores the header and the footer of the original file
- with open(files[i]) as f:
+ with open(library_root + FILES[i]) as f:
original = f.readlines()
file_header = original[:HEADER_LINES[i]]
file_footer = original[-FOOTER_LINES[i]:]
# Re-writes the body of the file
- with open(files[i], "w") as f:
+ with open(library_root + FILES[i], "w") as f:
body = ""
levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4]
for level in levels:
diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py
index 83ddbcb2..61730fdb 100644
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@@ -45,17 +45,18 @@ def clblast_h(routine):
def clblast_cc(routine):
"""The C++ API implementation (.cpp)"""
- indent1 = " " * (20 + routine.length())
+ indent1 = " " * (15 + routine.length())
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
if routine.implemented:
result += routine.routine_header_cpp(12, "") + " {" + NL
- result += " auto queue_cpp = Queue(*queue);" + NL
- result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
- result += " auto status = routine.SetUp();" + NL
- result += " if (status != StatusCode::kSuccess) { return status; }" + NL
- result += " return routine.Do" + routine.name.capitalize() + "("
+ result += " try {" + NL
+ result += " auto queue_cpp = Queue(*queue);" + NL
+ result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
+ result += " routine.Do" + routine.name.capitalize() + "("
result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
result += ");" + NL
+ result += " return StatusCode::kSuccess;" + NL
+ result += " } catch (...) { return DispatchException(); }" + NL
else:
result += routine.routine_header_type_cpp(12) + " {" + NL
result += " return StatusCode::kNotImplemented;" + NL
@@ -72,7 +73,7 @@ def clblast_c_h(routine):
"""The C API header (.h)"""
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
for flavour in routine.flavours:
- result += routine.routine_header_c(flavour, 31, " PUBLIC_API") + ";" + NL
+ result += routine.routine_header_c(flavour, 38, " PUBLIC_API") + ";" + NL
return result
@@ -81,12 +82,16 @@ def clblast_c_cc(routine):
result = NL + "// " + routine.name.upper() + NL
for flavour in routine.flavours:
template = "<" + flavour.template + ">" if routine.no_scalars() else ""
- indent = " " * (26 + routine.length() + len(template))
- result += routine.routine_header_c(flavour, 20, "") + " {" + NL
- result += " auto status = clblast::" + routine.name.capitalize() + template + "("
+ indent = " " * (16 + routine.length() + len(template))
+ result += routine.routine_header_c(flavour, 27, "") + " {" + NL
+ result += " try {" + NL
+ result += " return static_cast<CLBlastStatusCode>(" + NL
+ result += " clblast::" + routine.name.capitalize() + template + "("
result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
- result += "," + NL + indent + "queue, event);"
- result += NL + " return static_cast<StatusCode>(status);" + NL + "}" + NL
+ result += "," + NL + indent + "queue, event)" + NL
+ result += " );" + NL
+ result += " } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }" + NL
+ result += "}" + NL
return result
diff --git a/scripts/generator/generator/doc.py b/scripts/generator/generator/doc.py
index 8657ed0d..c77ec1a0 100644
--- a/scripts/generator/generator/doc.py
+++ b/scripts/generator/generator/doc.py
@@ -32,7 +32,7 @@ def generate(routine):
result += "C API:" + NL
result += "```" + NL
for flavour in routine.flavours:
- result += routine.routine_header_c(flavour, 20, "") + NL
+ result += routine.routine_header_c(flavour, 27, "") + NL
result += "```" + NL + NL
# Routine arguments
diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py
index 126d64ce..795fc532 100644
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@@ -390,6 +390,13 @@ class Routine:
return [", ".join(definitions)]
return []
+ def options_def_c(self):
+ """As above, but now for the C API"""
+ if self.options:
+ definitions = ["const CLBlast" + convert.option_to_clblast(o) + " " + o for o in self.options]
+ return [", ".join(definitions)]
+ return []
+
def options_def_wrapper_clblas(self):
"""As above, but now using clBLAS data-types"""
if self.options:
@@ -505,6 +512,17 @@ class Routine:
list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
+ def arguments_def_c(self, flavour):
+ """As above, but for the C API"""
+ return (self.options_def_c() + self.sizes_def() +
+ list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_first()])) +
+ self.scalar_def("alpha", flavour) +
+ list(chain(*[self.buffer_def(b) for b in self.buffers_first()])) +
+ self.scalar_def("beta", flavour) +
+ list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) +
+ list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
+ list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
+
def arguments_def_wrapper_clblas(self, flavour):
"""As above, but clBLAS wrapper plain data-types"""
return (self.options_def_wrapper_clblas() + self.sizes_def() +
@@ -575,8 +593,8 @@ class Routine:
def routine_header_c(self, flavour, spaces, extra_qualifier):
"""As above, but now for C"""
indent = " " * (spaces + self.length())
- result = "StatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
- result += (",\n" + indent).join([a for a in self.arguments_def(flavour)])
+ result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
+ result += (",\n" + indent).join([a for a in self.arguments_def_c(flavour)])
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)"
return result
diff --git a/src/buffer_test.hpp b/src/buffer_test.hpp
deleted file mode 100644
index 80f5243f..00000000
--- a/src/buffer_test.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
-// templated and thus header-only.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_BUFFER_TEST_H_
-#define CLBLAST_BUFFER_TEST_H_
-
-#include "clblast.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Tests matrix 'A' for validity
-template <typename T>
-StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
- const size_t offset, const size_t ld) {
- if (ld < one) { return StatusCode::kInvalidLeadDimA; }
- try {
- const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
- if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
- } catch (...) { return StatusCode::kInvalidMatrixA; }
- return StatusCode::kSuccess;
-}
-
-// Tests matrix 'B' for validity
-template <typename T>
-StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
- const size_t offset, const size_t ld) {
- if (ld < one) { return StatusCode::kInvalidLeadDimB; }
- try {
- const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
- if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; }
- } catch (...) { return StatusCode::kInvalidMatrixB; }
- return StatusCode::kSuccess;
-}
-
-// Tests matrix 'C' for validity
-template <typename T>
-StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
- const size_t offset, const size_t ld) {
- if (ld < one) { return StatusCode::kInvalidLeadDimC; }
- try {
- const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
- if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; }
- } catch (...) { return StatusCode::kInvalidMatrixC; }
- return StatusCode::kSuccess;
-}
-
-// Tests matrix 'AP' for validity
-template <typename T>
-StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
- try {
- const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T);
- if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
- } catch (...) { return StatusCode::kInvalidMatrixA; }
- return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Tests vector 'X' for validity
-template <typename T>
-StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
- const size_t inc) {
- if (inc == 0) { return StatusCode::kInvalidIncrementX; }
- try {
- const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
- if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; }
- } catch (...) { return StatusCode::kInvalidVectorX; }
- return StatusCode::kSuccess;
-}
-
-// Tests vector 'Y' for validity
-template <typename T>
-StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
- const size_t inc) {
- if (inc == 0) { return StatusCode::kInvalidIncrementY; }
- try {
- const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
- if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; }
- } catch (...) { return StatusCode::kInvalidVectorY; }
- return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Tests vector 'scalar' for validity
-template <typename T>
-StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
- try {
- const auto required_size = (n + offset) * sizeof(T);
- if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
- } catch (...) { return StatusCode::kInvalidVectorScalar; }
- return StatusCode::kSuccess;
-}
-
-// Tests vector 'index' for validity
-template <typename T>
-StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
- try {
- const auto required_size = (n + offset) * sizeof(T);
- if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
- } catch (...) { return StatusCode::kInvalidVectorScalar; }
- return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_BUFFER_TEST_H_
-#endif
diff --git a/src/cache.cpp b/src/cache.cpp
index 6080f082..6786eaa2 100644
--- a/src/cache.cpp
+++ b/src/cache.cpp
@@ -57,7 +57,7 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec
}
}
binary_cache_mutex_.unlock();
- throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none.");
+ throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none");
}
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
@@ -75,7 +75,7 @@ const Program& GetProgramFromCache(const Context &context, const Precision &prec
}
}
program_cache_mutex_.unlock();
- throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
+ throw LogicError("GetProgramFromCache: Expected program in cache, but found none");
}
// Queries the cache to see whether or not the compiled kernel is already there
@@ -109,14 +109,13 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries and programs
-StatusCode CacheClearAll() {
+void CacheClearAll() {
binary_cache_mutex_.lock();
binary_cache_.clear();
binary_cache_mutex_.unlock();
program_cache_mutex_.lock();
program_cache_.clear();
program_cache_mutex_.unlock();
- return StatusCode::kSuccess;
}
// =================================================================================================
diff --git a/src/cache.hpp b/src/cache.hpp
index 9075da0d..9ecb0f1e 100644
--- a/src/cache.hpp
+++ b/src/cache.hpp
@@ -18,7 +18,7 @@
#include <vector>
#include <mutex>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
@@ -89,7 +89,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries
-StatusCode CacheClearAll();
+void CacheClearAll();
// =================================================================================================
} // namespace clblast
diff --git a/src/clblast.cpp b/src/clblast.cpp
index 79c30ca4..4bb4e0b3 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -168,13 +168,14 @@ StatusCode Swap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xswap<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSwap(n,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xswap<T>(queue_cpp, event);
+ routine.DoSwap(n,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Swap<float>(const size_t,
cl_mem, const size_t, const size_t,
@@ -203,13 +204,14 @@ StatusCode Scal(const size_t n,
const T alpha,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xscal<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoScal(n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xscal<T>(queue_cpp, event);
+ routine.DoScal(n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Scal<float>(const size_t,
const float,
@@ -238,13 +240,14 @@ StatusCode Copy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xcopy<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoCopy(n,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xcopy<T>(queue_cpp, event);
+ routine.DoCopy(n,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Copy<float>(const size_t,
const cl_mem, const size_t, const size_t,
@@ -274,14 +277,15 @@ StatusCode Axpy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xaxpy<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoAxpy(n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xaxpy<T>(queue_cpp, event);
+ routine.DoAxpy(n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Axpy<float>(const size_t,
const float,
@@ -316,14 +320,15 @@ StatusCode Dot(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xdot<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoDot(n,
- Buffer<T>(dot_buffer), dot_offset,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xdot<T>(queue_cpp, event);
+ routine.DoDot(n,
+ Buffer<T>(dot_buffer), dot_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Dot<float>(const size_t,
cl_mem, const size_t,
@@ -348,14 +353,15 @@ StatusCode Dotu(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xdotu<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoDotu(n,
- Buffer<T>(dot_buffer), dot_offset,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xdotu<T>(queue_cpp, event);
+ routine.DoDotu(n,
+ Buffer<T>(dot_buffer), dot_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Dotu<float2>(const size_t,
cl_mem, const size_t,
@@ -375,14 +381,15 @@ StatusCode Dotc(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xdotc<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoDotc(n,
- Buffer<T>(dot_buffer), dot_offset,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xdotc<T>(queue_cpp, event);
+ routine.DoDotc(n,
+ Buffer<T>(dot_buffer), dot_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Dotc<float2>(const size_t,
cl_mem, const size_t,
@@ -401,13 +408,14 @@ StatusCode Nrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xnrm2<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoNrm2(n,
- Buffer<T>(nrm2_buffer), nrm2_offset,
- Buffer<T>(x_buffer), x_offset, x_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xnrm2<T>(queue_cpp, event);
+ routine.DoNrm2(n,
+ Buffer<T>(nrm2_buffer), nrm2_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Nrm2<float>(const size_t,
cl_mem, const size_t,
@@ -436,13 +444,14 @@ StatusCode Asum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xasum<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoAsum(n,
- Buffer<T>(asum_buffer), asum_offset,
- Buffer<T>(x_buffer), x_offset, x_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xasum<T>(queue_cpp, event);
+ routine.DoAsum(n,
+ Buffer<T>(asum_buffer), asum_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Asum<float>(const size_t,
cl_mem, const size_t,
@@ -471,13 +480,14 @@ StatusCode Sum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xsum<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSum(n,
- Buffer<T>(sum_buffer), sum_offset,
- Buffer<T>(x_buffer), x_offset, x_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xsum<T>(queue_cpp, event);
+ routine.DoSum(n,
+ Buffer<T>(sum_buffer), sum_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Sum<float>(const size_t,
cl_mem, const size_t,
@@ -506,13 +516,14 @@ StatusCode Amax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xamax<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoAmax(n,
- Buffer<unsigned int>(imax_buffer), imax_offset,
- Buffer<T>(x_buffer), x_offset, x_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xamax<T>(queue_cpp, event);
+ routine.DoAmax(n,
+ Buffer<unsigned int>(imax_buffer), imax_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Amax<float>(const size_t,
cl_mem, const size_t,
@@ -541,13 +552,14 @@ StatusCode Max(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xmax<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoMax(n,
- Buffer<unsigned int>(imax_buffer), imax_offset,
- Buffer<T>(x_buffer), x_offset, x_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xmax<T>(queue_cpp, event);
+ routine.DoMax(n,
+ Buffer<unsigned int>(imax_buffer), imax_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Max<float>(const size_t,
cl_mem, const size_t,
@@ -576,13 +588,14 @@ StatusCode Min(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xmin<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoMin(n,
- Buffer<unsigned int>(imin_buffer), imin_offset,
- Buffer<T>(x_buffer), x_offset, x_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xmin<T>(queue_cpp, event);
+ routine.DoMin(n,
+ Buffer<unsigned int>(imin_buffer), imin_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Min<float>(const size_t,
cl_mem, const size_t,
@@ -619,17 +632,18 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xgemv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoGemv(layout, a_transpose,
- m, n,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(x_buffer), x_offset, x_inc,
- beta,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xgemv<T>(queue_cpp, event);
+ routine.DoGemv(layout, a_transpose,
+ m, n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gemv<float>(const Layout, const Transpose,
const size_t, const size_t,
@@ -682,17 +696,18 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xgbmv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoGbmv(layout, a_transpose,
- m, n, kl, ku,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(x_buffer), x_offset, x_inc,
- beta,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xgbmv<T>(queue_cpp, event);
+ routine.DoGbmv(layout, a_transpose,
+ m, n, kl, ku,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gbmv<float>(const Layout, const Transpose,
const size_t, const size_t, const size_t, const size_t,
@@ -745,17 +760,18 @@ StatusCode Hemv(const Layout layout, const Triangle triangle,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xhemv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoHemv(layout, triangle,
- n,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(x_buffer), x_offset, x_inc,
- beta,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xhemv<T>(queue_cpp, event);
+ routine.DoHemv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hemv<float2>(const Layout, const Triangle,
const size_t,
@@ -784,17 +800,18 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xhbmv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoHbmv(layout, triangle,
- n, k,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(x_buffer), x_offset, x_inc,
- beta,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xhbmv<T>(queue_cpp, event);
+ routine.DoHbmv(layout, triangle,
+ n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hbmv<float2>(const Layout, const Triangle,
const size_t, const size_t,
@@ -823,17 +840,18 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xhpmv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoHpmv(layout, triangle,
- n,
- alpha,
- Buffer<T>(ap_buffer), ap_offset,
- Buffer<T>(x_buffer), x_offset, x_inc,
- beta,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xhpmv<T>(queue_cpp, event);
+ routine.DoHpmv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(ap_buffer), ap_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hpmv<float2>(const Layout, const Triangle,
const size_t,
@@ -862,17 +880,18 @@ StatusCode Symv(const Layout layout, const Triangle triangle,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xsymv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSymv(layout, triangle,
- n,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(x_buffer), x_offset, x_inc,
- beta,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xsymv<T>(queue_cpp, event);
+ routine.DoSymv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Symv<float>(const Layout, const Triangle,
const size_t,
@@ -909,17 +928,18 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xsbmv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSbmv(layout, triangle,
- n, k,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(x_buffer), x_offset, x_inc,
- beta,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xsbmv<T>(queue_cpp, event);
+ routine.DoSbmv(layout, triangle,
+ n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Sbmv<float>(const Layout, const Triangle,
const size_t, const size_t,
@@ -956,17 +976,18 @@ StatusCode Spmv(const Layout layout, const Triangle triangle,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xspmv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSpmv(layout, triangle,
- n,
- alpha,
- Buffer<T>(ap_buffer), ap_offset,
- Buffer<T>(x_buffer), x_offset, x_inc,
- beta,
- Buffer<T>(y_buffer), y_offset, y_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xspmv<T>(queue_cpp, event);
+ routine.DoSpmv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(ap_buffer), ap_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Spmv<float>(const Layout, const Triangle,
const size_t,
@@ -1000,14 +1021,15 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xtrmv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoTrmv(layout, triangle, a_transpose, diagonal,
- n,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(x_buffer), x_offset, x_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xtrmv<T>(queue_cpp, event);
+ routine.DoTrmv(layout, triangle, a_transpose, diagonal,
+ n,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
@@ -1042,14 +1064,15 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xtbmv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoTbmv(layout, triangle, a_transpose, diagonal,
- n, k,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(x_buffer), x_offset, x_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xtbmv<T>(queue_cpp, event);
+ routine.DoTbmv(layout, triangle, a_transpose, diagonal,
+ n, k,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
@@ -1084,14 +1107,15 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_
const cl_mem ap_buffer, const size_t ap_offset,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xtpmv<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoTpmv(layout, triangle, a_transpose, diagonal,
- n,
- Buffer<T>(ap_buffer), ap_offset,
- Buffer<T>(x_buffer), x_offset, x_inc);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xtpmv<T>(queue_cpp, event);
+ routine.DoTpmv(layout, triangle, a_transpose, diagonal,
+ n,
+ Buffer<T>(ap_buffer), ap_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
@@ -1218,16 +1242,17 @@ StatusCode Ger(const Layout layout,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xger<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoGer(layout,
- m, n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc,
- Buffer<T>(a_buffer), a_offset, a_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xger<T>(queue_cpp, event);
+ routine.DoGer(layout,
+ m, n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc,
+ Buffer<T>(a_buffer), a_offset, a_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Ger<float>(const Layout,
const size_t, const size_t,
@@ -1260,16 +1285,17 @@ StatusCode Geru(const Layout layout,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xgeru<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoGeru(layout,
- m, n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc,
- Buffer<T>(a_buffer), a_offset, a_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xgeru<T>(queue_cpp, event);
+ routine.DoGeru(layout,
+ m, n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc,
+ Buffer<T>(a_buffer), a_offset, a_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Geru<float2>(const Layout,
const size_t, const size_t,
@@ -1295,16 +1321,17 @@ StatusCode Gerc(const Layout layout,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xgerc<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoGerc(layout,
- m, n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc,
- Buffer<T>(a_buffer), a_offset, a_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xgerc<T>(queue_cpp, event);
+ routine.DoGerc(layout,
+ m, n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc,
+ Buffer<T>(a_buffer), a_offset, a_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gerc<float2>(const Layout,
const size_t, const size_t,
@@ -1329,15 +1356,16 @@ StatusCode Her(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xher<std::complex<T>,T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoHer(layout, triangle,
- n,
- alpha,
- Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
- Buffer<std::complex<T>>(a_buffer), a_offset, a_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xher<std::complex<T>,T>(queue_cpp, event);
+ routine.DoHer(layout, triangle,
+ n,
+ alpha,
+ Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
+ Buffer<std::complex<T>>(a_buffer), a_offset, a_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Her<float>(const Layout, const Triangle,
const size_t,
@@ -1360,15 +1388,16 @@ StatusCode Hpr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoHpr(layout, triangle,
- n,
- alpha,
- Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
- Buffer<std::complex<T>>(ap_buffer), ap_offset);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event);
+ routine.DoHpr(layout, triangle,
+ n,
+ alpha,
+ Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
+ Buffer<std::complex<T>>(ap_buffer), ap_offset);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hpr<float>(const Layout, const Triangle,
const size_t,
@@ -1392,16 +1421,17 @@ StatusCode Her2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xher2<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoHer2(layout, triangle,
- n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc,
- Buffer<T>(a_buffer), a_offset, a_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xher2<T>(queue_cpp, event);
+ routine.DoHer2(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc,
+ Buffer<T>(a_buffer), a_offset, a_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Her2<float2>(const Layout, const Triangle,
const size_t,
@@ -1427,16 +1457,17 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xhpr2<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoHpr2(layout, triangle,
- n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc,
- Buffer<T>(ap_buffer), ap_offset);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xhpr2<T>(queue_cpp, event);
+ routine.DoHpr2(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc,
+ Buffer<T>(ap_buffer), ap_offset);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hpr2<float2>(const Layout, const Triangle,
const size_t,
@@ -1461,15 +1492,16 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xsyr<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSyr(layout, triangle,
- n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(a_buffer), a_offset, a_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xsyr<T>(queue_cpp, event);
+ routine.DoSyr(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(a_buffer), a_offset, a_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syr<float>(const Layout, const Triangle,
const size_t,
@@ -1498,15 +1530,16 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xspr<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSpr(layout, triangle,
- n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(ap_buffer), ap_offset);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xspr<T>(queue_cpp, event);
+ routine.DoSpr(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(ap_buffer), ap_offset);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Spr<float>(const Layout, const Triangle,
const size_t,
@@ -1536,16 +1569,17 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xsyr2<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSyr2(layout, triangle,
- n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc,
- Buffer<T>(a_buffer), a_offset, a_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xsyr2<T>(queue_cpp, event);
+ routine.DoSyr2(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc,
+ Buffer<T>(a_buffer), a_offset, a_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syr2<float>(const Layout, const Triangle,
const size_t,
@@ -1578,16 +1612,17 @@ StatusCode Spr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xspr2<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSpr2(layout, triangle,
- n,
- alpha,
- Buffer<T>(x_buffer), x_offset, x_inc,
- Buffer<T>(y_buffer), y_offset, y_inc,
- Buffer<T>(ap_buffer), ap_offset);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xspr2<T>(queue_cpp, event);
+ routine.DoSpr2(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc,
+ Buffer<T>(ap_buffer), ap_offset);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Spr2<float>(const Layout, const Triangle,
const size_t,
@@ -1625,17 +1660,18 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xgemm<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoGemm(layout, a_transpose, b_transpose,
- m, n, k,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(b_buffer), b_offset, b_ld,
- beta,
- Buffer<T>(c_buffer), c_offset, c_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xgemm<T>(queue_cpp, event);
+ routine.DoGemm(layout, a_transpose, b_transpose,
+ m, n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld,
+ beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gemm<float>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
@@ -1688,17 +1724,18 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xsymm<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSymm(layout, side, triangle,
- m, n,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(b_buffer), b_offset, b_ld,
- beta,
- Buffer<T>(c_buffer), c_offset, c_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xsymm<T>(queue_cpp, event);
+ routine.DoSymm(layout, side, triangle,
+ m, n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld,
+ beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Symm<float>(const Layout, const Side, const Triangle,
const size_t, const size_t,
@@ -1751,17 +1788,18 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xhemm<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoHemm(layout, side, triangle,
- m, n,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(b_buffer), b_offset, b_ld,
- beta,
- Buffer<T>(c_buffer), c_offset, c_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xhemm<T>(queue_cpp, event);
+ routine.DoHemm(layout, side, triangle,
+ m, n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld,
+ beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hemm<float2>(const Layout, const Side, const Triangle,
const size_t, const size_t,
@@ -1789,16 +1827,17 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xsyrk<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSyrk(layout, triangle, a_transpose,
- n, k,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- beta,
- Buffer<T>(c_buffer), c_offset, c_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xsyrk<T>(queue_cpp, event);
+ routine.DoSyrk(layout, triangle, a_transpose,
+ n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syrk<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t,
@@ -1845,16 +1884,17 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xherk<std::complex<T>,T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoHerk(layout, triangle, a_transpose,
- n, k,
- alpha,
- Buffer<std::complex<T>>(a_buffer), a_offset, a_ld,
- beta,
- Buffer<std::complex<T>>(c_buffer), c_offset, c_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xherk<std::complex<T>,T>(queue_cpp, event);
+ routine.DoHerk(layout, triangle, a_transpose,
+ n, k,
+ alpha,
+ Buffer<std::complex<T>>(a_buffer), a_offset, a_ld,
+ beta,
+ Buffer<std::complex<T>>(c_buffer), c_offset, c_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Herk<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t,
@@ -1881,17 +1921,18 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xsyr2k<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoSyr2k(layout, triangle, ab_transpose,
- n, k,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(b_buffer), b_offset, b_ld,
- beta,
- Buffer<T>(c_buffer), c_offset, c_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xsyr2k<T>(queue_cpp, event);
+ routine.DoSyr2k(layout, triangle, ab_transpose,
+ n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld,
+ beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syr2k<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t,
@@ -1944,17 +1985,18 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
const U beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xher2k<T,U>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoHer2k(layout, triangle, ab_transpose,
- n, k,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(b_buffer), b_offset, b_ld,
- beta,
- Buffer<T>(c_buffer), c_offset, c_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xher2k<T,U>(queue_cpp, event);
+ routine.DoHer2k(layout, triangle, ab_transpose,
+ n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld,
+ beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Her2k<float2,float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t,
@@ -1981,15 +2023,16 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xtrmm<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal,
- m, n,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(b_buffer), b_offset, b_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xtrmm<T>(queue_cpp, event);
+ routine.DoTrmm(layout, side, triangle, a_transpose, diagonal,
+ m, n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trmm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
@@ -2075,15 +2118,16 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = Queue(*queue);
- auto routine = Xomatcopy<T>(queue_cpp, event);
- auto status = routine.SetUp();
- if (status != StatusCode::kSuccess) { return status; }
- return routine.DoOmatcopy(layout, a_transpose,
- m, n,
- alpha,
- Buffer<T>(a_buffer), a_offset, a_ld,
- Buffer<T>(b_buffer), b_offset, b_ld);
+ try {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xomatcopy<T>(queue_cpp, event);
+ routine.DoOmatcopy(layout, a_transpose,
+ m, n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose,
const size_t, const size_t,
@@ -2119,7 +2163,12 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
// =================================================================================================
// Clears the cache of stored binaries
-StatusCode ClearCache() { return CacheClearAll(); }
+StatusCode ClearCache() {
+ try {
+ CacheClearAll();
+ } catch (...) { return DispatchException(); }
+ return StatusCode::kSuccess;
+}
// Fills the cache with all binaries for a specific device
// TODO: Add half-precision FP16 set-up calls
@@ -2132,59 +2181,59 @@ StatusCode FillCache(const cl_device_id device) {
auto queue = Queue(context, device_cpp);
// Runs all the level 1 set-up functions
- Xswap<float>(queue, nullptr).SetUp(); Xswap<double>(queue, nullptr).SetUp(); Xswap<float2>(queue, nullptr).SetUp(); Xswap<double2>(queue, nullptr).SetUp();
- Xswap<float>(queue, nullptr).SetUp(); Xswap<double>(queue, nullptr).SetUp(); Xswap<float2>(queue, nullptr).SetUp(); Xswap<double2>(queue, nullptr).SetUp();
- Xscal<float>(queue, nullptr).SetUp(); Xscal<double>(queue, nullptr).SetUp(); Xscal<float2>(queue, nullptr).SetUp(); Xscal<double2>(queue, nullptr).SetUp();
- Xcopy<float>(queue, nullptr).SetUp(); Xcopy<double>(queue, nullptr).SetUp(); Xcopy<float2>(queue, nullptr).SetUp(); Xcopy<double2>(queue, nullptr).SetUp();
- Xaxpy<float>(queue, nullptr).SetUp(); Xaxpy<double>(queue, nullptr).SetUp(); Xaxpy<float2>(queue, nullptr).SetUp(); Xaxpy<double2>(queue, nullptr).SetUp();
- Xdot<float>(queue, nullptr).SetUp(); Xdot<double>(queue, nullptr).SetUp();
- Xdotu<float2>(queue, nullptr).SetUp(); Xdotu<double2>(queue, nullptr).SetUp();
- Xdotc<float2>(queue, nullptr).SetUp(); Xdotc<double2>(queue, nullptr).SetUp();
- Xnrm2<float>(queue, nullptr).SetUp(); Xnrm2<double>(queue, nullptr).SetUp(); Xnrm2<float2>(queue, nullptr).SetUp(); Xnrm2<double2>(queue, nullptr).SetUp();
- Xasum<float>(queue, nullptr).SetUp(); Xasum<double>(queue, nullptr).SetUp(); Xasum<float2>(queue, nullptr).SetUp(); Xasum<double2>(queue, nullptr).SetUp();
- Xsum<float>(queue, nullptr).SetUp(); Xsum<double>(queue, nullptr).SetUp(); Xsum<float2>(queue, nullptr).SetUp(); Xsum<double2>(queue, nullptr).SetUp();
- Xamax<float>(queue, nullptr).SetUp(); Xamax<double>(queue, nullptr).SetUp(); Xamax<float2>(queue, nullptr).SetUp(); Xamax<double2>(queue, nullptr).SetUp();
- Xmax<float>(queue, nullptr).SetUp(); Xmax<double>(queue, nullptr).SetUp(); Xmax<float2>(queue, nullptr).SetUp(); Xmax<double2>(queue, nullptr).SetUp();
- Xmin<float>(queue, nullptr).SetUp(); Xmin<double>(queue, nullptr).SetUp(); Xmin<float2>(queue, nullptr).SetUp(); Xmin<double2>(queue, nullptr).SetUp();
+ Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr);
+ Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr);
+ Xscal<float>(queue, nullptr); Xscal<double>(queue, nullptr); Xscal<float2>(queue, nullptr); Xscal<double2>(queue, nullptr);
+ Xcopy<float>(queue, nullptr); Xcopy<double>(queue, nullptr); Xcopy<float2>(queue, nullptr); Xcopy<double2>(queue, nullptr);
+ Xaxpy<float>(queue, nullptr); Xaxpy<double>(queue, nullptr); Xaxpy<float2>(queue, nullptr); Xaxpy<double2>(queue, nullptr);
+ Xdot<float>(queue, nullptr); Xdot<double>(queue, nullptr);
+ Xdotu<float2>(queue, nullptr); Xdotu<double2>(queue, nullptr);
+ Xdotc<float2>(queue, nullptr); Xdotc<double2>(queue, nullptr);
+ Xnrm2<float>(queue, nullptr); Xnrm2<double>(queue, nullptr); Xnrm2<float2>(queue, nullptr); Xnrm2<double2>(queue, nullptr);
+ Xasum<float>(queue, nullptr); Xasum<double>(queue, nullptr); Xasum<float2>(queue, nullptr); Xasum<double2>(queue, nullptr);
+ Xsum<float>(queue, nullptr); Xsum<double>(queue, nullptr); Xsum<float2>(queue, nullptr); Xsum<double2>(queue, nullptr);
+ Xamax<float>(queue, nullptr); Xamax<double>(queue, nullptr); Xamax<float2>(queue, nullptr); Xamax<double2>(queue, nullptr);
+ Xmax<float>(queue, nullptr); Xmax<double>(queue, nullptr); Xmax<float2>(queue, nullptr); Xmax<double2>(queue, nullptr);
+ Xmin<float>(queue, nullptr); Xmin<double>(queue, nullptr); Xmin<float2>(queue, nullptr); Xmin<double2>(queue, nullptr);
// Runs all the level 2 set-up functions
- Xgemv<float>(queue, nullptr).SetUp(); Xgemv<double>(queue, nullptr).SetUp(); Xgemv<float2>(queue, nullptr).SetUp(); Xgemv<double2>(queue, nullptr).SetUp();
- Xgbmv<float>(queue, nullptr).SetUp(); Xgbmv<double>(queue, nullptr).SetUp(); Xgbmv<float2>(queue, nullptr).SetUp(); Xgbmv<double2>(queue, nullptr).SetUp();
- Xhemv<float2>(queue, nullptr).SetUp(); Xhemv<double2>(queue, nullptr).SetUp();
- Xhbmv<float2>(queue, nullptr).SetUp(); Xhbmv<double2>(queue, nullptr).SetUp();
- Xhpmv<float2>(queue, nullptr).SetUp(); Xhpmv<double2>(queue, nullptr).SetUp();
- Xsymv<float>(queue, nullptr).SetUp(); Xsymv<double>(queue, nullptr).SetUp();
- Xsbmv<float>(queue, nullptr).SetUp(); Xsbmv<double>(queue, nullptr).SetUp();
- Xspmv<float>(queue, nullptr).SetUp(); Xspmv<double>(queue, nullptr).SetUp();
- Xtrmv<float>(queue, nullptr).SetUp(); Xtrmv<double>(queue, nullptr).SetUp(); Xtrmv<float2>(queue, nullptr).SetUp(); Xtrmv<double2>(queue, nullptr).SetUp();
- Xtbmv<float>(queue, nullptr).SetUp(); Xtbmv<double>(queue, nullptr).SetUp(); Xtbmv<float2>(queue, nullptr).SetUp(); Xtbmv<double2>(queue, nullptr).SetUp();
- Xtpmv<float>(queue, nullptr).SetUp(); Xtpmv<double>(queue, nullptr).SetUp(); Xtpmv<float2>(queue, nullptr).SetUp(); Xtpmv<double2>(queue, nullptr).SetUp();
- Xger<float>(queue, nullptr).SetUp(); Xger<double>(queue, nullptr).SetUp();
- Xgeru<float2>(queue, nullptr).SetUp(); Xgeru<double2>(queue, nullptr).SetUp();
- Xgerc<float2>(queue, nullptr).SetUp(); Xgerc<double2>(queue, nullptr).SetUp();
- Xher<float2,float>(queue, nullptr).SetUp(); Xher<double2,double>(queue, nullptr).SetUp();
- Xhpr<float2,float>(queue, nullptr).SetUp(); Xhpr<double2,double>(queue, nullptr).SetUp();
- Xher2<float2>(queue, nullptr).SetUp(); Xher2<double2>(queue, nullptr).SetUp();
- Xhpr2<float2>(queue, nullptr).SetUp(); Xhpr2<double2>(queue, nullptr).SetUp();
- Xsyr<float>(queue, nullptr).SetUp(); Xsyr<double>(queue, nullptr).SetUp();
- Xspr<float>(queue, nullptr).SetUp(); Xspr<double>(queue, nullptr).SetUp();
- Xsyr2<float>(queue, nullptr).SetUp(); Xsyr2<double>(queue, nullptr).SetUp();
- Xspr2<float>(queue, nullptr).SetUp(); Xspr2<double>(queue, nullptr).SetUp();
+ Xgemv<float>(queue, nullptr); Xgemv<double>(queue, nullptr); Xgemv<float2>(queue, nullptr); Xgemv<double2>(queue, nullptr);
+ Xgbmv<float>(queue, nullptr); Xgbmv<double>(queue, nullptr); Xgbmv<float2>(queue, nullptr); Xgbmv<double2>(queue, nullptr);
+ Xhemv<float2>(queue, nullptr); Xhemv<double2>(queue, nullptr);
+ Xhbmv<float2>(queue, nullptr); Xhbmv<double2>(queue, nullptr);
+ Xhpmv<float2>(queue, nullptr); Xhpmv<double2>(queue, nullptr);
+ Xsymv<float>(queue, nullptr); Xsymv<double>(queue, nullptr);
+ Xsbmv<float>(queue, nullptr); Xsbmv<double>(queue, nullptr);
+ Xspmv<float>(queue, nullptr); Xspmv<double>(queue, nullptr);
+ Xtrmv<float>(queue, nullptr); Xtrmv<double>(queue, nullptr); Xtrmv<float2>(queue, nullptr); Xtrmv<double2>(queue, nullptr);
+ Xtbmv<float>(queue, nullptr); Xtbmv<double>(queue, nullptr); Xtbmv<float2>(queue, nullptr); Xtbmv<double2>(queue, nullptr);
+ Xtpmv<float>(queue, nullptr); Xtpmv<double>(queue, nullptr); Xtpmv<float2>(queue, nullptr); Xtpmv<double2>(queue, nullptr);
+ Xger<float>(queue, nullptr); Xger<double>(queue, nullptr);
+ Xgeru<float2>(queue, nullptr); Xgeru<double2>(queue, nullptr);
+ Xgerc<float2>(queue, nullptr); Xgerc<double2>(queue, nullptr);
+ Xher<float2,float>(queue, nullptr); Xher<double2,double>(queue, nullptr);
+ Xhpr<float2,float>(queue, nullptr); Xhpr<double2,double>(queue, nullptr);
+ Xher2<float2>(queue, nullptr); Xher2<double2>(queue, nullptr);
+ Xhpr2<float2>(queue, nullptr); Xhpr2<double2>(queue, nullptr);
+ Xsyr<float>(queue, nullptr); Xsyr<double>(queue, nullptr);
+ Xspr<float>(queue, nullptr); Xspr<double>(queue, nullptr);
+ Xsyr2<float>(queue, nullptr); Xsyr2<double>(queue, nullptr);
+ Xspr2<float>(queue, nullptr); Xspr2<double>(queue, nullptr);
// Runs all the level 3 set-up functions
- Xgemm<float>(queue, nullptr).SetUp(); Xgemm<double>(queue, nullptr).SetUp(); Xgemm<float2>(queue, nullptr).SetUp(); Xgemm<double2>(queue, nullptr).SetUp();
- Xsymm<float>(queue, nullptr).SetUp(); Xsymm<double>(queue, nullptr).SetUp(); Xsymm<float2>(queue, nullptr).SetUp(); Xsymm<double2>(queue, nullptr).SetUp();
- Xhemm<float2>(queue, nullptr).SetUp(); Xhemm<double2>(queue, nullptr).SetUp();
- Xsyrk<float>(queue, nullptr).SetUp(); Xsyrk<double>(queue, nullptr).SetUp(); Xsyrk<float2>(queue, nullptr).SetUp(); Xsyrk<double2>(queue, nullptr).SetUp();
- Xherk<float2,float>(queue, nullptr).SetUp(); Xherk<double2,double>(queue, nullptr).SetUp();
- Xsyr2k<float>(queue, nullptr).SetUp(); Xsyr2k<double>(queue, nullptr).SetUp(); Xsyr2k<float2>(queue, nullptr).SetUp(); Xsyr2k<double2>(queue, nullptr).SetUp();
- Xher2k<float2,float>(queue, nullptr).SetUp(); Xher2k<double2,double>(queue, nullptr).SetUp();
- Xtrmm<float>(queue, nullptr).SetUp(); Xtrmm<double>(queue, nullptr).SetUp(); Xtrmm<float2>(queue, nullptr).SetUp(); Xtrmm<double2>(queue, nullptr).SetUp();
+ Xgemm<float>(queue, nullptr); Xgemm<double>(queue, nullptr); Xgemm<float2>(queue, nullptr); Xgemm<double2>(queue, nullptr);
+ Xsymm<float>(queue, nullptr); Xsymm<double>(queue, nullptr); Xsymm<float2>(queue, nullptr); Xsymm<double2>(queue, nullptr);
+ Xhemm<float2>(queue, nullptr); Xhemm<double2>(queue, nullptr);
+ Xsyrk<float>(queue, nullptr); Xsyrk<double>(queue, nullptr); Xsyrk<float2>(queue, nullptr); Xsyrk<double2>(queue, nullptr);
+ Xherk<float2,float>(queue, nullptr); Xherk<double2,double>(queue, nullptr);
+ Xsyr2k<float>(queue, nullptr); Xsyr2k<double>(queue, nullptr); Xsyr2k<float2>(queue, nullptr); Xsyr2k<double2>(queue, nullptr);
+ Xher2k<float2,float>(queue, nullptr); Xher2k<double2,double>(queue, nullptr);
+ Xtrmm<float>(queue, nullptr); Xtrmm<double>(queue, nullptr); Xtrmm<float2>(queue, nullptr); Xtrmm<double2>(queue, nullptr);
// Runs all the extra (non-BLAS) set-up functions
- Xomatcopy<float>(queue, nullptr).SetUp(); Xomatcopy<double>(queue, nullptr).SetUp(); Xomatcopy<float2>(queue, nullptr).SetUp(); Xomatcopy<double2>(queue, nullptr).SetUp();
+ Xomatcopy<float>(queue, nullptr); Xomatcopy<double>(queue, nullptr); Xomatcopy<float2>(queue, nullptr); Xomatcopy<double2>(queue, nullptr);
- } catch (...) { return StatusCode::kBuildProgramFailure; }
+ } catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
}
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index 9ea2c884..59e4cd16 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -15,7 +15,7 @@
#include "clblast_c.h"
#include "clblast.h"
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
@@ -26,735 +26,930 @@ using double2 = clblast::double2;
// =================================================================================================
// ROTG
-StatusCode CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset,
- cl_mem sb_buffer, const size_t sb_offset,
- cl_mem sc_buffer, const size_t sc_offset,
- cl_mem ss_buffer, const size_t ss_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Rotg<float>(sa_buffer, sa_offset,
- sb_buffer, sb_offset,
- sc_buffer, sc_offset,
- ss_buffer, ss_offset,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
- cl_mem sb_buffer, const size_t sb_offset,
- cl_mem sc_buffer, const size_t sc_offset,
- cl_mem ss_buffer, const size_t ss_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Rotg<double>(sa_buffer, sa_offset,
- sb_buffer, sb_offset,
- sc_buffer, sc_offset,
- ss_buffer, ss_offset,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset,
+ cl_mem sb_buffer, const size_t sb_offset,
+ cl_mem sc_buffer, const size_t sc_offset,
+ cl_mem ss_buffer, const size_t ss_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Rotg<float>(sa_buffer, sa_offset,
+ sb_buffer, sb_offset,
+ sc_buffer, sc_offset,
+ ss_buffer, ss_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
+ cl_mem sb_buffer, const size_t sb_offset,
+ cl_mem sc_buffer, const size_t sc_offset,
+ cl_mem ss_buffer, const size_t ss_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Rotg<double>(sa_buffer, sa_offset,
+ sb_buffer, sb_offset,
+ sc_buffer, sc_offset,
+ ss_buffer, ss_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// ROTMG
-StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
- cl_mem sd2_buffer, const size_t sd2_offset,
- cl_mem sx1_buffer, const size_t sx1_offset,
- const cl_mem sy1_buffer, const size_t sy1_offset,
- cl_mem sparam_buffer, const size_t sparam_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Rotmg<float>(sd1_buffer, sd1_offset,
- sd2_buffer, sd2_offset,
- sx1_buffer, sx1_offset,
- sy1_buffer, sy1_offset,
- sparam_buffer, sparam_offset,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
- cl_mem sd2_buffer, const size_t sd2_offset,
- cl_mem sx1_buffer, const size_t sx1_offset,
- const cl_mem sy1_buffer, const size_t sy1_offset,
- cl_mem sparam_buffer, const size_t sparam_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Rotmg<double>(sd1_buffer, sd1_offset,
- sd2_buffer, sd2_offset,
- sx1_buffer, sx1_offset,
- sy1_buffer, sy1_offset,
- sparam_buffer, sparam_offset,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
+ cl_mem sd2_buffer, const size_t sd2_offset,
+ cl_mem sx1_buffer, const size_t sx1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
+ cl_mem sparam_buffer, const size_t sparam_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Rotmg<float>(sd1_buffer, sd1_offset,
+ sd2_buffer, sd2_offset,
+ sx1_buffer, sx1_offset,
+ sy1_buffer, sy1_offset,
+ sparam_buffer, sparam_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
+ cl_mem sd2_buffer, const size_t sd2_offset,
+ cl_mem sx1_buffer, const size_t sx1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
+ cl_mem sparam_buffer, const size_t sparam_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Rotmg<double>(sd1_buffer, sd1_offset,
+ sd2_buffer, sd2_offset,
+ sx1_buffer, sx1_offset,
+ sy1_buffer, sy1_offset,
+ sparam_buffer, sparam_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// ROT
-StatusCode CLBlastSrot(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- const float cos,
- const float sin,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Rot(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- cos,
- sin,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDrot(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- const double cos,
- const double sin,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Rot(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- cos,
- sin,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSrot(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ const float cos,
+ const float sin,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Rot(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ cos,
+ sin,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDrot(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ const double cos,
+ const double sin,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Rot(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ cos,
+ sin,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// ROTM
-StatusCode CLBlastSrotm(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem sparam_buffer, const size_t sparam_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Rotm<float>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- sparam_buffer, sparam_offset,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDrotm(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem sparam_buffer, const size_t sparam_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Rotm<double>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- sparam_buffer, sparam_offset,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSrotm(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem sparam_buffer, const size_t sparam_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Rotm<float>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ sparam_buffer, sparam_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDrotm(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem sparam_buffer, const size_t sparam_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Rotm<double>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ sparam_buffer, sparam_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SWAP
-StatusCode CLBlastSswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Swap<float>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Swap<double>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Swap<float2>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Swap<double2>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHswap(const size_t n,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Swap<half>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Swap<float>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Swap<double>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Swap<float2>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Swap<double2>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHswap(const size_t n,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Swap<half>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SCAL
-StatusCode CLBlastSscal(const size_t n,
- const float alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Scal(n,
- alpha,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDscal(const size_t n,
- const double alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Scal(n,
- alpha,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCscal(const size_t n,
- const cl_float2 alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Scal(n,
- float2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZscal(const size_t n,
- const cl_double2 alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Scal(n,
- double2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHscal(const size_t n,
- const cl_half alpha,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Scal(n,
- alpha,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSscal(const size_t n,
+ const float alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Scal(n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDscal(const size_t n,
+ const double alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Scal(n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCscal(const size_t n,
+ const cl_float2 alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Scal(n,
+ float2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZscal(const size_t n,
+ const cl_double2 alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Scal(n,
+ double2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHscal(const size_t n,
+ const cl_half alpha,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Scal(n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// COPY
-StatusCode CLBlastScopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Copy<float>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Copy<double>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Copy<float2>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Copy<double2>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHcopy(const size_t n,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Copy<half>(n,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastScopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Copy<float>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Copy<double>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Copy<float2>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Copy<double2>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHcopy(const size_t n,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Copy<half>(n,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// AXPY
-StatusCode CLBlastSaxpy(const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Axpy(n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDaxpy(const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Axpy(n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCaxpy(const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Axpy(n,
- float2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZaxpy(const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Axpy(n,
- double2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHaxpy(const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Axpy(n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSaxpy(const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Axpy(n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDaxpy(const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Axpy(n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCaxpy(const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Axpy(n,
+ float2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZaxpy(const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Axpy(n,
+ double2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHaxpy(const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Axpy(n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// DOT
-StatusCode CLBlastSdot(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Dot<float>(n,
- dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDdot(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Dot<double>(n,
- dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHdot(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Dot<half>(n,
- dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSdot(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Dot<float>(n,
+ dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDdot(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Dot<double>(n,
+ dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHdot(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Dot<half>(n,
+ dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// DOTU
-StatusCode CLBlastCdotu(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Dotu<float2>(n,
- dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZdotu(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Dotu<double2>(n,
- dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastCdotu(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Dotu<float2>(n,
+ dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZdotu(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Dotu<double2>(n,
+ dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// DOTC
-StatusCode CLBlastCdotc(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Dotc<float2>(n,
- dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZdotc(const size_t n,
- cl_mem dot_buffer, const size_t dot_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Dotc<double2>(n,
- dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastCdotc(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Dotc<float2>(n,
+ dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZdotc(const size_t n,
+ cl_mem dot_buffer, const size_t dot_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Dotc<double2>(n,
+ dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// NRM2
-StatusCode CLBlastSnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Nrm2<float>(n,
- nrm2_buffer, nrm2_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Nrm2<double>(n,
- nrm2_buffer, nrm2_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastScnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Nrm2<float2>(n,
- nrm2_buffer, nrm2_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDznrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Nrm2<double2>(n,
- nrm2_buffer, nrm2_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHnrm2(const size_t n,
- cl_mem nrm2_buffer, const size_t nrm2_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Nrm2<half>(n,
- nrm2_buffer, nrm2_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Nrm2<float>(n,
+ nrm2_buffer, nrm2_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Nrm2<double>(n,
+ nrm2_buffer, nrm2_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastScnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Nrm2<float2>(n,
+ nrm2_buffer, nrm2_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDznrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Nrm2<double2>(n,
+ nrm2_buffer, nrm2_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHnrm2(const size_t n,
+ cl_mem nrm2_buffer, const size_t nrm2_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Nrm2<half>(n,
+ nrm2_buffer, nrm2_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// ASUM
-StatusCode CLBlastSasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Asum<float>(n,
- asum_buffer, asum_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Asum<double>(n,
- asum_buffer, asum_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastScasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Asum<float2>(n,
- asum_buffer, asum_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDzasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Asum<double2>(n,
- asum_buffer, asum_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHasum(const size_t n,
- cl_mem asum_buffer, const size_t asum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Asum<half>(n,
- asum_buffer, asum_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Asum<float>(n,
+ asum_buffer, asum_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Asum<double>(n,
+ asum_buffer, asum_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastScasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Asum<float2>(n,
+ asum_buffer, asum_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDzasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Asum<double2>(n,
+ asum_buffer, asum_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHasum(const size_t n,
+ cl_mem asum_buffer, const size_t asum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Asum<half>(n,
+ asum_buffer, asum_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SUM
-StatusCode CLBlastSsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Sum<float>(n,
- sum_buffer, sum_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Sum<double>(n,
- sum_buffer, sum_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastScsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Sum<float2>(n,
- sum_buffer, sum_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDzsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Sum<double2>(n,
- sum_buffer, sum_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHsum(const size_t n,
- cl_mem sum_buffer, const size_t sum_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Sum<half>(n,
- sum_buffer, sum_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Sum<float>(n,
+ sum_buffer, sum_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Sum<double>(n,
+ sum_buffer, sum_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastScsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Sum<float2>(n,
+ sum_buffer, sum_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDzsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Sum<double2>(n,
+ sum_buffer, sum_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHsum(const size_t n,
+ cl_mem sum_buffer, const size_t sum_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Sum<half>(n,
+ sum_buffer, sum_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// AMAX
-StatusCode CLBlastiSamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Amax<float>(n,
- imax_buffer, imax_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiDamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Amax<double>(n,
- imax_buffer, imax_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiCamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Amax<float2>(n,
- imax_buffer, imax_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiZamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Amax<double2>(n,
- imax_buffer, imax_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiHamax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Amax<half>(n,
- imax_buffer, imax_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastiSamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Amax<float>(n,
+ imax_buffer, imax_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiDamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Amax<double>(n,
+ imax_buffer, imax_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiCamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Amax<float2>(n,
+ imax_buffer, imax_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiZamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Amax<double2>(n,
+ imax_buffer, imax_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiHamax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Amax<half>(n,
+ imax_buffer, imax_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// MAX
-StatusCode CLBlastiSmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Max<float>(n,
- imax_buffer, imax_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiDmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Max<double>(n,
- imax_buffer, imax_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiCmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Max<float2>(n,
- imax_buffer, imax_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiZmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Max<double2>(n,
- imax_buffer, imax_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiHmax(const size_t n,
- cl_mem imax_buffer, const size_t imax_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Max<half>(n,
- imax_buffer, imax_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastiSmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Max<float>(n,
+ imax_buffer, imax_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiDmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Max<double>(n,
+ imax_buffer, imax_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiCmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Max<float2>(n,
+ imax_buffer, imax_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiZmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Max<double2>(n,
+ imax_buffer, imax_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiHmax(const size_t n,
+ cl_mem imax_buffer, const size_t imax_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Max<half>(n,
+ imax_buffer, imax_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// MIN
-StatusCode CLBlastiSmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Min<float>(n,
- imin_buffer, imin_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiDmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Min<double>(n,
- imin_buffer, imin_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiCmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Min<float2>(n,
- imin_buffer, imin_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiZmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Min<double2>(n,
- imin_buffer, imin_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastiHmin(const size_t n,
- cl_mem imin_buffer, const size_t imin_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Min<half>(n,
- imin_buffer, imin_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastiSmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Min<float>(n,
+ imin_buffer, imin_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiDmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Min<double>(n,
+ imin_buffer, imin_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiCmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Min<float2>(n,
+ imin_buffer, imin_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiZmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Min<double2>(n,
+ imin_buffer, imin_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastiHmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Min<half>(n,
+ imin_buffer, imin_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// =================================================================================================
@@ -762,1365 +957,1602 @@ StatusCode CLBlastiHmin(const size_t n,
// =================================================================================================
// GEMV
-StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gemv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gemv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gemv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- float2{beta.s[0], beta.s[1]},
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gemv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- double2{beta.s[0], beta.s[1]},
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gemv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ float2{beta.s[0], beta.s[1]},
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ double2{beta.s[0], beta.s[1]},
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// GBMV
-StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n, kl, ku,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n, kl, ku,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n, kl, ku,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- float2{beta.s[0], beta.s[1]},
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n, kl, ku,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- double2{beta.s[0], beta.s[1]},
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n, kl, ku,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n, kl, ku,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n, kl, ku,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n, kl, ku,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ float2{beta.s[0], beta.s[1]},
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n, kl, ku,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ double2{beta.s[0], beta.s[1]},
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n, kl, ku,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HEMV
-StatusCode CLBlastChemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hemv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- float2{beta.s[0], beta.s[1]},
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hemv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- double2{beta.s[0], beta.s[1]},
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ float2{beta.s[0], beta.s[1]},
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ double2{beta.s[0], beta.s[1]},
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HBMV
-StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hbmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n, k,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- float2{beta.s[0], beta.s[1]},
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hbmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n, k,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- double2{beta.s[0], beta.s[1]},
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n, k,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ float2{beta.s[0], beta.s[1]},
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n, k,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ double2{beta.s[0], beta.s[1]},
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HPMV
-StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_float2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hpmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- float2{alpha.s[0], alpha.s[1]},
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- float2{beta.s[0], beta.s[1]},
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_double2 beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hpmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- double2{alpha.s[0], alpha.s[1]},
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- double2{beta.s[0], beta.s[1]},
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_float2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hpmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ float2{alpha.s[0], alpha.s[1]},
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ float2{beta.s[0], beta.s[1]},
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_double2 beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hpmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ double2{alpha.s[0], alpha.s[1]},
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ double2{beta.s[0], beta.s[1]},
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SYMV
-StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Symv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Symv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Symv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Symv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Symv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Symv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SBMV
-StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Sbmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Sbmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Sbmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Sbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Sbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Sbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SPMV
-StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const float beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Spmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const double beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Spmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem ap_buffer, const size_t ap_offset,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_half beta,
- cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Spmv(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- beta,
- y_buffer, y_offset, y_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const float beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Spmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const double beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Spmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_half beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Spmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ beta,
+ y_buffer, y_offset, y_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// TRMV
-StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trmv<float>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trmv<double>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trmv<float2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trmv<double2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trmv<half>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trmv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trmv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trmv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trmv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trmv<half>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// TBMV
-StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tbmv<float>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n, k,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tbmv<double>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n, k,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tbmv<float2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n, k,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tbmv<double2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n, k,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tbmv<half>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n, k,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tbmv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tbmv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tbmv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tbmv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tbmv<half>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// TPMV
-StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tpmv<float>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tpmv<double>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tpmv<float2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tpmv<double2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tpmv<half>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tpmv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tpmv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tpmv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tpmv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tpmv<half>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// TRSV
-StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trsv<float>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trsv<double>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trsv<float2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trsv<double2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trsv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trsv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trsv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trsv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// TBSV
-StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tbsv<float>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n, k,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tbsv<double>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n, k,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tbsv<float2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n, k,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tbsv<double2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n, k,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastStbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tbsv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tbsv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tbsv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tbsv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// TPSV
-StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tpsv<float>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tpsv<double>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tpsv<float2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const cl_mem ap_buffer, const size_t ap_offset,
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Tpsv<double2>(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- n,
- ap_buffer, ap_offset,
- x_buffer, x_offset, x_inc,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastStpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tpsv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tpsv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tpsv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Tpsv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer, ap_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// GER
-StatusCode CLBlastSger(const Layout layout,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Ger(static_cast<clblast::Layout>(layout),
- m, n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDger(const Layout layout,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Ger(static_cast<clblast::Layout>(layout),
- m, n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHger(const Layout layout,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Ger(static_cast<clblast::Layout>(layout),
- m, n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSger(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Ger(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDger(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Ger(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHger(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Ger(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// GERU
-StatusCode CLBlastCgeru(const Layout layout,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Geru(static_cast<clblast::Layout>(layout),
- m, n,
- float2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZgeru(const Layout layout,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Geru(static_cast<clblast::Layout>(layout),
- m, n,
- double2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastCgeru(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Geru(static_cast<clblast::Layout>(layout),
+ m, n,
+ float2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZgeru(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Geru(static_cast<clblast::Layout>(layout),
+ m, n,
+ double2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// GERC
-StatusCode CLBlastCgerc(const Layout layout,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gerc(static_cast<clblast::Layout>(layout),
- m, n,
- float2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZgerc(const Layout layout,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gerc(static_cast<clblast::Layout>(layout),
- m, n,
- double2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastCgerc(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gerc(static_cast<clblast::Layout>(layout),
+ m, n,
+ float2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZgerc(const CLBlastLayout layout,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gerc(static_cast<clblast::Layout>(layout),
+ m, n,
+ double2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HER
-StatusCode CLBlastCher(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Her(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZher(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Her(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Her(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Her(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HPR
-StatusCode CLBlastChpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hpr(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- ap_buffer, ap_offset,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hpr(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- ap_buffer, ap_offset,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hpr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ ap_buffer, ap_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hpr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ ap_buffer, ap_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HER2
-StatusCode CLBlastCher2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Her2(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- float2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZher2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Her2(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- double2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Her2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ float2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Her2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ double2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HPR2
-StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_float2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hpr2(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- float2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- ap_buffer, ap_offset,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_double2 alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hpr2(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- double2{alpha.s[0], alpha.s[1]},
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- ap_buffer, ap_offset,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hpr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ float2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ ap_buffer, ap_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hpr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ double2{alpha.s[0], alpha.s[1]},
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ ap_buffer, ap_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SYR
-StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SPR
-StatusCode CLBlastSspr(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Spr(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- ap_buffer, ap_offset,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDspr(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Spr(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- ap_buffer, ap_offset,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHspr(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Spr(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- ap_buffer, ap_offset,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Spr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ ap_buffer, ap_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Spr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ ap_buffer, ap_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Spr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ ap_buffer, ap_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SYR2
-StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr2(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr2(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr2(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SPR2
-StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const float alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Spr2(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- ap_buffer, ap_offset,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const double alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Spr2(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- ap_buffer, ap_offset,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const cl_half alpha,
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
- const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
- cl_mem ap_buffer, const size_t ap_offset,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Spr2(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- n,
- alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- ap_buffer, ap_offset,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Spr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ ap_buffer, ap_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Spr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ ap_buffer, ap_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Spr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ ap_buffer, ap_offset,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// =================================================================================================
@@ -2128,707 +2560,815 @@ StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
// =================================================================================================
// GEMM
-StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gemm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Transpose>(b_transpose),
- m, n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gemm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Transpose>(b_transpose),
- m, n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gemm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Transpose>(b_transpose),
- m, n, k,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- float2{beta.s[0], beta.s[1]},
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gemm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Transpose>(b_transpose),
- m, n, k,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- double2{beta.s[0], beta.s[1]},
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Gemm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Transpose>(b_transpose),
- m, n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ float2{beta.s[0], beta.s[1]},
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ double2{beta.s[0], beta.s[1]},
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SYMM
-StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Symm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Symm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Symm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- m, n,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- float2{beta.s[0], beta.s[1]},
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Symm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- m, n,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- double2{beta.s[0], beta.s[1]},
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Symm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ float2{beta.s[0], beta.s[1]},
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ double2{beta.s[0], beta.s[1]},
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HEMM
-StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hemm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- m, n,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- float2{beta.s[0], beta.s[1]},
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Hemm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- m, n,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- double2{beta.s[0], beta.s[1]},
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ float2{beta.s[0], beta.s[1]},
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Hemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ double2{beta.s[0], beta.s[1]},
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SYRK
-StatusCode CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syrk(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syrk(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syrk(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- n, k,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- float2{beta.s[0], beta.s[1]},
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syrk(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- n, k,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- double2{beta.s[0], beta.s[1]},
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syrk(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ float2{beta.s[0], beta.s[1]},
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ double2{beta.s[0], beta.s[1]},
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HERK
-StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Herk(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Herk(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Herk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Herk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SYR2K
-StatusCode CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(ab_transpose),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(ab_transpose),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_float2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(ab_transpose),
- n, k,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- float2{beta.s[0], beta.s[1]},
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_double2 beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(ab_transpose),
- n, k,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- double2{beta.s[0], beta.s[1]},
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const cl_half beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(ab_transpose),
- n, k,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_float2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ float2{beta.s[0], beta.s[1]},
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_double2 beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ double2{beta.s[0], beta.s[1]},
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const cl_half beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HER2K
-StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const float beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Her2k(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(ab_transpose),
- n, k,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- const double beta,
- cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Her2k(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(ab_transpose),
- n, k,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const float beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Her2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const size_t n, const size_t k,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const double beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Her2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// TRMM
-StatusCode CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trmm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trmm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trmm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- m, n,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trmm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- m, n,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trmm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// TRSM
-StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trsm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trsm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trsm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- m, n,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trsm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- m, n,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Trsm(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Side>(side),
- static_cast<clblast::Triangle>(triangle),
- static_cast<clblast::Transpose>(a_transpose),
- static_cast<clblast::Diagonal>(diagonal),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// =================================================================================================
@@ -2836,92 +3376,111 @@ StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle tri
// =================================================================================================
// OMATCOPY
-StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const float alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const double alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_float2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n,
- float2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_double2 alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n,
- double2{alpha.s[0], alpha.s[1]},
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
-}
-StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const cl_half alpha,
- const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
- cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
- cl_command_queue* queue, cl_event* event) {
- auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
- static_cast<clblast::Transpose>(a_transpose),
- m, n,
- alpha,
- a_buffer, a_offset, a_ld,
- b_buffer, b_offset, b_ld,
- queue, event);
- return static_cast<StatusCode>(status);
+CLBlastStatusCode CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>(
+ clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// =================================================================================================
// Clears the cache of stored binaries
-StatusCode CLBlastClearCache() {
- return static_cast<StatusCode>(clblast::ClearCache());
+CLBlastStatusCode CLBlastClearCache() {
+ try {
+ return static_cast<CLBlastStatusCode>(clblast::ClearCache());
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// Fills the cache with binaries for a specific device
-StatusCode CLBlastFillCache(const cl_device_id device) {
- return static_cast<StatusCode>(clblast::FillCache(device));
+CLBlastStatusCode CLBlastFillCache(const cl_device_id device) {
+ try {
+ return static_cast<CLBlastStatusCode>(clblast::FillCache(device));
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// =================================================================================================
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index aaa76cb4..d306bb87 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -41,8 +41,8 @@
#include <string> // std::string
#include <vector> // std::vector
#include <memory> // std::shared_ptr
-#include <stdexcept> // std::runtime_error
#include <numeric> // std::accumulate
+#include <cstring> // std::strlen
// OpenCL
#if defined(__APPLE__) || defined(__MACOSX)
@@ -51,20 +51,41 @@
#include <CL/opencl.h>
#endif
+// Exception classes
+#include "cxpp11_common.hpp"
+
namespace clblast {
// =================================================================================================
-// Error occurred in the C++11 OpenCL header (this file)
-inline void Error(const std::string &message) {
- throw std::runtime_error("Internal OpenCL error: "+message);
-}
+// Represents a runtime error returned by an OpenCL API function
+class CLError : public ErrorCode<DeviceError, cl_int> {
+ public:
+ explicit CLError(cl_int status, const std::string &where):
+ ErrorCode(status,
+ where,
+ "OpenCL error: " + where + ": " + std::to_string(static_cast<int>(status))) {
+ }
-// Error occurred in OpenCL
-inline void CheckError(const cl_int status) {
- if (status != CL_SUCCESS) {
- throw std::runtime_error("Internal OpenCL error: "+std::to_string(status));
+ static void Check(const cl_int status, const std::string &where) {
+ if (status != CL_SUCCESS) {
+ throw CLError(status, where);
+ }
}
-}
+
+ static void CheckDtor(const cl_int status, const std::string &where) {
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "CLBlast: %s (ignoring)\n", CLError(status, where).what());
+ }
+ }
+};
+
+// =================================================================================================
+
+// Error occurred in OpenCL
+#define CheckError(call) CLError::Check(call, CLError::TrimCallString(#call))
+
+// Error occurred in OpenCL (no-exception version for destructors)
+#define CheckErrorDtor(call) CLError::CheckDtor(call, CLError::TrimCallString(#call))
// =================================================================================================
@@ -81,7 +102,7 @@ class Event {
// Regular constructor with memory management
explicit Event():
event_(new cl_event, [](cl_event* e) {
- if (*e) { CheckError(clReleaseEvent(*e)); }
+ if (*e) { CheckErrorDtor(clReleaseEvent(*e)); }
delete e;
}) {
*event_ = nullptr;
@@ -92,16 +113,17 @@ class Event {
CheckError(clWaitForEvents(1, &(*event_)));
}
- // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
- // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
- // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
+ // Retrieves the elapsed time of the last recorded event.
+ // (Note that there is a bug in Apple's OpenCL implementation of the 'clGetEventProfilingInfo' function:
+ // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx)
+ // However, in our case the reply size is fixed to be cl_ulong, so we are not affected.
float GetElapsedTime() const {
WaitForCompletion();
const auto bytes = sizeof(cl_ulong);
auto time_start = cl_ulong{0};
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
+ CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr));
auto time_end = cl_ulong{0};
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
+ CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr));
return static_cast<float>(time_end - time_start) * 1.0e-6f;
}
@@ -130,10 +152,14 @@ class Platform {
explicit Platform(const size_t platform_id) {
auto num_platforms = cl_uint{0};
CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
- if (num_platforms == 0) { Error("no platforms found"); }
+ if (num_platforms == 0) {
+ throw RuntimeError("Platform: no platforms found");
+ }
+ if (platform_id >= num_platforms) {
+ throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id));
+ }
auto platforms = std::vector<cl_platform_id>(num_platforms);
CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr));
- if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
platform_ = platforms[platform_id];
}
@@ -173,11 +199,16 @@ class Device {
// Initialize the device. Note that this constructor can throw exceptions!
explicit Device(const Platform &platform, const size_t device_id) {
auto num_devices = platform.NumDevices();
- if (num_devices == 0) { Error("no devices found"); }
+ if (num_devices == 0) {
+ throw RuntimeError("Device: no devices found");
+ }
+ if (device_id >= num_devices) {
+ throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
+ }
+
auto devices = std::vector<cl_device_id>(num_devices);
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
devices.data(), nullptr));
- if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
device_ = devices[device_id];
}
@@ -282,7 +313,8 @@ class Device {
auto result = std::string{};
result.resize(bytes);
CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
- return std::string{result.c_str()}; // Removes any trailing '\0'-characters
+ result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters
+ return result;
}
};
@@ -300,11 +332,11 @@ class Context {
// Regular constructor with memory management
explicit Context(const Device &device):
- context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) {
+ context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) {
auto status = CL_SUCCESS;
const cl_device_id dev = device();
*context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
- CheckError(status);
+ CLError::Check(status, "clCreateContext");
}
// Accessor to the private data-member
@@ -329,18 +361,18 @@ class Program {
// Source-based constructor with memory management
explicit Program(const Context &context, std::string source):
- program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
+ program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(source.length()),
source_(std::move(source)),
source_ptr_(&source_[0]) {
auto status = CL_SUCCESS;
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
- CheckError(status);
+ CLError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
explicit Program(const Device &device, const Context &context, const std::string& binary):
- program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
+ program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(binary.length()),
source_(binary),
source_ptr_(&source_[0]) {
@@ -350,25 +382,15 @@ class Program {
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
&status1, &status2);
- CheckError(status1);
- CheckError(status2);
+ CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
+ CLError::Check(status2, "clCreateProgramWithBinary");
}
- // Compiles the device program and returns whether or not there where any warnings/errors
+ // Compiles the device program; throws a CLError if the build does not succeed
- BuildStatus Build(const Device &device, std::vector<std::string> &options) {
+ void Build(const Device &device, std::vector<std::string> &options) {
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
const cl_device_id dev = device();
- auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr);
- if (status == CL_BUILD_PROGRAM_FAILURE) {
- return BuildStatus::kError;
- }
- else if (status == CL_INVALID_BINARY) {
- return BuildStatus::kInvalid;
- }
- else {
- CheckError(status);
- return BuildStatus::kSuccess;
- }
+ CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
}
// Retrieves the warning/error message from the compiler (if any)
@@ -416,7 +438,7 @@ class Queue {
// Regular constructor with memory management
explicit Queue(const Context &context, const Device &device):
- queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
+ queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s));
delete s; }) {
auto status = CL_SUCCESS;
#ifdef CL_VERSION_2_0
@@ -425,15 +447,17 @@ class Queue {
{
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
+ CLError::Check(status, "clCreateCommandQueueWithProperties");
}
else
{
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+ CLError::Check(status, "clCreateCommandQueue");
}
#else
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+ CLError::Check(status, "clCreateCommandQueue");
#endif
- CheckError(status);
}
// Synchronizes the queue
@@ -525,7 +549,7 @@ class Buffer {
if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
auto status = CL_SUCCESS;
*buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
- CheckError(status);
+ CLError::Check(status, "clCreateBuffer");
}
// As above, but now with read/write access as a default
@@ -546,18 +570,24 @@ class Buffer {
// Copies from device to host: reading the device buffer a-synchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
- if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
+ if (access_ == BufferAccess::kWriteOnly) {
+ throw LogicError("Buffer: reading from a write-only buffer");
+ }
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
- if (host.size() < size) { Error("target host buffer is too small"); }
+ if (host.size() < size) {
+ throw LogicError("Buffer: target host buffer is too small");
+ }
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
- if (host.size() < size) { Error("target host buffer is too small"); }
+ if (host.size() < size) {
+ throw LogicError("Buffer: target host buffer is too small");
+ }
ReadAsync(queue, size, host.data(), offset);
}
@@ -577,8 +607,12 @@ class Buffer {
// Copies from host to device: writing the device buffer a-synchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
- if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
- if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
+ if (access_ == BufferAccess::kReadOnly) {
+ throw LogicError("Buffer: writing to a read-only buffer");
+ }
+ if (GetSize() < (offset+size)*sizeof(T)) {
+ throw LogicError("Buffer: target device buffer is too small");
+ }
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
@@ -644,10 +678,10 @@ class Kernel {
// Regular constructor with memory management
explicit Kernel(const Program &program, const std::string &name):
- kernel_(new cl_kernel, [](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) {
+ kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
- CheckError(status);
+ CLError::Check(status, "clCreateKernel");
}
// Sets a kernel argument at the indicated position
diff --git a/src/cxpp11_common.hpp b/src/cxpp11_common.hpp
new file mode 100644
index 00000000..6ac008be
--- /dev/null
+++ b/src/cxpp11_common.hpp
@@ -0,0 +1,114 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Ivan Shapovalov <intelfx@intelfx.name>
+//
+// This file contains exception classes corresponding to 'clpp11.hpp'. It is also part of the
+// CLCudaAPI project. See 'clpp11.hpp' for more details.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CXPP11_COMMON_H_
+#define CLBLAST_CXPP11_COMMON_H_
+
+#include <cstring> // std::strchr
+#include <string> // std::string
+#include <stdexcept> // std::runtime_error, std::logic_error
+#include <utility> // std::forward
+
+namespace clblast {
+// =================================================================================================
+
+// Basic exception class: represents an error that happened inside our code
+// (as opposed to an error in the C++ runtime)
+template <typename Base>
+class Error : public Base {
+ public:
+  // Perfect forwarding of the constructor since "using Base::Base" is not supported by VS 2013
+  template <typename... Args>
+  Error(Args&&... args):
+      Base(std::forward<Args>(args)...) {
+  }
+};
+
+// =================================================================================================
+
+// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function)
+class DeviceError : public Error<std::runtime_error> {
+ public:
+  // Perfect forwarding of the constructor since "using Error<std::runtime_error>::Error" is not
+  // supported by VS 2013
+  template <typename... Args>
+  DeviceError(Args&&... args):
+      Error<std::runtime_error>(std::forward<Args>(args)...) {
+  }
+
+  // Returns the part of 'where' that precedes its first '(' (or all of 'where' if it contains
+  // none), e.g. "clFinish(queue)" becomes "clFinish"
+  static std::string TrimCallString(const char *where) {
+    const char *paren = std::strchr(where, '(');
+    if (paren) {
+      return std::string(where, paren);
+    } else {
+      return std::string(where);
+    }
+  }
+};
+
+// =================================================================================================
+
+// Represents a generic runtime error (aka environmental problem)
+class RuntimeError : public Error<std::runtime_error> {
+ public:
+  explicit RuntimeError(const std::string &reason):
+      Error("Run-time error: " + reason) {
+  }
+};
+
+// =================================================================================================
+
+// Represents a generic logic error (aka failed assertion)
+class LogicError : public Error<std::logic_error> {
+ public:
+  explicit LogicError(const std::string &reason):
+      Error("Internal logic error: " + reason) {
+  }
+};
+
+// =================================================================================================
+
+// Internal exception base class with a status field and a subclass-specific "details" field
+// which can be used to recreate an exception
+template <typename Base, typename Status>
+class ErrorCode : public Base {
+ public:
+  // 'reason' is forwarded to the 'Base' constructor; 'status' and 'details' are stored as-is
+  ErrorCode(Status status, const std::string &details, const std::string &reason):
+      Base(reason),
+      status_(status),
+      details_(details) {
+  }
+
+  Status status() const {
+    return status_;
+  }
+
+  const std::string& details() const {
+    return details_;
+  }
+
+ private:
+  const Status status_;
+  const std::string details_;
+};
+
+// =================================================================================================
+
+} // namespace clblast
+
+// CLBLAST_CXPP11_COMMON_H_
+#endif
diff --git a/src/database/database.cpp b/src/database/database.cpp
index 2340a89c..cf548d46 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "database/database.hpp"
#include "database/kernels/xaxpy.hpp"
@@ -92,7 +92,7 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
}
}
- if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); }
+ if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
}
}
diff --git a/src/database/database.hpp b/src/database/database.hpp
index 8a3e7040..7c05a20b 100644
--- a/src/database/database.hpp
+++ b/src/database/database.hpp
@@ -21,7 +21,7 @@
#include <vector>
#include <unordered_map>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
index 16aa6b3f..3a569bf3 100644
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@@ -43,6 +43,7 @@ const Database::DatabaseEntry CopySingle = {
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+ { "Tonga", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
}
},
@@ -89,6 +90,7 @@ const Database::DatabaseEntry CopySingle = {
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
+ { "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
@@ -114,6 +116,7 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+ { "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
@@ -152,6 +155,7 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@@ -177,6 +181,7 @@ const Database::DatabaseEntry CopyDouble = {
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
@@ -211,15 +216,16 @@ const Database::DatabaseEntry CopyDouble = {
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+ { "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
}
@@ -236,6 +242,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Tonga", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
@@ -270,6 +277,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
index 6c5e0c2f..62a22014 100644
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@@ -43,7 +43,8 @@ const Database::DatabaseEntry PadSingle = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // ARM GPUs
@@ -89,6 +90,7 @@ const Database::DatabaseEntry PadSingle = {
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "GeForce GTX TITAN Black", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -114,6 +116,7 @@ const Database::DatabaseEntry PadComplexSingle = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
@@ -160,10 +163,11 @@ const Database::DatabaseEntry PadComplexSingle = {
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // Default
@@ -185,7 +189,8 @@ const Database::DatabaseEntry PadDouble = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Tonga", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@@ -219,6 +224,7 @@ const Database::DatabaseEntry PadDouble = {
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@@ -244,7 +250,8 @@ const Database::DatabaseEntry PadComplexDouble = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tonga", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@@ -278,6 +285,7 @@ const Database::DatabaseEntry PadComplexDouble = {
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
index 4003ec6d..609ccd45 100644
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@@ -43,6 +43,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+ { "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@@ -89,6 +90,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "GeForce GTX TITAN Black", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
@@ -114,6 +116,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@@ -160,6 +163,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@@ -185,6 +189,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@@ -219,6 +224,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@@ -244,6 +250,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+ { "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@@ -278,6 +285,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
index c5ea50c2..54da15a7 100644
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@@ -43,7 +43,8 @@ const Database::DatabaseEntry TransposeSingle = {
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+ { "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
{ // ARM GPUs
@@ -89,6 +90,7 @@ const Database::DatabaseEntry TransposeSingle = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "GeForce GTX TITAN Black", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@@ -114,6 +116,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "Tonga", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
@@ -154,6 +157,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@@ -162,7 +166,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}
@@ -179,6 +183,7 @@ const Database::DatabaseEntry TransposeDouble = {
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+ { "Tonga", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
@@ -213,6 +218,7 @@ const Database::DatabaseEntry TransposeDouble = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@@ -238,7 +244,8 @@ const Database::DatabaseEntry TransposeComplexDouble = {
{ "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@@ -266,6 +273,7 @@ const Database::DatabaseEntry TransposeComplexDouble = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
index 60471bef..53d2446f 100644
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@@ -43,7 +43,8 @@ const Database::DatabaseEntry XaxpySingle = {
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
- { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
+ { "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
+ { "default", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
}
},
{ // ARM GPUs
@@ -89,6 +90,7 @@ const Database::DatabaseEntry XaxpySingle = {
{ "GeForce GTX 750 Ti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
+ { "GeForce GTX TITAN Black", { {"VW",4}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
@@ -97,7 +99,7 @@ const Database::DatabaseEntry XaxpySingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
+ { "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
}
@@ -114,6 +116,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@@ -160,6 +163,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+ { "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@@ -185,6 +189,7 @@ const Database::DatabaseEntry XaxpyDouble = {
{ "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "Tonga", { {"VW",1}, {"WGS",128}, {"WPT",4} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
@@ -219,15 +224,16 @@ const Database::DatabaseEntry XaxpyDouble = {
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
+ { "GeForce GTX TITAN Black", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "default", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
}
},
}
@@ -244,6 +250,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
{ "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "Tonga", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@@ -278,6 +285,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
+ { "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
index 686b2839..4712cc04 100644
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@@ -42,6 +42,7 @@ const Database::DatabaseEntry XdotSingle = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
+ { "Tonga", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
@@ -72,6 +73,7 @@ const Database::DatabaseEntry XdotSingle = {
{ "GeForce GTX 750", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",512}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",256} } },
@@ -95,7 +97,8 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ "Oland", { {"WGS1",128}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
- { "default", { {"WGS1",128}, {"WGS2",32} } },
+ { "Tonga", { {"WGS1",256}, {"WGS2",64} } },
+ { "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // Intel CPUs
@@ -125,6 +128,7 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
@@ -148,7 +152,8 @@ const Database::DatabaseEntry XdotDouble = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
- { "default", { {"WGS1",128}, {"WGS2",32} } },
+ { "Tonga", { {"WGS1",128}, {"WGS2",64} } },
+ { "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Intel CPUs
@@ -167,9 +172,10 @@ const Database::DatabaseEntry XdotDouble = {
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",256} } },
{ "GeForce GTX 750 Ti", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
- { "default", { {"WGS1",256}, {"WGS2",64} } },
+ { "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Default
@@ -190,6 +196,7 @@ const Database::DatabaseEntry XdotComplexDouble = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
+ { "Tonga", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
@@ -209,6 +216,7 @@ const Database::DatabaseEntry XdotComplexDouble = {
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
index 8303fa83..aa95b1f6 100644
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@@ -36,6 +36,7 @@ const Database::DatabaseEntry XgemmSingle = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
+ { "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
@@ -82,6 +83,7 @@ const Database::DatabaseEntry XgemmSingle = {
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
+ { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
@@ -107,6 +109,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
{ "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+ { "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
@@ -153,6 +156,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -178,6 +182,7 @@ const Database::DatabaseEntry XgemmDouble = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
+ { "Tonga", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
@@ -212,6 +217,7 @@ const Database::DatabaseEntry XgemmDouble = {
{ "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+ { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
@@ -237,6 +243,7 @@ const Database::DatabaseEntry XgemmComplexDouble = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
@@ -270,6 +277,7 @@ const Database::DatabaseEntry XgemmComplexDouble = {
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+ { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp
index 89499cc6..c83a9b0d 100644
--- a/src/database/kernels/xgemm_direct.hpp
+++ b/src/database/kernels/xgemm_direct.hpp
@@ -19,7 +19,7 @@ const Database::DatabaseEntry XgemmDirectHalf = {
"XgemmDirect", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
}
@@ -32,7 +32,8 @@ const Database::DatabaseEntry XgemmDirectSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
- { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ { "Tonga", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Intel GPUs
@@ -44,12 +45,13 @@ const Database::DatabaseEntry XgemmDirectSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
- { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+ { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
}
@@ -62,7 +64,8 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
- { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Intel GPUs
@@ -74,12 +77,13 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
- { "default", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+ { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
}
},
}
@@ -92,18 +96,20 @@ const Database::DatabaseEntry XgemmDirectDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
- { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
- { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+ { "GeForce GTX TITAN Black", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
@@ -116,18 +122,20 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
- { "default", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+ { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index 90355b96..c3811c59 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -43,6 +43,7 @@ const Database::DatabaseEntry XgemvSingle = {
{ "Oland", { {"WGS1",128}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
+ { "Tonga", { {"WGS1",128}, {"WPT1",2} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
@@ -82,6 +83,7 @@ const Database::DatabaseEntry XgemvSingle = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
@@ -107,6 +109,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
{ "Oland", { {"WGS1",64}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1} } },
+ { "Tonga", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@@ -145,6 +148,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
{ "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@@ -167,6 +171,7 @@ const Database::DatabaseEntry XgemvDouble = {
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
+ { "Tonga", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",256}, {"WPT1",1} } },
}
},
@@ -194,6 +199,7 @@ const Database::DatabaseEntry XgemvDouble = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
@@ -219,6 +225,7 @@ const Database::DatabaseEntry XgemvComplexDouble = {
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
+ { "Tonga", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp
index 8e6254ac..9abd33e1 100644
--- a/src/database/kernels/xgemv_fast.hpp
+++ b/src/database/kernels/xgemv_fast.hpp
@@ -43,6 +43,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+ { "Tonga", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@@ -82,6 +83,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "GeForce GTX 750 Ti", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+ { "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@@ -107,6 +109,7 @@ const Database::DatabaseEntry XgemvFastComplexSingle = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+ { "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@@ -164,6 +167,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+ { "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@@ -191,6 +195,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+ { "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@@ -216,6 +221,7 @@ const Database::DatabaseEntry XgemvFastComplexDouble = {
{ "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+ { "Tonga", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp
index 8fe45e01..49650a38 100644
--- a/src/database/kernels/xgemv_fast_rot.hpp
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@@ -32,7 +32,8 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
- { "default", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
+ { "Tonga", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
+ { "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
{ // Intel CPUs
@@ -55,6 +56,7 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+ { "GeForce GTX TITAN Black", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
@@ -73,7 +75,8 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
- { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
+ { "Tonga", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
+ { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel CPUs
@@ -107,6 +110,7 @@ const Database::DatabaseEntry XgemvFastRotDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
+ { "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
@@ -120,6 +124,7 @@ const Database::DatabaseEntry XgemvFastRotDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+ { "GeForce GTX TITAN Black", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
}
},
@@ -138,7 +143,8 @@ const Database::DatabaseEntry XgemvFastRotComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
- { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
+ { "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",8} } },
+ { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Intel CPUs
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
index f2fc2a9a..a80f9860 100644
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@@ -43,7 +43,8 @@ const Database::DatabaseEntry XgerSingle = {
{ "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
- { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
+ { "Tonga", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
+ { "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
}
},
{ // ARM GPUs
@@ -80,6 +81,7 @@ const Database::DatabaseEntry XgerSingle = {
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
@@ -102,7 +104,8 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
- { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
+ { "Tonga", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+ { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@@ -139,12 +142,13 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
- { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
+ { "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
+ { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
}
@@ -161,7 +165,8 @@ const Database::DatabaseEntry XgerDouble = {
{ "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
- { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
+ { "Tonga", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
+ { "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
}
},
{ // ARM GPUs
@@ -187,6 +192,7 @@ const Database::DatabaseEntry XgerDouble = {
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
@@ -209,6 +215,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
{ "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
+ { "Tonga", { {"WGS1",16}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
}
},
@@ -235,6 +242,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
{ "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+ { "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
}
},
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index a5faef5a..8ac3a3a8 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -113,7 +113,7 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
const real beta = GetRealArg(arg_beta);
// Skip these threads if they do not contain threads contributing to the upper-triangle
- if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
+ if ((GetGroupID1() + 1)*NWG < GetGroupID0()*MWG) {
return;
}
@@ -153,7 +153,7 @@ void XgemmLower(const int kSizeN, const int kSizeK,
const real beta = GetRealArg(arg_beta);
// Skip these threads if they do not contain threads contributing to the lower-triangle
- if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
+ if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) {
return;
}
diff --git a/src/routine.cpp b/src/routine.cpp
index 80764b74..acafb0d2 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -21,10 +21,11 @@
namespace clblast {
// =================================================================================================
-// Constructor: not much here, because no status codes can be returned
+// The constructor does all the heavy work; errors are reported as exceptions
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
- const std::vector<const Database::DatabaseEntry*> &userDatabase):
+ const std::vector<const Database::DatabaseEntry*> &userDatabase,
+ std::initializer_list<const char *> source):
precision_(precision),
routine_name_(name),
queue_(queue),
@@ -33,15 +34,9 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
device_(queue_.GetDevice()),
device_name_(device_.Name()),
db_(queue_, routines, precision_, userDatabase) {
-}
-
-// =================================================================================================
-
-// Separate set-up function to allow for status codes to be returned
-StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the program (context-specific) is already there
- if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
+ if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }
// Sets the build options from an environmental variable (if set)
auto options = std::vector<std::string>();
@@ -53,13 +48,10 @@ StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
- try {
- auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
- auto program = Program(device_, context_, binary);
- program.Build(device_, options);
- StoreProgramToCache(program, context_, precision_, routine_name_);
- } catch (...) { return StatusCode::kBuildProgramFailure; }
- return StatusCode::kSuccess;
+ auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
+ auto program = Program(device_, context_, binary);
+ program.Build(device_, options);
+ StoreProgramToCache(program, context_, precision_, routine_name_);
}
// Otherwise, the kernel will be compiled and program will be built. Both the binary and the
@@ -69,48 +61,50 @@ StatusCode Routine::SetUp() {
const auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
- return StatusCode::kNoDoublePrecision;
+ throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
}
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
- return StatusCode::kNoHalfPrecision;
+ throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
}
- // Loads the common header (typedefs and defines and such)
- std::string common_header =
- #include "kernels/common.opencl"
- ;
-
// Collects the parameters for this device in the form of defines, and adds the precision
- auto defines = db_.GetDefines();
- defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
+ auto source_string = db_.GetDefines();
+ source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
- defines += "#define ROUTINE_"+routine_name_+"\n";
+ source_string += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.IsAMD() && device_.IsGPU()) {
- defines += "#define USE_CL_MAD 1\n";
+ source_string += "#define USE_CL_MAD 1\n";
}
// For specific devices, use staggered/shuffled workgroup indices.
if (device_.IsAMD() && device_.IsGPU()) {
- defines += "#define USE_STAGGERED_INDICES 1\n";
+ source_string += "#define USE_STAGGERED_INDICES 1\n";
}
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
// performance through better cache behaviour
if (device_.IsARM() && device_.IsGPU()) {
- defines += "#define GLOBAL_MEM_FENCE 1\n";
+ source_string += "#define GLOBAL_MEM_FENCE 1\n";
}
- // Combines everything together into a single source string
- const auto source_string = defines + common_header + source_string_;
+ // Loads the common header (typedefs and defines and such)
+ source_string +=
+ #include "kernels/common.opencl"
+ ;
+
+ // Adds routine-specific code to the constructed source string
+ for (const char *s: source) {
+ source_string += s;
+ }
// Prints details of the routine to compile in case of debugging in verbose mode
#ifdef VERBOSE
@@ -120,23 +114,21 @@ StatusCode Routine::SetUp() {
#endif
// Compiles the kernel
+ auto program = Program(context_, source_string);
try {
- auto program = Program(context_, source_string);
- const auto build_status = program.Build(device_, options);
-
- // Checks for compiler crashes/errors/warnings
- if (build_status == BuildStatus::kError) {
- const auto message = program.GetBuildInfo(device_);
- fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
- return StatusCode::kBuildProgramFailure;
+ program.Build(device_, options);
+ } catch (const CLError &e) {
+ if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
+ fprintf(stdout, "OpenCL compiler error/warning: %s\n",
+ program.GetBuildInfo(device_).c_str());
}
- if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
+ throw;
+ }
- // Store the compiled binary and program in the cache
- const auto binary = program.GetIR();
- StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
- StoreProgramToCache(program, context_, precision_, routine_name_);
- } catch (...) { return StatusCode::kBuildProgramFailure; }
+ // Store the compiled binary and program in the cache
+ const auto binary = program.GetIR();
+ StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
+ StoreProgramToCache(program, context_, precision_, routine_name_);
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
@@ -144,9 +136,6 @@ StatusCode Routine::SetUp() {
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
#endif
-
- // No errors, normal termination of this function
- return StatusCode::kSuccess;
}
// =================================================================================================
diff --git a/src/routine.hpp b/src/routine.hpp
index 8582a2b7..2d8b2415 100644
--- a/src/routine.hpp
+++ b/src/routine.hpp
@@ -19,9 +19,9 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "cache.hpp"
-#include "buffer_test.hpp"
+#include "utilities/buffer_test.hpp"
#include "database/database.hpp"
#include "routines/common.hpp"
@@ -34,21 +34,19 @@ class Routine {
// Base class constructor. The user database is an optional extra database to override the
// built-in database.
+ // All heavy preparation work is done inside this constructor.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
- const std::vector<const Database::DatabaseEntry*> &userDatabase = {});
-
- // Set-up phase of the kernel
- StatusCode SetUp();
+ const std::vector<const Database::DatabaseEntry*> &userDatabase,
+ std::initializer_list<const char *> source);
protected:
// Non-static variable for the precision
const Precision precision_;
- // The routine's name and its kernel-source in string form
+ // The routine's name
const std::string routine_name_;
- std::string source_string_;
// The OpenCL objects, accessible only from derived classes
Queue queue_;
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index 3969cf9f..c995dc12 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -20,22 +20,26 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
- std::vector<size_t> global, const std::vector<size_t> &local,
- EventPointer event, const std::vector<Event> &waitForEvents) {
+void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+ std::vector<size_t> global, const std::vector<size_t> &local,
+ EventPointer event, const std::vector<Event> &waitForEvents) {
if (!local.empty()) {
// Tests for validity of the local thread sizes
if (local.size() > device.MaxWorkItemDimensions()) {
- return StatusCode::kInvalidLocalNumDimensions;
+ throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
}
const auto max_work_item_sizes = device.MaxWorkItemSizes();
for (auto i=size_t{0}; i<local.size(); ++i) {
- if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
+ if (local[i] > max_work_item_sizes[i]) {
+ throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
+ }
}
auto local_size = size_t{1};
for (auto &item: local) { local_size *= item; }
- if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
+ if (local_size > device.MaxWorkGroupSize()) {
+ throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+ }
// Make sure the global thread sizes are at least equal to the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
@@ -45,7 +49,9 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Tests for local memory usage
const auto local_mem_usage = kernel.LocalMemUsage(device);
- if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
+ if (!device.IsLocalMemoryValid(local_mem_usage)) {
+ throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
+ }
// Prints the name of the kernel to launch in case of debugging in verbose mode
#ifdef VERBOSE
@@ -55,9 +61,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
#endif
// Launches the kernel (and checks for launch errors)
- try {
- kernel.Launch(queue, global, local, event, waitForEvents);
- } catch (...) { return StatusCode::kKernelLaunchError; }
+ kernel.Launch(queue, global, local, event, waitForEvents);
// Prints the elapsed execution time in case of debugging in verbose mode
#ifdef VERBOSE
@@ -66,9 +70,6 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
#endif
-
- // No errors, normal termination of this function
- return StatusCode::kSuccess;
}
// =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 9d8849c3..53ca6355 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -27,29 +27,29 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
- std::vector<size_t> global, const std::vector<size_t> &local,
- EventPointer event, const std::vector<Event> &waitForEvents = {});
+void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+ std::vector<size_t> global, const std::vector<size_t> &local,
+ EventPointer event, const std::vector<Event> &waitForEvents = {});
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>
-StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
- const Database &db,
- EventPointer event, const std::vector<Event> &waitForEvents,
- const size_t src_one, const size_t src_two,
- const size_t src_ld, const size_t src_offset,
- const Buffer<T> &src,
- const size_t dest_one, const size_t dest_two,
- const size_t dest_ld, const size_t dest_offset,
- const Buffer<T> &dest,
- const T alpha,
- const Program &program, const bool do_pad,
- const bool do_transpose, const bool do_conjugate,
- const bool upper = false, const bool lower = false,
- const bool diagonal_imag_zero = false) {
+void PadCopyTransposeMatrix(Queue &queue, const Device &device,
+ const Database &db,
+ EventPointer event, const std::vector<Event> &waitForEvents,
+ const size_t src_one, const size_t src_two,
+ const size_t src_ld, const size_t src_offset,
+ const Buffer<T> &src,
+ const size_t dest_one, const size_t dest_two,
+ const size_t dest_ld, const size_t dest_offset,
+ const Buffer<T> &dest,
+ const T alpha,
+ const Program &program, const bool do_pad,
+ const bool do_transpose, const bool do_conjugate,
+ const bool upper = false, const bool lower = false,
+ const bool diagonal_imag_zero = false) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
@@ -61,8 +61,8 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
if (do_transpose) {
if (use_fast_kernel &&
IsMultiple(src_ld, db["TRA_WPT"]) &&
- IsMultiple(src_one, db["TRA_WPT"]*db["TRA_WPT"]) &&
- IsMultiple(src_two, db["TRA_WPT"]*db["TRA_WPT"])) {
+ IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
+ IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {
kernel_name = "TransposeMatrixFast";
}
else {
@@ -84,77 +84,75 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
// Retrieves the kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program, kernel_name);
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(src_ld));
- kernel.SetArgument(1, src());
- kernel.SetArgument(2, dest());
- kernel.SetArgument(3, GetRealArg(alpha));
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(src_ld));
+ kernel.SetArgument(1, src());
+ kernel.SetArgument(2, dest());
+ kernel.SetArgument(3, GetRealArg(alpha));
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(src_one));
+ kernel.SetArgument(1, static_cast<int>(src_two));
+ kernel.SetArgument(2, static_cast<int>(src_ld));
+ kernel.SetArgument(3, static_cast<int>(src_offset));
+ kernel.SetArgument(4, src());
+ kernel.SetArgument(5, static_cast<int>(dest_one));
+ kernel.SetArgument(6, static_cast<int>(dest_two));
+ kernel.SetArgument(7, static_cast<int>(dest_ld));
+ kernel.SetArgument(8, static_cast<int>(dest_offset));
+ kernel.SetArgument(9, dest());
+ kernel.SetArgument(10, GetRealArg(alpha));
+ if (do_pad) {
+ kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
- kernel.SetArgument(0, static_cast<int>(src_one));
- kernel.SetArgument(1, static_cast<int>(src_two));
- kernel.SetArgument(2, static_cast<int>(src_ld));
- kernel.SetArgument(3, static_cast<int>(src_offset));
- kernel.SetArgument(4, src());
- kernel.SetArgument(5, static_cast<int>(dest_one));
- kernel.SetArgument(6, static_cast<int>(dest_two));
- kernel.SetArgument(7, static_cast<int>(dest_ld));
- kernel.SetArgument(8, static_cast<int>(dest_offset));
- kernel.SetArgument(9, dest());
- kernel.SetArgument(10, GetRealArg(alpha));
- if (do_pad) {
- kernel.SetArgument(11, static_cast<int>(do_conjugate));
- }
- else {
- kernel.SetArgument(11, static_cast<int>(upper));
- kernel.SetArgument(12, static_cast<int>(lower));
- kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
- }
+ kernel.SetArgument(11, static_cast<int>(upper));
+ kernel.SetArgument(12, static_cast<int>(lower));
+ kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
+ }
- // Launches the kernel and returns the error code. Uses global and local thread sizes based on
- // parameters in the database.
- if (do_transpose) {
- if (use_fast_kernel) {
- const auto global = std::vector<size_t>{
- dest_one / db["TRA_WPT"],
- dest_two / db["TRA_WPT"]
- };
- const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
- else {
- const auto global = std::vector<size_t>{
- Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
- Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
- };
- const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
+ // Launches the kernel (RunKernel throws on failure). Uses global and local thread sizes based on
+ // parameters in the database.
+ if (do_transpose) {
+ if (use_fast_kernel) {
+ const auto global = std::vector<size_t>{
+ dest_one / db["TRA_WPT"],
+ dest_two / db["TRA_WPT"]
+ };
+ const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
- if (use_fast_kernel) {
- const auto global = std::vector<size_t>{
- dest_one / db["COPY_VW"],
- dest_two / db["COPY_WPT"]
- };
- const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
- else {
- const auto global = std::vector<size_t>{
- Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
- Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
- };
- const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
+ const auto global = std::vector<size_t>{
+ Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
+ Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
+ };
+ const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
- } catch (...) { return StatusCode::kInvalidKernel; }
+ }
+ else {
+ if (use_fast_kernel) {
+ const auto global = std::vector<size_t>{
+ dest_one / db["COPY_VW"],
+ dest_two / db["COPY_WPT"]
+ };
+ const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+ }
+ else {
+ const auto global = std::vector<size_t>{
+ Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
+ Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
+ };
+ const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+ }
+ }
}
// =================================================================================================
diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp
index 6b6e7f9e..e9efa1a7 100644
--- a/src/routines/level1/xamax.cpp
+++ b/src/routines/level1/xamax.cpp
@@ -22,74 +22,64 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xamax.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xamax<T>::DoAmax(const size_t n,
- const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xamax<T>::DoAmax(const size_t n,
+ const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorIndex(1, imax_buffer, imax_offset);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorIndex(1, imax_buffer, imax_offset);
// Retrieves the Xamax kernels from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xamax");
- auto kernel2 = Kernel(program, "XamaxEpilogue");
-
- // Creates the buffer for intermediate values
- auto temp_size = 2*db_["WGS2"];
- auto temp_buffer1 = Buffer<T>(context_, temp_size);
- auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
-
- // Sets the kernel arguments
- kernel1.SetArgument(0, static_cast<int>(n));
- kernel1.SetArgument(1, x_buffer());
- kernel1.SetArgument(2, static_cast<int>(x_offset));
- kernel1.SetArgument(3, static_cast<int>(x_inc));
- kernel1.SetArgument(4, temp_buffer1());
- kernel1.SetArgument(5, temp_buffer2());
-
- // Event waiting list
- auto eventWaitList = std::vector<Event>();
-
- // Launches the main kernel
- auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
- auto local1 = std::vector<size_t>{db_["WGS1"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(kernelEvent);
-
- // Sets the arguments for the epilogue kernel
- kernel2.SetArgument(0, temp_buffer1());
- kernel2.SetArgument(1, temp_buffer2());
- kernel2.SetArgument(2, imax_buffer());
- kernel2.SetArgument(3, static_cast<int>(imax_offset));
-
- // Launches the epilogue kernel
- auto global2 = std::vector<size_t>{db_["WGS2"]};
- auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel1 = Kernel(program, "Xamax");
+ auto kernel2 = Kernel(program, "XamaxEpilogue");
+
+ // Creates the buffer for intermediate values
+ auto temp_size = 2*db_["WGS2"];
+ auto temp_buffer1 = Buffer<T>(context_, temp_size);
+ auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
+
+ // Sets the kernel arguments
+ kernel1.SetArgument(0, static_cast<int>(n));
+ kernel1.SetArgument(1, x_buffer());
+ kernel1.SetArgument(2, static_cast<int>(x_offset));
+ kernel1.SetArgument(3, static_cast<int>(x_inc));
+ kernel1.SetArgument(4, temp_buffer1());
+ kernel1.SetArgument(5, temp_buffer2());
+
+ // Event waiting list
+ auto eventWaitList = std::vector<Event>();
+
+ // Launches the main kernel
+ auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+ auto local1 = std::vector<size_t>{db_["WGS1"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+ eventWaitList.push_back(kernelEvent);
+
+ // Sets the arguments for the epilogue kernel
+ kernel2.SetArgument(0, temp_buffer1());
+ kernel2.SetArgument(1, temp_buffer2());
+ kernel2.SetArgument(2, imax_buffer());
+ kernel2.SetArgument(3, static_cast<int>(imax_offset));
+
+ // Launches the epilogue kernel
+ auto global2 = std::vector<size_t>{db_["WGS2"]};
+ auto local2 = std::vector<size_t>{db_["WGS2"]};
+ RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================
diff --git a/src/routines/level1/xamax.hpp b/src/routines/level1/xamax.hpp
index aa45a8e4..4d1e0082 100644
--- a/src/routines/level1/xamax.hpp
+++ b/src/routines/level1/xamax.hpp
@@ -28,9 +28,9 @@ class Xamax: public Routine {
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
// Templated-precision implementation of the routine
- StatusCode DoAmax(const size_t n,
- const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoAmax(const size_t n,
+ const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp
index 0c1ce903..a242a5fa 100644
--- a/src/routines/level1/xasum.cpp
+++ b/src/routines/level1/xasum.cpp
@@ -22,71 +22,61 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xasum.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xasum<T>::DoAsum(const size_t n,
- const Buffer<T> &asum_buffer, const size_t asum_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xasum<T>::DoAsum(const size_t n,
+ const Buffer<T> &asum_buffer, const size_t asum_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorScalar(1, asum_buffer, asum_offset);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorScalar(1, asum_buffer, asum_offset);
// Retrieves the Xasum kernels from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xasum");
- auto kernel2 = Kernel(program, "XasumEpilogue");
-
- // Creates the buffer for intermediate values
- auto temp_size = 2*db_["WGS2"];
- auto temp_buffer = Buffer<T>(context_, temp_size);
-
- // Sets the kernel arguments
- kernel1.SetArgument(0, static_cast<int>(n));
- kernel1.SetArgument(1, x_buffer());
- kernel1.SetArgument(2, static_cast<int>(x_offset));
- kernel1.SetArgument(3, static_cast<int>(x_inc));
- kernel1.SetArgument(4, temp_buffer());
-
- // Event waiting list
- auto eventWaitList = std::vector<Event>();
-
- // Launches the main kernel
- auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
- auto local1 = std::vector<size_t>{db_["WGS1"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(kernelEvent);
-
- // Sets the arguments for the epilogue kernel
- kernel2.SetArgument(0, temp_buffer());
- kernel2.SetArgument(1, asum_buffer());
- kernel2.SetArgument(2, static_cast<int>(asum_offset));
-
- // Launches the epilogue kernel
- auto global2 = std::vector<size_t>{db_["WGS2"]};
- auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel1 = Kernel(program, "Xasum");
+ auto kernel2 = Kernel(program, "XasumEpilogue");
+
+ // Creates the buffer for intermediate values
+ auto temp_size = 2*db_["WGS2"];
+ auto temp_buffer = Buffer<T>(context_, temp_size);
+
+ // Sets the kernel arguments
+ kernel1.SetArgument(0, static_cast<int>(n));
+ kernel1.SetArgument(1, x_buffer());
+ kernel1.SetArgument(2, static_cast<int>(x_offset));
+ kernel1.SetArgument(3, static_cast<int>(x_inc));
+ kernel1.SetArgument(4, temp_buffer());
+
+ // Event waiting list
+ auto eventWaitList = std::vector<Event>();
+
+ // Launches the main kernel
+ auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+ auto local1 = std::vector<size_t>{db_["WGS1"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+ eventWaitList.push_back(kernelEvent);
+
+ // Sets the arguments for the epilogue kernel
+ kernel2.SetArgument(0, temp_buffer());
+ kernel2.SetArgument(1, asum_buffer());
+ kernel2.SetArgument(2, static_cast<int>(asum_offset));
+
+ // Launches the epilogue kernel
+ auto global2 = std::vector<size_t>{db_["WGS2"]};
+ auto local2 = std::vector<size_t>{db_["WGS2"]};
+ RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================
diff --git a/src/routines/level1/xasum.hpp b/src/routines/level1/xasum.hpp
index 5a253f4d..0afcc4ff 100644
--- a/src/routines/level1/xasum.hpp
+++ b/src/routines/level1/xasum.hpp
@@ -28,9 +28,9 @@ class Xasum: public Routine {
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
// Templated-precision implementation of the routine
- StatusCode DoAsum(const size_t n,
- const Buffer<T> &asum_buffer, const size_t asum_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoAsum(const size_t n,
+ const Buffer<T> &asum_buffer, const size_t asum_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp
index 3445e2b5..5436c5b7 100644
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xaxpy.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -55,45 +52,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
// Retrieves the Xaxpy kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, GetRealArg(alpha));
- kernel.SetArgument(2, x_buffer());
- kernel.SetArgument(3, y_buffer());
- }
- else {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, GetRealArg(alpha));
- kernel.SetArgument(2, x_buffer());
- kernel.SetArgument(3, static_cast<int>(x_offset));
- kernel.SetArgument(4, static_cast<int>(x_inc));
- kernel.SetArgument(5, y_buffer());
- kernel.SetArgument(6, static_cast<int>(y_offset));
- kernel.SetArgument(7, static_cast<int>(y_inc));
- }
-
- // Launches the kernel
- if (use_fast_kernel) {
- auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- else {
- auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
- auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(alpha));
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, y_buffer());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(alpha));
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, static_cast<int>(x_offset));
+ kernel.SetArgument(4, static_cast<int>(x_inc));
+ kernel.SetArgument(5, y_buffer());
+ kernel.SetArgument(6, static_cast<int>(y_offset));
+ kernel.SetArgument(7, static_cast<int>(y_inc));
+ }
+
+ // Launches the kernel
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
+ else {
+ auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+ auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
}
// =================================================================================================
diff --git a/src/routines/level1/xaxpy.hpp b/src/routines/level1/xaxpy.hpp
index caac871e..9b30dfaa 100644
--- a/src/routines/level1/xaxpy.hpp
+++ b/src/routines/level1/xaxpy.hpp
@@ -28,9 +28,9 @@ class Xaxpy: public Routine {
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
// Templated-precision implementation of the routine
- StatusCode DoAxpy(const size_t n, const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoAxpy(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp
index 673ef349..d86200c0 100644
--- a/src/routines/level1/xcopy.cpp
+++ b/src/routines/level1/xcopy.cpp
@@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xcopy.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xcopy<T>::DoCopy(const size_t n,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xcopy<T>::DoCopy(const size_t n,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -55,43 +52,37 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";
// Retrieves the Xcopy kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, x_buffer());
- kernel.SetArgument(2, y_buffer());
- }
- else {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, x_buffer());
- kernel.SetArgument(2, static_cast<int>(x_offset));
- kernel.SetArgument(3, static_cast<int>(x_inc));
- kernel.SetArgument(4, y_buffer());
- kernel.SetArgument(5, static_cast<int>(y_offset));
- kernel.SetArgument(6, static_cast<int>(y_inc));
- }
-
- // Launches the kernel
- if (use_fast_kernel) {
- auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- else {
- auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
- auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, x_buffer());
+ kernel.SetArgument(2, y_buffer());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, x_buffer());
+ kernel.SetArgument(2, static_cast<int>(x_offset));
+ kernel.SetArgument(3, static_cast<int>(x_inc));
+ kernel.SetArgument(4, y_buffer());
+ kernel.SetArgument(5, static_cast<int>(y_offset));
+ kernel.SetArgument(6, static_cast<int>(y_inc));
+ }
+
+ // Launches the kernel
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
+ else {
+ auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+ auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
}
// =================================================================================================
diff --git a/src/routines/level1/xcopy.hpp b/src/routines/level1/xcopy.hpp
index 0c424ba3..a6454fcc 100644
--- a/src/routines/level1/xcopy.hpp
+++ b/src/routines/level1/xcopy.hpp
@@ -28,9 +28,9 @@ class Xcopy: public Routine {
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
// Templated-precision implementation of the routine
- StatusCode DoCopy(const size_t n,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoCopy(const size_t n,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp
index bafea157..9d718913 100644
--- a/src/routines/level1/xdot.cpp
+++ b/src/routines/level1/xdot.cpp
@@ -22,79 +22,68 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xdot.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xdot<T>::DoDot(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const bool do_conjugate) {
+void Xdot<T>::DoDot(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const bool do_conjugate) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorScalar(1, dot_buffer, dot_offset);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
+ TestVectorScalar(1, dot_buffer, dot_offset);
// Retrieves the Xdot kernels from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xdot");
- auto kernel2 = Kernel(program, "XdotEpilogue");
-
- // Creates the buffer for intermediate values
- auto temp_size = 2*db_["WGS2"];
- auto temp_buffer = Buffer<T>(context_, temp_size);
-
- // Sets the kernel arguments
- kernel1.SetArgument(0, static_cast<int>(n));
- kernel1.SetArgument(1, x_buffer());
- kernel1.SetArgument(2, static_cast<int>(x_offset));
- kernel1.SetArgument(3, static_cast<int>(x_inc));
- kernel1.SetArgument(4, y_buffer());
- kernel1.SetArgument(5, static_cast<int>(y_offset));
- kernel1.SetArgument(6, static_cast<int>(y_inc));
- kernel1.SetArgument(7, temp_buffer());
- kernel1.SetArgument(8, static_cast<int>(do_conjugate));
-
- // Event waiting list
- auto eventWaitList = std::vector<Event>();
-
- // Launches the main kernel
- auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
- auto local1 = std::vector<size_t>{db_["WGS1"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(kernelEvent);
-
- // Sets the arguments for the epilogue kernel
- kernel2.SetArgument(0, temp_buffer());
- kernel2.SetArgument(1, dot_buffer());
- kernel2.SetArgument(2, static_cast<int>(dot_offset));
-
- // Launches the epilogue kernel
- auto global2 = std::vector<size_t>{db_["WGS2"]};
- auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel1 = Kernel(program, "Xdot");
+ auto kernel2 = Kernel(program, "XdotEpilogue");
+
+ // Creates the buffer for intermediate values
+ auto temp_size = 2*db_["WGS2"];
+ auto temp_buffer = Buffer<T>(context_, temp_size);
+
+ // Sets the kernel arguments
+ kernel1.SetArgument(0, static_cast<int>(n));
+ kernel1.SetArgument(1, x_buffer());
+ kernel1.SetArgument(2, static_cast<int>(x_offset));
+ kernel1.SetArgument(3, static_cast<int>(x_inc));
+ kernel1.SetArgument(4, y_buffer());
+ kernel1.SetArgument(5, static_cast<int>(y_offset));
+ kernel1.SetArgument(6, static_cast<int>(y_inc));
+ kernel1.SetArgument(7, temp_buffer());
+ kernel1.SetArgument(8, static_cast<int>(do_conjugate));
+
+ // Event waiting list
+ auto eventWaitList = std::vector<Event>();
+
+ // Launches the main kernel
+ auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+ auto local1 = std::vector<size_t>{db_["WGS1"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+ eventWaitList.push_back(kernelEvent);
+
+ // Sets the arguments for the epilogue kernel
+ kernel2.SetArgument(0, temp_buffer());
+ kernel2.SetArgument(1, dot_buffer());
+ kernel2.SetArgument(2, static_cast<int>(dot_offset));
+
+ // Launches the epilogue kernel
+ auto global2 = std::vector<size_t>{db_["WGS2"]};
+ auto local2 = std::vector<size_t>{db_["WGS2"]};
+ RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================
diff --git a/src/routines/level1/xdot.hpp b/src/routines/level1/xdot.hpp
index 02c1efaa..a4c9dfa0 100644
--- a/src/routines/level1/xdot.hpp
+++ b/src/routines/level1/xdot.hpp
@@ -28,11 +28,11 @@ class Xdot: public Routine {
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
// Templated-precision implementation of the routine
- StatusCode DoDot(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const bool do_conjugate = false);
+ void DoDot(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const bool do_conjugate = false);
};
// =================================================================================================
diff --git a/src/routines/level1/xdotc.cpp b/src/routines/level1/xdotc.cpp
index 27cf2bab..5a4e939a 100644
--- a/src/routines/level1/xdotc.cpp
+++ b/src/routines/level1/xdotc.cpp
@@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xdotc<T>::DoDotc(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
- return DoDot(n, dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- true);
+void Xdotc<T>::DoDotc(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+ DoDot(n, dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ true);
}
// =================================================================================================
diff --git a/src/routines/level1/xdotc.hpp b/src/routines/level1/xdotc.hpp
index b8cbdaf5..ab7465f5 100644
--- a/src/routines/level1/xdotc.hpp
+++ b/src/routines/level1/xdotc.hpp
@@ -31,10 +31,10 @@ class Xdotc: public Xdot<T> {
Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
// Templated-precision implementation of the routine
- StatusCode DoDotc(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoDotc(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xdotu.cpp b/src/routines/level1/xdotu.cpp
index 0bce70b7..b9d8bcef 100644
--- a/src/routines/level1/xdotu.cpp
+++ b/src/routines/level1/xdotu.cpp
@@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xdotu<T>::DoDotu(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
- return DoDot(n, dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- false);
+void Xdotu<T>::DoDotu(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+ DoDot(n, dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ false);
}
// =================================================================================================
diff --git a/src/routines/level1/xdotu.hpp b/src/routines/level1/xdotu.hpp
index b3f73086..cad91c58 100644
--- a/src/routines/level1/xdotu.hpp
+++ b/src/routines/level1/xdotu.hpp
@@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> {
Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
// Templated-precision implementation of the routine
- StatusCode DoDotu(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoDotu(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xmax.hpp b/src/routines/level1/xmax.hpp
index 5a0236f2..2b7a5ae7 100644
--- a/src/routines/level1/xmax.hpp
+++ b/src/routines/level1/xmax.hpp
@@ -35,10 +35,10 @@ class Xmax: public Xamax<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
- StatusCode DoMax(const size_t n,
- const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
- return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
+ void DoMax(const size_t n,
+ const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
}
};
diff --git a/src/routines/level1/xmin.hpp b/src/routines/level1/xmin.hpp
index 6befec64..47a195ea 100644
--- a/src/routines/level1/xmin.hpp
+++ b/src/routines/level1/xmin.hpp
@@ -35,10 +35,10 @@ class Xmin: public Xamax<T> {
// Forwards to the regular max-absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
- StatusCode DoMin(const size_t n,
- const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
- return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
+ void DoMin(const size_t n,
+ const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
}
};
diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp
index 97615d8b..373820a4 100644
--- a/src/routines/level1/xnrm2.cpp
+++ b/src/routines/level1/xnrm2.cpp
@@ -22,71 +22,61 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xnrm2.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xnrm2<T>::DoNrm2(const size_t n,
- const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xnrm2<T>::DoNrm2(const size_t n,
+ const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorScalar(1, nrm2_buffer, nrm2_offset);
// Retrieves the Xnrm2 kernels from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xnrm2");
- auto kernel2 = Kernel(program, "Xnrm2Epilogue");
-
- // Creates the buffer for intermediate values
- auto temp_size = 2*db_["WGS2"];
- auto temp_buffer = Buffer<T>(context_, temp_size);
-
- // Sets the kernel arguments
- kernel1.SetArgument(0, static_cast<int>(n));
- kernel1.SetArgument(1, x_buffer());
- kernel1.SetArgument(2, static_cast<int>(x_offset));
- kernel1.SetArgument(3, static_cast<int>(x_inc));
- kernel1.SetArgument(4, temp_buffer());
-
- // Event waiting list
- auto eventWaitList = std::vector<Event>();
-
- // Launches the main kernel
- auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
- auto local1 = std::vector<size_t>{db_["WGS1"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(kernelEvent);
-
- // Sets the arguments for the epilogue kernel
- kernel2.SetArgument(0, temp_buffer());
- kernel2.SetArgument(1, nrm2_buffer());
- kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
-
- // Launches the epilogue kernel
- auto global2 = std::vector<size_t>{db_["WGS2"]};
- auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel1 = Kernel(program, "Xnrm2");
+ auto kernel2 = Kernel(program, "Xnrm2Epilogue");
+
+ // Creates the buffer for intermediate values
+ auto temp_size = 2*db_["WGS2"];
+ auto temp_buffer = Buffer<T>(context_, temp_size);
+
+ // Sets the kernel arguments
+ kernel1.SetArgument(0, static_cast<int>(n));
+ kernel1.SetArgument(1, x_buffer());
+ kernel1.SetArgument(2, static_cast<int>(x_offset));
+ kernel1.SetArgument(3, static_cast<int>(x_inc));
+ kernel1.SetArgument(4, temp_buffer());
+
+ // Event waiting list
+ auto eventWaitList = std::vector<Event>();
+
+ // Launches the main kernel
+ auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+ auto local1 = std::vector<size_t>{db_["WGS1"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+ eventWaitList.push_back(kernelEvent);
+
+ // Sets the arguments for the epilogue kernel
+ kernel2.SetArgument(0, temp_buffer());
+ kernel2.SetArgument(1, nrm2_buffer());
+ kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
+
+ // Launches the epilogue kernel
+ auto global2 = std::vector<size_t>{db_["WGS2"]};
+ auto local2 = std::vector<size_t>{db_["WGS2"]};
+ RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================
diff --git a/src/routines/level1/xnrm2.hpp b/src/routines/level1/xnrm2.hpp
index 7baf07f5..3183ce24 100644
--- a/src/routines/level1/xnrm2.hpp
+++ b/src/routines/level1/xnrm2.hpp
@@ -28,9 +28,9 @@ class Xnrm2: public Routine {
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
// Templated-precision implementation of the routine
- StatusCode DoNrm2(const size_t n,
- const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoNrm2(const size_t n,
+ const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp
index bcc43c3b..17410f01 100644
--- a/src/routines/level1/xscal.cpp
+++ b/src/routines/level1/xscal.cpp
@@ -22,26 +22,24 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xscal.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xscal<T>::DoScal(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vector for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -51,41 +49,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";
// Retrieves the Xscal kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, alpha);
- kernel.SetArgument(2, x_buffer());
- }
- else {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, alpha);
- kernel.SetArgument(2, x_buffer());
- kernel.SetArgument(3, static_cast<int>(x_offset));
- kernel.SetArgument(4, static_cast<int>(x_inc));
- }
-
- // Launches the kernel
- if (use_fast_kernel) {
- auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- else {
- auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
- auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, alpha);
+ kernel.SetArgument(2, x_buffer());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, alpha);
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, static_cast<int>(x_offset));
+ kernel.SetArgument(4, static_cast<int>(x_inc));
+ }
+
+ // Launches the kernel
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
+ else {
+ auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+ auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
}
// =================================================================================================
diff --git a/src/routines/level1/xscal.hpp b/src/routines/level1/xscal.hpp
index 6c585cb2..02c847cc 100644
--- a/src/routines/level1/xscal.hpp
+++ b/src/routines/level1/xscal.hpp
@@ -28,8 +28,8 @@ class Xscal: public Routine {
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
// Templated-precision implementation of the routine
- StatusCode DoScal(const size_t n, const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoScal(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xsum.hpp b/src/routines/level1/xsum.hpp
index 84e20bea..a69d6511 100644
--- a/src/routines/level1/xsum.hpp
+++ b/src/routines/level1/xsum.hpp
@@ -35,10 +35,10 @@ class Xsum: public Xasum<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
- StatusCode DoSum(const size_t n,
- const Buffer<T> &sum_buffer, const size_t sum_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
- return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
+ void DoSum(const size_t n,
+ const Buffer<T> &sum_buffer, const size_t sum_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
}
};
diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp
index 03907cbd..c9b97dc9 100644
--- a/src/routines/level1/xswap.cpp
+++ b/src/routines/level1/xswap.cpp
@@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xswap.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xswap<T>::DoSwap(const size_t n,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xswap<T>::DoSwap(const size_t n,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -55,43 +52,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";
// Retrieves the Xswap kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, x_buffer());
- kernel.SetArgument(2, y_buffer());
- }
- else {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, x_buffer());
- kernel.SetArgument(2, static_cast<int>(x_offset));
- kernel.SetArgument(3, static_cast<int>(x_inc));
- kernel.SetArgument(4, y_buffer());
- kernel.SetArgument(5, static_cast<int>(y_offset));
- kernel.SetArgument(6, static_cast<int>(y_inc));
- }
-
- // Launches the kernel
- if (use_fast_kernel) {
- auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- else {
- auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
- auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, x_buffer());
+ kernel.SetArgument(2, y_buffer());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, x_buffer());
+ kernel.SetArgument(2, static_cast<int>(x_offset));
+ kernel.SetArgument(3, static_cast<int>(x_inc));
+ kernel.SetArgument(4, y_buffer());
+ kernel.SetArgument(5, static_cast<int>(y_offset));
+ kernel.SetArgument(6, static_cast<int>(y_inc));
+ }
+
+ // Launches the kernel
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
+ else {
+ auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+ auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
}
// =================================================================================================
diff --git a/src/routines/level1/xswap.hpp b/src/routines/level1/xswap.hpp
index 4f9ea36d..eadf58e5 100644
--- a/src/routines/level1/xswap.hpp
+++ b/src/routines/level1/xswap.hpp
@@ -28,9 +28,9 @@ class Xswap: public Routine {
Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
// Templated-precision implementation of the routine
- StatusCode DoSwap(const size_t n,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoSwap(const size_t n,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xgbmv.cpp b/src/routines/level2/xgbmv.cpp
index ea4f001c..e80b9a96 100644
--- a/src/routines/level2/xgbmv.cpp
+++ b/src/routines/level2/xgbmv.cpp
@@ -29,13 +29,13 @@ Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Reverses the upper and lower band count
auto rotated = (layout == Layout::kRowMajor);
@@ -46,13 +46,13 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
// The specific hermitian matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_GBMV define.
bool fast_kernels = false;
- return MatVec(layout, a_transpose,
- m, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- 0, false, kl_real, ku_real);
+ MatVec(layout, a_transpose,
+ m, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ 0, false, kl_real, ku_real);
}
// =================================================================================================
diff --git a/src/routines/level2/xgbmv.hpp b/src/routines/level2/xgbmv.hpp
index 686ab642..e5f670ec 100644
--- a/src/routines/level2/xgbmv.hpp
+++ b/src/routines/level2/xgbmv.hpp
@@ -33,13 +33,13 @@ class Xgbmv: public Xgemv<T> {
Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
// Templated-precision implementation of the routine
- StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoGbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp
index 4e32ba41..7b4c2e8f 100644
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@@ -22,52 +22,51 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Performs the matrix-vector multiplication
- return MatVec(layout, a_transpose,
- m, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- true, true,
- 0, false, 0, 0); // N/A for this routine
+ MatVec(layout, a_transpose,
+ m, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ true, true,
+ 0, false, 0, 0); // N/A for this routine
}
// =================================================================================================
// The generic implementation, also suited for other (non general) matrix-vector multiplications
template <typename T>
-StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- bool fast_kernel, bool fast_kernel_rot,
- const size_t parameter, const bool packed,
- const size_t kl, const size_t ku) {
+void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ bool fast_kernel, bool fast_kernel_rot,
+ const size_t parameter, const bool packed,
+ const size_t kl, const size_t ku) {
// Makes sure all dimensions are larger than zero
- if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
+ if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrix has an alternative layout (row or column-major).
auto a_altlayout = (layout == Layout::kRowMajor);
@@ -91,14 +90,10 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
auto a_conjugate = (a_transpose == Transpose::kConjugate);
// Tests the matrix and the vectors for validity
- auto status = StatusCode::kSuccess;
- if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
- else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
- if (ErrorIn(status)) { return status; }
- status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+ else { TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
+ TestVectorX(n_real, x_buffer, x_offset, x_inc);
+ TestVectorY(m_real, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
@@ -127,39 +122,33 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
}
// Retrieves the Xgemv kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(m_real));
- kernel.SetArgument(1, static_cast<int>(n_real));
- kernel.SetArgument(2, GetRealArg(alpha));
- kernel.SetArgument(3, GetRealArg(beta));
- kernel.SetArgument(4, static_cast<int>(a_rotated));
- kernel.SetArgument(5, a_buffer());
- kernel.SetArgument(6, static_cast<int>(a_offset));
- kernel.SetArgument(7, static_cast<int>(a_ld));
- kernel.SetArgument(8, x_buffer());
- kernel.SetArgument(9, static_cast<int>(x_offset));
- kernel.SetArgument(10, static_cast<int>(x_inc));
- kernel.SetArgument(11, y_buffer());
- kernel.SetArgument(12, static_cast<int>(y_offset));
- kernel.SetArgument(13, static_cast<int>(y_inc));
- kernel.SetArgument(14, static_cast<int>(a_conjugate));
- kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
- kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
- kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
-
- // Launches the kernel
- auto global = std::vector<size_t>{global_size};
- auto local = std::vector<size_t>{local_size};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(m_real));
+ kernel.SetArgument(1, static_cast<int>(n_real));
+ kernel.SetArgument(2, GetRealArg(alpha));
+ kernel.SetArgument(3, GetRealArg(beta));
+ kernel.SetArgument(4, static_cast<int>(a_rotated));
+ kernel.SetArgument(5, a_buffer());
+ kernel.SetArgument(6, static_cast<int>(a_offset));
+ kernel.SetArgument(7, static_cast<int>(a_ld));
+ kernel.SetArgument(8, x_buffer());
+ kernel.SetArgument(9, static_cast<int>(x_offset));
+ kernel.SetArgument(10, static_cast<int>(x_inc));
+ kernel.SetArgument(11, y_buffer());
+ kernel.SetArgument(12, static_cast<int>(y_offset));
+ kernel.SetArgument(13, static_cast<int>(y_inc));
+ kernel.SetArgument(14, static_cast<int>(a_conjugate));
+ kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
+ kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
+ kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
+
+ // Launches the kernel
+ auto global = std::vector<size_t>{global_size};
+ auto local = std::vector<size_t>{local_size};
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
diff --git a/src/routines/level2/xgemv.hpp b/src/routines/level2/xgemv.hpp
index e9afec8d..1e1fa726 100644
--- a/src/routines/level2/xgemv.hpp
+++ b/src/routines/level2/xgemv.hpp
@@ -28,25 +28,25 @@ class Xgemv: public Routine {
Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
// Templated-precision implementation of the routine
- StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoGemv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
// Generic version used also for other matrix-vector multiplications
- StatusCode MatVec(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- bool fast_kernel, bool fast_kernel_rot,
- const size_t parameter, const bool packed,
- const size_t kl, const size_t ku);
+ void MatVec(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ bool fast_kernel, bool fast_kernel_rot,
+ const size_t parameter, const bool packed,
+ const size_t kl, const size_t ku);
};
// =================================================================================================
diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp
index 29cffe0c..d16ebd11 100644
--- a/src/routines/level2/xger.cpp
+++ b/src/routines/level2/xger.cpp
@@ -22,26 +22,25 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xger.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xger<T>::DoGer(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xger<T>::DoGer(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Makes sure all dimensions are larger than zero
- if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
+ if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrix has an alternative layout (row or column-major).
const auto a_is_rowmajor = (layout == Layout::kRowMajor);
@@ -49,44 +48,35 @@ StatusCode Xger<T>::DoGer(const Layout layout,
const auto a_two = (a_is_rowmajor) ? m : n;
// Tests the matrix and the vectors for validity
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestVectorX(m, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+ TestVectorX(m, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, "Xger");
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(a_one));
- kernel.SetArgument(1, static_cast<int>(a_two));
- kernel.SetArgument(2, GetRealArg(alpha));
- kernel.SetArgument(3, x_buffer());
- kernel.SetArgument(4, static_cast<int>(x_offset));
- kernel.SetArgument(5, static_cast<int>(x_inc));
- kernel.SetArgument(6, y_buffer());
- kernel.SetArgument(7, static_cast<int>(y_offset));
- kernel.SetArgument(8, static_cast<int>(y_inc));
- kernel.SetArgument(9, a_buffer());
- kernel.SetArgument(10, static_cast<int>(a_offset));
- kernel.SetArgument(11, static_cast<int>(a_ld));
- kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
-
- // Launches the kernel
- auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
- auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
- auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
- auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, "Xger");
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(a_one));
+ kernel.SetArgument(1, static_cast<int>(a_two));
+ kernel.SetArgument(2, GetRealArg(alpha));
+ kernel.SetArgument(3, x_buffer());
+ kernel.SetArgument(4, static_cast<int>(x_offset));
+ kernel.SetArgument(5, static_cast<int>(x_inc));
+ kernel.SetArgument(6, y_buffer());
+ kernel.SetArgument(7, static_cast<int>(y_offset));
+ kernel.SetArgument(8, static_cast<int>(y_inc));
+ kernel.SetArgument(9, a_buffer());
+ kernel.SetArgument(10, static_cast<int>(a_offset));
+ kernel.SetArgument(11, static_cast<int>(a_ld));
+ kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
+
+ // Launches the kernel
+ auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
+ auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
+ auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
+ auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
diff --git a/src/routines/level2/xger.hpp b/src/routines/level2/xger.hpp
index 3c6abe44..fbbb07a1 100644
--- a/src/routines/level2/xger.hpp
+++ b/src/routines/level2/xger.hpp
@@ -28,12 +28,12 @@ class Xger: public Routine {
Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
// Templated-precision implementation of the routine
- StatusCode DoGer(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+ void DoGer(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
diff --git a/src/routines/level2/xgerc.cpp b/src/routines/level2/xgerc.cpp
index d9feda97..4fa2e2a8 100644
--- a/src/routines/level2/xgerc.cpp
+++ b/src/routines/level2/xgerc.cpp
@@ -28,19 +28,19 @@ Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xgerc<T>::DoGerc(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xgerc<T>::DoGerc(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Regular Ger operation on complex data, plus conjugation in the kernel guarded by the
// ROUTINE_GERC guard.
- return DoGer(layout, m, n, alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld);
+ DoGer(layout, m, n, alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld);
}
// =================================================================================================
diff --git a/src/routines/level2/xgerc.hpp b/src/routines/level2/xgerc.hpp
index f1d04dfd..2d61f2b7 100644
--- a/src/routines/level2/xgerc.hpp
+++ b/src/routines/level2/xgerc.hpp
@@ -31,12 +31,12 @@ class Xgerc: public Xger<T> {
Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
// Templated-precision implementation of the routine
- StatusCode DoGerc(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+ void DoGerc(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
diff --git a/src/routines/level2/xgeru.cpp b/src/routines/level2/xgeru.cpp
index da9e91c2..c77e69c5 100644
--- a/src/routines/level2/xgeru.cpp
+++ b/src/routines/level2/xgeru.cpp
@@ -28,18 +28,18 @@ Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xgeru<T>::DoGeru(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xgeru<T>::DoGeru(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Regular Ger operation on complex data
- return DoGer(layout, m, n, alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld);
+ DoGer(layout, m, n, alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld);
}
// =================================================================================================
diff --git a/src/routines/level2/xgeru.hpp b/src/routines/level2/xgeru.hpp
index fb50e917..4cae6b58 100644
--- a/src/routines/level2/xgeru.hpp
+++ b/src/routines/level2/xgeru.hpp
@@ -31,12 +31,12 @@ class Xgeru: public Xger<T> {
Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
// Templated-precision implementation of the routine
- StatusCode DoGeru(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+ void DoGeru(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
diff --git a/src/routines/level2/xhbmv.cpp b/src/routines/level2/xhbmv.cpp
index f6c0e3c4..c7c9ed9d 100644
--- a/src/routines/level2/xhbmv.cpp
+++ b/src/routines/level2/xhbmv.cpp
@@ -29,13 +29,13 @@ Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
// The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HBMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, false, k, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, k, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xhbmv.hpp b/src/routines/level2/xhbmv.hpp
index d668eb88..76d3c91e 100644
--- a/src/routines/level2/xhbmv.hpp
+++ b/src/routines/level2/xhbmv.hpp
@@ -33,13 +33,13 @@ class Xhbmv: public Xgemv<T> {
Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
// Templated-precision implementation of the routine
- StatusCode DoHbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoHbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xhemv.cpp b/src/routines/level2/xhemv.cpp
index 2cbcf7b4..209ff654 100644
--- a/src/routines/level2/xhemv.cpp
+++ b/src/routines/level2/xhemv.cpp
@@ -29,13 +29,13 @@ Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
// The specific hermitian matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HEMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, false, 0, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xhemv.hpp b/src/routines/level2/xhemv.hpp
index 8e062fd3..20d2df22 100644
--- a/src/routines/level2/xhemv.hpp
+++ b/src/routines/level2/xhemv.hpp
@@ -33,13 +33,13 @@ class Xhemv: public Xgemv<T> {
Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
// Templated-precision implementation of the routine
- StatusCode DoHemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoHemv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp
index 6dd95938..6c334e63 100644
--- a/src/routines/level2/xher.cpp
+++ b/src/routines/level2/xher.cpp
@@ -21,11 +21,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher.opencl"
- ;
+ }) {
}
// =================================================================================================
@@ -41,15 +40,15 @@ template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }
// The main routine
template <typename T, typename U>
-StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
- const size_t n,
- const U alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const bool packed) {
+void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const U alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const bool packed) {
// Makes sure the dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -57,47 +56,38 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Tests the matrix and the vectors for validity
- auto status = StatusCode::kSuccess;
- if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
- else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
- if (ErrorIn(status)) { return status; }
- status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
+ if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+ else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
// If alpha is zero an update is not required
- if (alpha == U{0}) { return StatusCode::kSuccess; }
+ if (alpha == U{0}) { return; }
// Creates a matching version of alpha
const auto matching_alpha = GetAlpha(alpha);
// Retrieves the kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, "Xher");
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, GetRealArg(matching_alpha));
- kernel.SetArgument(2, x_buffer());
- kernel.SetArgument(3, static_cast<int>(x_offset));
- kernel.SetArgument(4, static_cast<int>(x_inc));
- kernel.SetArgument(5, a_buffer());
- kernel.SetArgument(6, static_cast<int>(a_offset));
- kernel.SetArgument(7, static_cast<int>(a_ld));
- kernel.SetArgument(8, static_cast<int>(is_upper));
- kernel.SetArgument(9, static_cast<int>(is_rowmajor));
-
- // Launches the kernel
- auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
- auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
- auto global = std::vector<size_t>{global_one, global_two};
- auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, "Xher");
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(matching_alpha));
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, static_cast<int>(x_offset));
+ kernel.SetArgument(4, static_cast<int>(x_inc));
+ kernel.SetArgument(5, a_buffer());
+ kernel.SetArgument(6, static_cast<int>(a_offset));
+ kernel.SetArgument(7, static_cast<int>(a_ld));
+ kernel.SetArgument(8, static_cast<int>(is_upper));
+ kernel.SetArgument(9, static_cast<int>(is_rowmajor));
+
+ // Launches the kernel
+ auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
+ auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
+ auto global = std::vector<size_t>{global_one, global_two};
+ auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
diff --git a/src/routines/level2/xher.hpp b/src/routines/level2/xher.hpp
index 9ff6bf3f..70a30bda 100644
--- a/src/routines/level2/xher.hpp
+++ b/src/routines/level2/xher.hpp
@@ -31,12 +31,12 @@ class Xher: public Routine {
T GetAlpha(const U alpha);
// Templated-precision implementation of the routine
- StatusCode DoHer(const Layout layout, const Triangle triangle,
- const size_t n,
- const U alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const bool packed = false);
+ void DoHer(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const U alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const bool packed = false);
};
// =================================================================================================
diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp
index 3d57a9b9..11e2c871 100644
--- a/src/routines/level2/xher2.cpp
+++ b/src/routines/level2/xher2.cpp
@@ -21,27 +21,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher2.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const bool packed) {
+void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const bool packed) {
// Makes sure the dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -49,46 +48,36 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Tests the matrix and the vectors for validity
- auto status = StatusCode::kSuccess;
- if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
- else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
- if (ErrorIn(status)) { return status; }
- status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+ else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, "Xher2");
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, GetRealArg(alpha));
- kernel.SetArgument(2, x_buffer());
- kernel.SetArgument(3, static_cast<int>(x_offset));
- kernel.SetArgument(4, static_cast<int>(x_inc));
- kernel.SetArgument(5, y_buffer());
- kernel.SetArgument(6, static_cast<int>(y_offset));
- kernel.SetArgument(7, static_cast<int>(y_inc));
- kernel.SetArgument(8, a_buffer());
- kernel.SetArgument(9, static_cast<int>(a_offset));
- kernel.SetArgument(10, static_cast<int>(a_ld));
- kernel.SetArgument(11, static_cast<int>(is_upper));
- kernel.SetArgument(12, static_cast<int>(is_rowmajor));
-
- // Launches the kernel
- auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
- auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
- auto global = std::vector<size_t>{global_one, global_two};
- auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, "Xher2");
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(alpha));
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, static_cast<int>(x_offset));
+ kernel.SetArgument(4, static_cast<int>(x_inc));
+ kernel.SetArgument(5, y_buffer());
+ kernel.SetArgument(6, static_cast<int>(y_offset));
+ kernel.SetArgument(7, static_cast<int>(y_inc));
+ kernel.SetArgument(8, a_buffer());
+ kernel.SetArgument(9, static_cast<int>(a_offset));
+ kernel.SetArgument(10, static_cast<int>(a_ld));
+ kernel.SetArgument(11, static_cast<int>(is_upper));
+ kernel.SetArgument(12, static_cast<int>(is_rowmajor));
+
+ // Launches the kernel
+ auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
+ auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
+ auto global = std::vector<size_t>{global_one, global_two};
+ auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
diff --git a/src/routines/level2/xher2.hpp b/src/routines/level2/xher2.hpp
index 8c53c047..dcb2ecb7 100644
--- a/src/routines/level2/xher2.hpp
+++ b/src/routines/level2/xher2.hpp
@@ -28,13 +28,13 @@ class Xher2: public Routine {
Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
// Templated-precision implementation of the routine
- StatusCode DoHer2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const bool packed = false);
+ void DoHer2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const bool packed = false);
};
// =================================================================================================
diff --git a/src/routines/level2/xhpmv.cpp b/src/routines/level2/xhpmv.cpp
index e6f82b34..70a0ab0d 100644
--- a/src/routines/level2/xhpmv.cpp
+++ b/src/routines/level2/xhpmv.cpp
@@ -29,13 +29,13 @@ Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
// The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HPMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- ap_buffer, ap_offset, n,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, true, 0, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ ap_buffer, ap_offset, n,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, true, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xhpmv.hpp b/src/routines/level2/xhpmv.hpp
index b11192f9..13a6277c 100644
--- a/src/routines/level2/xhpmv.hpp
+++ b/src/routines/level2/xhpmv.hpp
@@ -33,13 +33,13 @@ class Xhpmv: public Xgemv<T> {
Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
// Templated-precision implementation of the routine
- StatusCode DoHpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoHpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xhpr.cpp b/src/routines/level2/xhpr.cpp
index 225ebfe5..7e517c59 100644
--- a/src/routines/level2/xhpr.cpp
+++ b/src/routines/level2/xhpr.cpp
@@ -28,17 +28,17 @@ Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T, typename U>
-StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const U alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const U alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xhpr functionality is implemented in the kernel using defines
- return DoHer(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- ap_buffer, ap_offset, n,
- true); // packed matrix
+ DoHer(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ ap_buffer, ap_offset, n,
+ true); // packed matrix
}
// =================================================================================================
diff --git a/src/routines/level2/xhpr.hpp b/src/routines/level2/xhpr.hpp
index 37801c68..6ebc220e 100644
--- a/src/routines/level2/xhpr.hpp
+++ b/src/routines/level2/xhpr.hpp
@@ -31,11 +31,11 @@ class Xhpr: public Xher<T,U> {
Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
// Templated-precision implementation of the routine
- StatusCode DoHpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const U alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
+ void DoHpr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const U alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
diff --git a/src/routines/level2/xhpr2.cpp b/src/routines/level2/xhpr2.cpp
index 85f9d3f9..35daa365 100644
--- a/src/routines/level2/xhpr2.cpp
+++ b/src/routines/level2/xhpr2.cpp
@@ -28,19 +28,19 @@ Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xhpr2 functionality is implemented in the kernel using defines
- return DoHer2(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- ap_buffer, ap_offset, n,
- true); // packed matrix
+ DoHer2(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ ap_buffer, ap_offset, n,
+ true); // packed matrix
}
// =================================================================================================
diff --git a/src/routines/level2/xhpr2.hpp b/src/routines/level2/xhpr2.hpp
index d66dce55..f344fd48 100644
--- a/src/routines/level2/xhpr2.hpp
+++ b/src/routines/level2/xhpr2.hpp
@@ -31,12 +31,12 @@ class Xhpr2: public Xher2<T> {
Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
// Templated-precision implementation of the routine
- StatusCode DoHpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
+ void DoHpr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
diff --git a/src/routines/level2/xsbmv.cpp b/src/routines/level2/xsbmv.cpp
index 28730899..e47430d1 100644
--- a/src/routines/level2/xsbmv.cpp
+++ b/src/routines/level2/xsbmv.cpp
@@ -29,13 +29,13 @@ Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
// The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SBMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, false, k, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, k, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xsbmv.hpp b/src/routines/level2/xsbmv.hpp
index 16c5e9a8..a4542f49 100644
--- a/src/routines/level2/xsbmv.hpp
+++ b/src/routines/level2/xsbmv.hpp
@@ -33,13 +33,13 @@ class Xsbmv: public Xgemv<T> {
Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
// Templated-precision implementation of the routine
- StatusCode DoSbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoSbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xspmv.cpp b/src/routines/level2/xspmv.cpp
index f6651012..bf1a49e1 100644
--- a/src/routines/level2/xspmv.cpp
+++ b/src/routines/level2/xspmv.cpp
@@ -29,13 +29,13 @@ Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
// The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SPMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- ap_buffer, ap_offset, n,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, true, 0, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ ap_buffer, ap_offset, n,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, true, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xspmv.hpp b/src/routines/level2/xspmv.hpp
index a0c69b85..94caa4ac 100644
--- a/src/routines/level2/xspmv.hpp
+++ b/src/routines/level2/xspmv.hpp
@@ -33,13 +33,13 @@ class Xspmv: public Xgemv<T> {
Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
// Templated-precision implementation of the routine
- StatusCode DoSpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoSpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xspr.cpp b/src/routines/level2/xspr.cpp
index a75fe9c3..56791a7b 100644
--- a/src/routines/level2/xspr.cpp
+++ b/src/routines/level2/xspr.cpp
@@ -28,17 +28,17 @@ Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xspr functionality is implemented in the kernel using defines
- return DoHer(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- ap_buffer, ap_offset, n,
- true); // packed matrix
+ DoHer(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ ap_buffer, ap_offset, n,
+ true); // packed matrix
}
// =================================================================================================
diff --git a/src/routines/level2/xspr.hpp b/src/routines/level2/xspr.hpp
index 6468c736..760a2ddb 100644
--- a/src/routines/level2/xspr.hpp
+++ b/src/routines/level2/xspr.hpp
@@ -31,11 +31,11 @@ class Xspr: public Xher<T,T> {
Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
// Templated-precision implementation of the routine
- StatusCode DoSpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
+ void DoSpr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
diff --git a/src/routines/level2/xspr2.cpp b/src/routines/level2/xspr2.cpp
index c39a2eb4..8d0432c2 100644
--- a/src/routines/level2/xspr2.cpp
+++ b/src/routines/level2/xspr2.cpp
@@ -28,19 +28,19 @@ Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xspr2 functionality is implemented in the kernel using defines
- return DoHer2(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- ap_buffer, ap_offset, n,
- true); // packed matrix
+ DoHer2(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ ap_buffer, ap_offset, n,
+ true); // packed matrix
}
// =================================================================================================
diff --git a/src/routines/level2/xspr2.hpp b/src/routines/level2/xspr2.hpp
index 693c56a1..9f03f768 100644
--- a/src/routines/level2/xspr2.hpp
+++ b/src/routines/level2/xspr2.hpp
@@ -31,12 +31,12 @@ class Xspr2: public Xher2<T> {
Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
// Templated-precision implementation of the routine
- StatusCode DoSpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
+ void DoSpr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
diff --git a/src/routines/level2/xsymv.cpp b/src/routines/level2/xsymv.cpp
index 648d2a3e..86bb66b8 100644
--- a/src/routines/level2/xsymv.cpp
+++ b/src/routines/level2/xsymv.cpp
@@ -29,13 +29,13 @@ Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
// The specific symmetric matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SYMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, false, 0, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xsymv.hpp b/src/routines/level2/xsymv.hpp
index 67815f2f..3945802f 100644
--- a/src/routines/level2/xsymv.hpp
+++ b/src/routines/level2/xsymv.hpp
@@ -33,13 +33,13 @@ class Xsymv: public Xgemv<T> {
Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
// Templated-precision implementation of the routine
- StatusCode DoSymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoSymv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xsyr.cpp b/src/routines/level2/xsyr.cpp
index 758d8f8f..64c2dc74 100644
--- a/src/routines/level2/xsyr.cpp
+++ b/src/routines/level2/xsyr.cpp
@@ -28,16 +28,16 @@ Xsyr<T>::Xsyr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Specific Xsyr functionality is implemented in the kernel using defines
- return DoHer(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- a_buffer, a_offset, a_ld);
+ DoHer(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ a_buffer, a_offset, a_ld);
}
// =================================================================================================
diff --git a/src/routines/level2/xsyr.hpp b/src/routines/level2/xsyr.hpp
index 20393454..a23ff80f 100644
--- a/src/routines/level2/xsyr.hpp
+++ b/src/routines/level2/xsyr.hpp
@@ -31,11 +31,11 @@ class Xsyr: public Xher<T,T> {
Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
// Templated-precision implementation of the routine
- StatusCode DoSyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+ void DoSyr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
diff --git a/src/routines/level2/xsyr2.cpp b/src/routines/level2/xsyr2.cpp
index 6f43b219..38ca9d69 100644
--- a/src/routines/level2/xsyr2.cpp
+++ b/src/routines/level2/xsyr2.cpp
@@ -28,18 +28,18 @@ Xsyr2<T>::Xsyr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Specific Xsyr2 functionality is implemented in the kernel using defines
- return DoHer2(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld);
+ DoHer2(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld);
}
// =================================================================================================
diff --git a/src/routines/level2/xsyr2.hpp b/src/routines/level2/xsyr2.hpp
index 1a8dcbe8..5a8d8eb4 100644
--- a/src/routines/level2/xsyr2.hpp
+++ b/src/routines/level2/xsyr2.hpp
@@ -31,12 +31,12 @@ class Xsyr2: public Xher2<T> {
Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
// Templated-precision implementation of the routine
- StatusCode DoSyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+ void DoSyr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
diff --git a/src/routines/level2/xtbmv.cpp b/src/routines/level2/xtbmv.cpp
index e315c544..f4a58ed2 100644
--- a/src/routines/level2/xtbmv.cpp
+++ b/src/routines/level2/xtbmv.cpp
@@ -29,17 +29,15 @@ Xtbmv<T>::Xtbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Creates a copy of X: a temporary scratch buffer
auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
- try {
- x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
- } catch (...) { } // Continues: error-code is returned in MatVec
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -52,20 +50,22 @@ StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
// The specific triangular banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_TBMV define.
auto fast_kernels = false;
- auto status = MatVec(layout, a_transpose,
- n, n, static_cast<T>(1),
- a_buffer, a_offset, a_ld,
- scratch_buffer, x_offset, x_inc, static_cast<T>(0),
- x_buffer, x_offset, x_inc,
- fast_kernels, fast_kernels,
- parameter, false, k, 0);
-
- // Returns the proper error code (renames vector Y to X)
- switch(status) {
- case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
- case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
- case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
- default: return status;
+ try {
+ MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ a_buffer, a_offset, a_ld,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, false, k, 0);
+ } catch (BLASError &e) {
+ // Returns the proper error code (renames vector Y to X)
+ switch (e.status()) {
+ case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details());
+ case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details());
+ case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
+ default: throw;
+ }
}
}
diff --git a/src/routines/level2/xtbmv.hpp b/src/routines/level2/xtbmv.hpp
index 389e9705..abd12db6 100644
--- a/src/routines/level2/xtbmv.hpp
+++ b/src/routines/level2/xtbmv.hpp
@@ -35,11 +35,11 @@ class Xtbmv: public Xgemv<T> {
Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
// Templated-precision implementation of the routine
- StatusCode DoTbmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoTbmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xtpmv.cpp b/src/routines/level2/xtpmv.cpp
index 46811089..c0d26699 100644
--- a/src/routines/level2/xtpmv.cpp
+++ b/src/routines/level2/xtpmv.cpp
@@ -29,17 +29,15 @@ Xtpmv<T>::Xtpmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Creates a copy of X: a temporary scratch buffer
auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
- try {
- x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
- } catch (...) { } // Continues: error-code is returned in MatVec
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -52,20 +50,22 @@ StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
// The specific triangular packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_TPMV define.
auto fast_kernels = false;
- auto status = MatVec(layout, a_transpose,
- n, n, static_cast<T>(1),
- ap_buffer, ap_offset, n,
- scratch_buffer, x_offset, x_inc, static_cast<T>(0),
- x_buffer, x_offset, x_inc,
- fast_kernels, fast_kernels,
- parameter, true, 0, 0);
-
- // Returns the proper error code (renames vector Y to X)
- switch(status) {
- case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
- case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
- case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
- default: return status;
+ try {
+ MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ ap_buffer, ap_offset, n,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, true, 0, 0);
+ } catch (BLASError &e) {
+ // Returns the proper error code (renames vector Y to X)
+ switch (e.status()) {
+ case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details());
+ case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details());
+ case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
+ default: throw;
+ }
}
}
diff --git a/src/routines/level2/xtpmv.hpp b/src/routines/level2/xtpmv.hpp
index 0e8cf1d2..5b3954e8 100644
--- a/src/routines/level2/xtpmv.hpp
+++ b/src/routines/level2/xtpmv.hpp
@@ -35,11 +35,11 @@ class Xtpmv: public Xgemv<T> {
Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
// Templated-precision implementation of the routine
- StatusCode DoTpmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoTpmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xtrmv.cpp b/src/routines/level2/xtrmv.cpp
index d2f24252..5fff9b31 100644
--- a/src/routines/level2/xtrmv.cpp
+++ b/src/routines/level2/xtrmv.cpp
@@ -29,17 +29,15 @@ Xtrmv<T>::Xtrmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Creates a copy of X: a temporary scratch buffer
auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
- try {
- x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
- } catch (...) { } // Continues: error-code is returned in MatVec
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -52,20 +50,22 @@ StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
// The specific triangular matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_TRMV define.
auto fast_kernels = false;
- auto status = MatVec(layout, a_transpose,
- n, n, static_cast<T>(1),
- a_buffer, a_offset, a_ld,
- scratch_buffer, x_offset, x_inc, static_cast<T>(0),
- x_buffer, x_offset, x_inc,
- fast_kernels, fast_kernels,
- parameter, false, 0, 0);
-
- // Returns the proper error code (renames vector Y to X)
- switch(status) {
- case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
- case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
- case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
- default: return status;
+ try {
+ MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ a_buffer, a_offset, a_ld,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, false, 0, 0);
+ } catch (BLASError &e) {
+ // Returns the proper error code (renames vector Y to X)
+ switch (e.status()) {
+ case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details());
+ case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details());
+ case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
+ default: throw;
+ }
}
}
diff --git a/src/routines/level2/xtrmv.hpp b/src/routines/level2/xtrmv.hpp
index 07dd7841..b028ee68 100644
--- a/src/routines/level2/xtrmv.hpp
+++ b/src/routines/level2/xtrmv.hpp
@@ -35,11 +35,11 @@ class Xtrmv: public Xgemv<T> {
Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
// Templated-precision implementation of the routine
- StatusCode DoTrmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoTrmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 1602c69f..4f70dc7a 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -24,8 +24,7 @@ template <typename T>
Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name,
{"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
- PrecisionValue<T>()) {
- source_string_ =
+ PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@@ -37,30 +36,28 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_direct_part1.opencl"
#include "../../kernels/level3/xgemm_direct_part2.opencl"
#include "../../kernels/level3/xgemm_direct_part3.opencl"
- ;
- auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013
+ , // separated in two parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
- ;
- source_string_ += source_string_part_2;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xgemm<T>::DoGemm(const Layout layout,
- const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
+void Xgemm<T>::DoGemm(const Layout layout,
+ const Transpose a_transpose, const Transpose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
+ if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed. Note
@@ -99,12 +96,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// matrix A cannot be less than K when rotated, or less than M when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N when rotated, or less than M when not-rotated
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+ TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
+ TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
// Selects which version of GEMM to run
const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
@@ -131,7 +125,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// requirements, but several pre and post-processing kernels take care of those. However, the
// overhead of these extra kernels might not be ideal for certain devices/arguments.
template <typename T>
-StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
+void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
@@ -142,8 +136,6 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
const size_t a_one, const size_t a_two, const bool a_want_rotated,
const size_t b_one, const size_t b_two, const bool b_want_rotated,
const size_t c_one, const size_t c_two, const bool c_want_rotated) {
- auto status = StatusCode::kSuccess;
-
// Calculates the ceiled versions of m, n, and k
const auto m_ceiled = Ceil(m, db_["MWG"]);
const auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -158,109 +150,95 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled;
const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled;
- // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
- try {
-
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
- // Determines whether or not temporary matrices are needed
- auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
- a_do_transpose == false && a_conjugate == false;
- auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 &&
- b_do_transpose == false && b_conjugate == false;
- auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
- c_do_transpose == false;
-
- // Creates the temporary matrices
- const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
- const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
- const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
-
- // Events of all kernels (including pre/post processing kernels)
- auto eventWaitList = std::vector<Event>();
- auto emptyEventList = std::vector<Event>();
-
- // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
- // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
- // case nothing has to be done, these kernels can be skipped.
- if (!a_no_temp) {
- auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
- a_one, a_two, a_ld, a_offset, a_buffer,
- a_one_i, a_two_i, a_one_i, 0, a_temp,
- ConstantOne<T>(), program,
- true, a_do_transpose, a_conjugate);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessA);
- }
-
- // As above, but now for matrix B
- if (!b_no_temp) {
- auto eventProcessB = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
- b_one, b_two, b_ld, b_offset, b_buffer,
- b_one_i, b_two_i, b_one_i, 0, b_temp,
- ConstantOne<T>(), program,
- true, b_do_transpose, b_conjugate);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessB);
- }
-
- // As above, but now for matrix C. This is only necessary if C is used both as input and output.
- if (!c_no_temp && beta != static_cast<T>(0)) {
- auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
- c_one, c_two, c_ld, c_offset, c_buffer,
- c_one_i, c_two_i, c_one_i, 0, c_temp,
- ConstantOne<T>(), program,
- true, c_do_transpose, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessC);
- }
-
- // Retrieves the Xgemm kernel from the compiled binary
- try {
- auto kernel = Kernel(program, "Xgemm");
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(m_ceiled));
- kernel.SetArgument(1, static_cast<int>(n_ceiled));
- kernel.SetArgument(2, static_cast<int>(k_ceiled));
- kernel.SetArgument(3, GetRealArg(alpha));
- kernel.SetArgument(4, GetRealArg(beta));
- kernel.SetArgument(5, a_temp());
- kernel.SetArgument(6, b_temp());
- kernel.SetArgument(7, c_temp());
-
- // Computes the global and local thread sizes
- const auto global = std::vector<size_t>{
- (c_one_i * db_["MDIMC"]) / db_["MWG"],
- (c_two_i * db_["NDIMC"]) / db_["NWG"]
- };
- const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
- // Launches the kernel
- auto eventKernel = Event();
- auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
- status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
- if (ErrorIn(status)) { return status; }
-
- // Runs the post-processing kernel if needed
- if (!c_no_temp) {
- eventWaitList.push_back(eventKernel);
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
- c_one_i, c_two_i, c_one_i, 0, c_temp,
- c_one, c_two, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
- false, c_do_transpose, false);
- if (ErrorIn(status)) { return status; }
- }
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Determines whether or not temporary matrices are needed
+ auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
+ a_do_transpose == false && a_conjugate == false;
+ auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 &&
+ b_do_transpose == false && b_conjugate == false;
+ auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
+ c_do_transpose == false;
+
+ // Creates the temporary matrices
+ const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
+ const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
+ const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
+
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
+ // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+ // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+ // case nothing has to be done, these kernels can be skipped.
+ if (!a_no_temp) {
+ auto eventProcessA = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ a_one_i, a_two_i, a_one_i, 0, a_temp,
+ ConstantOne<T>(), program,
+ true, a_do_transpose, a_conjugate);
+ eventWaitList.push_back(eventProcessA);
+ }
+
+ // As above, but now for matrix B
+ if (!b_no_temp) {
+ auto eventProcessB = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
+ b_one, b_two, b_ld, b_offset, b_buffer,
+ b_one_i, b_two_i, b_one_i, 0, b_temp,
+ ConstantOne<T>(), program,
+ true, b_do_transpose, b_conjugate);
+ eventWaitList.push_back(eventProcessB);
+ }
+
+ // As above, but now for matrix C. This is only necessary if C is used both as input and output.
+ if (!c_no_temp && beta != static_cast<T>(0)) {
+ auto eventProcessC = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+ c_one, c_two, c_ld, c_offset, c_buffer,
+ c_one_i, c_two_i, c_one_i, 0, c_temp,
+ ConstantOne<T>(), program,
+ true, c_do_transpose, false);
+ eventWaitList.push_back(eventProcessC);
+ }
+
+ // Retrieves the Xgemm kernel from the compiled binary
+ auto kernel = Kernel(program, "Xgemm");
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(m_ceiled));
+ kernel.SetArgument(1, static_cast<int>(n_ceiled));
+ kernel.SetArgument(2, static_cast<int>(k_ceiled));
+ kernel.SetArgument(3, GetRealArg(alpha));
+ kernel.SetArgument(4, GetRealArg(beta));
+ kernel.SetArgument(5, a_temp());
+ kernel.SetArgument(6, b_temp());
+ kernel.SetArgument(7, c_temp());
+
+ // Computes the global and local thread sizes
+ const auto global = std::vector<size_t>{
+ (c_one_i * db_["MDIMC"]) / db_["MWG"],
+ (c_two_i * db_["NDIMC"]) / db_["NWG"]
+ };
+ const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ auto eventKernel = Event();
+ auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
+ RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
+
+ // Runs the post-processing kernel if needed
+ if (!c_no_temp) {
+ eventWaitList.push_back(eventKernel);
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+ c_one_i, c_two_i, c_one_i, 0, c_temp,
+ c_one, c_two, c_ld, c_offset, c_buffer,
+ ConstantOne<T>(), program,
+ false, c_do_transpose, false);
+ }
}
@@ -268,7 +246,7 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
template <typename T>
-StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
+void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
@@ -281,46 +259,40 @@ StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Retrieves the proper XgemmDirect kernel from the compiled binary
- try {
- const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
- (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
- auto kernel = Kernel(program, name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(m));
- kernel.SetArgument(1, static_cast<int>(n));
- kernel.SetArgument(2, static_cast<int>(k));
- kernel.SetArgument(3, GetRealArg(alpha));
- kernel.SetArgument(4, GetRealArg(beta));
- kernel.SetArgument(5, a_buffer());
- kernel.SetArgument(6, static_cast<int>(a_offset));
- kernel.SetArgument(7, static_cast<int>(a_ld));
- kernel.SetArgument(8, b_buffer());
- kernel.SetArgument(9, static_cast<int>(b_offset));
- kernel.SetArgument(10, static_cast<int>(b_ld));
- kernel.SetArgument(11, c_buffer());
- kernel.SetArgument(12, static_cast<int>(c_offset));
- kernel.SetArgument(13, static_cast<int>(c_ld));
- kernel.SetArgument(14, static_cast<int>(c_do_transpose));
- kernel.SetArgument(15, static_cast<int>(a_conjugate));
- kernel.SetArgument(16, static_cast<int>(b_conjugate));
-
- // Computes the global and local thread sizes
- const auto m_ceiled = Ceil(m, db_["WGD"]);
- const auto n_ceiled = Ceil(n, db_["WGD"]);
- const auto global = std::vector<size_t>{
- (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
- (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
- };
- const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
-
- // Launches the kernel
- auto status = RunKernel(kernel, queue_, device_, global, local, event_);
- if (ErrorIn(status)) { return status; }
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
+ (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
+ auto kernel = Kernel(program, name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(m));
+ kernel.SetArgument(1, static_cast<int>(n));
+ kernel.SetArgument(2, static_cast<int>(k));
+ kernel.SetArgument(3, GetRealArg(alpha));
+ kernel.SetArgument(4, GetRealArg(beta));
+ kernel.SetArgument(5, a_buffer());
+ kernel.SetArgument(6, static_cast<int>(a_offset));
+ kernel.SetArgument(7, static_cast<int>(a_ld));
+ kernel.SetArgument(8, b_buffer());
+ kernel.SetArgument(9, static_cast<int>(b_offset));
+ kernel.SetArgument(10, static_cast<int>(b_ld));
+ kernel.SetArgument(11, c_buffer());
+ kernel.SetArgument(12, static_cast<int>(c_offset));
+ kernel.SetArgument(13, static_cast<int>(c_ld));
+ kernel.SetArgument(14, static_cast<int>(c_do_transpose));
+ kernel.SetArgument(15, static_cast<int>(a_conjugate));
+ kernel.SetArgument(16, static_cast<int>(b_conjugate));
+
+ // Computes the global and local thread sizes
+ const auto m_ceiled = Ceil(m, db_["WGD"]);
+ const auto n_ceiled = Ceil(n, db_["WGD"]);
+ const auto global = std::vector<size_t>{
+ (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
+ (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
+ };
+ const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
+
+ // Launches the kernel
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp
index 46e12453..c61611b6 100644
--- a/src/routines/level3/xgemm.hpp
+++ b/src/routines/level3/xgemm.hpp
@@ -28,36 +28,36 @@ class Xgemm: public Routine {
Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
// Templated-precision implementation of the routine
- StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
+ void DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ // Indirect version of GEMM (with pre and post-processing kernels)
+ void GemmIndirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-
- // Indirect version of GEMM (with pre and post-processing kernels)
- StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
- const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
- const bool a_conjugate, const bool b_conjugate,
- const size_t a_one, const size_t a_two, const bool a_want_rotated,
- const size_t b_one, const size_t b_two, const bool b_want_rotated,
- const size_t c_one, const size_t c_two, const bool c_want_rotated);
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate,
+ const size_t a_one, const size_t a_two, const bool a_want_rotated,
+ const size_t b_one, const size_t b_two, const bool b_want_rotated,
+ const size_t c_one, const size_t c_two, const bool c_want_rotated);
// Direct version of GEMM (no pre and post-processing kernels)
- StatusCode GemmDirect(const size_t m, const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
- const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
- const bool a_conjugate, const bool b_conjugate);
+ void GemmDirect(const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate);
};
// =================================================================================================
diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp
index 9813503e..e5b1502a 100644
--- a/src/routines/level3/xhemm.cpp
+++ b/src/routines/level3/xhemm.cpp
@@ -29,7 +29,7 @@ Xhemm<T>::Xhemm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
+void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -38,15 +38,14 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
// left) or B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the squared A matrix
- auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(k, k, a_buffer, a_offset, a_ld);
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
@@ -55,73 +54,68 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
// Temporary buffer for a copy of the hermitian matrix
- try {
- auto temp_herm = Buffer<T>(context_, k*k);
-
- // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
- // routine afterwards
+ auto temp_herm = Buffer<T>(context_, k*k);
+
+ // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
+ // routine afterwards
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the arguments for the hermitian-to-squared kernel
+ kernel.SetArgument(0, static_cast<int>(k));
+ kernel.SetArgument(1, static_cast<int>(a_ld));
+ kernel.SetArgument(2, static_cast<int>(a_offset));
+ kernel.SetArgument(3, a_buffer());
+ kernel.SetArgument(4, static_cast<int>(k));
+ kernel.SetArgument(5, static_cast<int>(k));
+ kernel.SetArgument(6, static_cast<int>(0));
+ kernel.SetArgument(7, temp_herm());
+
+ // Uses the common padding kernel's thread configuration. This is allowed, since the
+ // hermitian-to-squared kernel uses the same parameters.
+ auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+ Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+ auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
+
+ // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+ kernelEvent.WaitForCompletion();
+
+ // Runs the regular Xgemm code with either "C := AB+C" or ...
+ if (side == Side::kLeft) {
+ DoGemm(layout, Transpose::kNo, Transpose::kNo,
+ m, n, k,
+ alpha,
+ temp_herm, 0, k,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld);
+ }
+
+ // ... with "C := BA+C". Note that A and B are now reversed.
+ else {
try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the arguments for the hermitian-to-squared kernel
- kernel.SetArgument(0, static_cast<int>(k));
- kernel.SetArgument(1, static_cast<int>(a_ld));
- kernel.SetArgument(2, static_cast<int>(a_offset));
- kernel.SetArgument(3, a_buffer());
- kernel.SetArgument(4, static_cast<int>(k));
- kernel.SetArgument(5, static_cast<int>(k));
- kernel.SetArgument(6, static_cast<int>(0));
- kernel.SetArgument(7, temp_herm());
-
- // Uses the common padding kernel's thread configuration. This is allowed, since the
- // hermitian-to-squared kernel uses the same parameters.
- auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
- Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
- auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
-
- // Synchronize now: 'DoGemm' does not accept a list of events to wait for
- kernelEvent.WaitForCompletion();
-
- // Runs the regular Xgemm code with either "C := AB+C" or ...
- if (side == Side::kLeft) {
- status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
- m, n, k,
- alpha,
- temp_herm, 0, k,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld);
- }
-
- // ... with "C := BA+C". Note that A and B are now reversed.
- else {
- status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
- m, n, k,
- alpha,
- b_buffer, b_offset, b_ld,
- temp_herm, 0, k,
- beta,
- c_buffer, c_offset, c_ld);
-
- // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
- switch(status) {
- case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
- case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
- case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
- case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
- case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
- case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
- }
+ DoGemm(layout, Transpose::kNo, Transpose::kNo,
+ m, n, k,
+ alpha,
+ b_buffer, b_offset, b_ld,
+ temp_herm, 0, k,
+ beta,
+ c_buffer, c_offset, c_ld);
+ } catch (BLASError &e) {
+ // A and B are now reversed, so also reverse the error codes thrown by the Xgemm routine
+ switch(e.status()) {
+ case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details());
+ case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details());
+ case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details());
+ case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details());
+ case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details());
+ case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details());
+ default: throw;
}
-
- // Return the status of the Xgemm routine
- return status;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ }
+ }
}
// =================================================================================================
diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp
index 272bd2ec..2385706e 100644
--- a/src/routines/level3/xhemm.hpp
+++ b/src/routines/level3/xhemm.hpp
@@ -37,13 +37,13 @@ class Xhemm: public Xgemm<T> {
Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");
// Templated-precision implementation of the routine
- StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoHemm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp
index ba770065..ee3bb8b8 100644
--- a/src/routines/level3/xher2k.cpp
+++ b/src/routines/level3/xher2k.cpp
@@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@@ -32,23 +31,23 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T, typename U>
-StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const U beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
+void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const U beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
// to matrix A (argument: conjugate transpose)
@@ -71,159 +70,139 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
- auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
+ TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
+ TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
- auto n_ceiled = Ceil(n, db_["NWG"]);
+ auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
- try {
-
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
- // Determines whether or not temporary matrices are needed
- auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- ab_rotated == false && ab_conjugate == false;
- auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- ab_rotated == false && ab_conjugate == true;
- auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
- ab_rotated == false && ab_conjugate == false;
- auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
- ab_rotated == false && ab_conjugate == true;
-
- // Creates the temporary matrices
- auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
-
- // Convert the arguments to complex versions
- auto complex_beta = T{beta, static_cast<U>(0.0)};
-
- // Events of all kernels (including pre/post processing kernels)
- auto eventWaitList = std::vector<Event>();
- auto emptyEventList = std::vector<Event>();
-
- // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
- // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
- // case nothing has to be done, these kernels can be skipped.
- if (!a1_no_temp) {
- auto eventProcessA1 = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
- ab_one, ab_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, ab_conjugate);
- eventWaitList.push_back(eventProcessA1);
- if (ErrorIn(status)) { return status; }
- }
- if (!a2_no_temp) {
- auto eventProcessA2 = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
- ab_one, ab_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, !ab_conjugate);
- eventWaitList.push_back(eventProcessA2);
- if (ErrorIn(status)) { return status; }
- }
- if (!b1_no_temp) {
- auto eventProcessB1 = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
- ab_one, ab_two, b_ld, b_offset, b_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, ab_conjugate);
- eventWaitList.push_back(eventProcessB1);
- if (ErrorIn(status)) { return status; }
- }
- if (!b2_no_temp) {
- auto eventProcessB2 = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
- ab_one, ab_two, b_ld, b_offset, b_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, !ab_conjugate);
- eventWaitList.push_back(eventProcessB2);
- if (ErrorIn(status)) { return status; }
- }
-
- // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
- // modify the other triangle.
- auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
- n, n, c_ld, c_offset, c_buffer,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
- true, c_rotated, false);
- eventWaitList.push_back(eventProcessC);
- if (ErrorIn(status)) { return status; }
-
- // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n_ceiled));
- kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, GetRealArg(alpha));
- kernel.SetArgument(3, GetRealArg(complex_beta));
- kernel.SetArgument(4, a1_temp());
- kernel.SetArgument(5, b2_temp());
- kernel.SetArgument(6, c_temp());
-
- // Computes the global and local thread sizes
- auto global = std::vector<size_t>{
- (n_ceiled * db_["MDIMC"]) / db_["MWG"],
- (n_ceiled * db_["NDIMC"]) / db_["NWG"]
- };
- auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
- // Launches the kernel
- auto eventKernel1 = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel1);
-
- // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
- auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
- auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
- kernel.SetArgument(2, GetRealArg(conjugate_alpha));
- kernel.SetArgument(3, GetRealArg(complex_one));
- kernel.SetArgument(4, b1_temp());
- kernel.SetArgument(5, a2_temp());
-
- // Runs the kernel again
- auto eventKernel2 = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel2);
-
- // Runs the post-processing kernel
- auto upper = (triangle == Triangle::kUpper);
- auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
- false, c_rotated, false, upper, lower, true);
- if (ErrorIn(status)) { return status; }
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Determines whether or not temporary matrices are needed
+ auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ ab_rotated == false && ab_conjugate == false;
+ auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ ab_rotated == false && ab_conjugate == true;
+ auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+ ab_rotated == false && ab_conjugate == false;
+ auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+ ab_rotated == false && ab_conjugate == true;
+
+ // Creates the temporary matrices
+ auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+
+ // Convert the arguments to complex versions
+ auto complex_beta = T{beta, static_cast<U>(0.0)};
+
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
+ // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
+ // fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+ // case nothing has to be done, these kernels can be skipped.
+ if (!a1_no_temp) {
+ auto eventProcessA1 = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
+ ab_one, ab_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, ab_conjugate);
+ eventWaitList.push_back(eventProcessA1);
+ }
+ if (!a2_no_temp) {
+ auto eventProcessA2 = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
+ ab_one, ab_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, !ab_conjugate);
+ eventWaitList.push_back(eventProcessA2);
+ }
+ if (!b1_no_temp) {
+ auto eventProcessB1 = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
+ ab_one, ab_two, b_ld, b_offset, b_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, ab_conjugate);
+ eventWaitList.push_back(eventProcessB1);
+ }
+ if (!b2_no_temp) {
+ auto eventProcessB2 = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
+ ab_one, ab_two, b_ld, b_offset, b_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, !ab_conjugate);
+ eventWaitList.push_back(eventProcessB2);
+ }
+
+ // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+ // modify the other triangle.
+ auto eventProcessC = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
+ eventWaitList.push_back(eventProcessC);
+
+ // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n_ceiled));
+ kernel.SetArgument(1, static_cast<int>(k_ceiled));
+ kernel.SetArgument(2, GetRealArg(alpha));
+ kernel.SetArgument(3, GetRealArg(complex_beta));
+ kernel.SetArgument(4, a1_temp());
+ kernel.SetArgument(5, b2_temp());
+ kernel.SetArgument(6, c_temp());
+
+ // Computes the global and local thread sizes
+ auto global = std::vector<size_t>{
+ (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+ (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+ };
+ auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ auto eventKernel1 = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel1);
+
+ // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
+ auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
+ auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
+ kernel.SetArgument(2, GetRealArg(conjugate_alpha));
+ kernel.SetArgument(3, GetRealArg(complex_one));
+ kernel.SetArgument(4, b1_temp());
+ kernel.SetArgument(5, a2_temp());
+
+ // Runs the kernel again
+ auto eventKernel2 = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel2);
+
+ // Runs the post-processing kernel
+ auto upper = (triangle == Triangle::kUpper);
+ auto lower = (triangle == Triangle::kLower);
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ n, n, c_ld, c_offset, c_buffer,
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, true);
}
// =================================================================================================
diff --git a/src/routines/level3/xher2k.hpp b/src/routines/level3/xher2k.hpp
index 23996219..acc346e4 100644
--- a/src/routines/level3/xher2k.hpp
+++ b/src/routines/level3/xher2k.hpp
@@ -30,13 +30,13 @@ class Xher2k: public Routine {
Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
// Templated-precision implementation of the routine
- StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const U beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const U beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp
index 3063f3bc..ae8e9324 100644
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@@ -32,14 +31,14 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T, typename U>
-StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const U alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -47,7 +46,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
// to matrix A (argument: conjugate transpose)
@@ -70,118 +69,102 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+ TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
- auto n_ceiled = Ceil(n, db_["NWG"]);
+ auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
- try {
-
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
- // Determines whether or not temporary matrices are needed
- auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- a_rotated == false && a_conjugate == false;
- auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- a_rotated == false && b_conjugate == false;
-
- // Creates the temporary matrices
- auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
-
- // Convert the arguments to complex versions
- auto complex_alpha = T{alpha, static_cast<U>(0.0)};
- auto complex_beta = T{beta, static_cast<U>(0.0)};
-
- // Events of all kernels (including pre/post processing kernels)
- auto eventWaitList = std::vector<Event>();
- auto emptyEventList = std::vector<Event>();
-
- // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
- // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
- // case nothing has to be done, these kernels can be skipped. Two copies are created.
- if (!a_no_temp) {
- auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
- a_one, a_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- ConstantOne<T>(), program,
- true, a_rotated, a_conjugate);
- eventWaitList.push_back(eventProcessA);
- if (ErrorIn(status)) { return status; }
- }
- if (!b_no_temp) {
- auto eventProcessB = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
- a_one, a_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- ConstantOne<T>(), program,
- true, a_rotated, b_conjugate);
- eventWaitList.push_back(eventProcessB);
- if (ErrorIn(status)) { return status; }
- }
-
- // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
- // modify the other triangle.
- auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
- n, n, c_ld, c_offset, c_buffer,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
- true, c_rotated, false);
- eventWaitList.push_back(eventProcessC);
- if (ErrorIn(status)) { return status; }
-
- // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n_ceiled));
- kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, GetRealArg(complex_alpha));
- kernel.SetArgument(3, GetRealArg(complex_beta));
- kernel.SetArgument(4, a_temp());
- kernel.SetArgument(5, b_temp());
- kernel.SetArgument(6, c_temp());
-
- // Computes the global and local thread sizes
- auto global = std::vector<size_t>{
- (n_ceiled * db_["MDIMC"]) / db_["MWG"],
- (n_ceiled * db_["NDIMC"]) / db_["NWG"]
- };
- auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
- // Launches the kernel
- auto eventKernel = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel);
-
- // Runs the post-processing kernel
- auto upper = (triangle == Triangle::kUpper);
- auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
- false, c_rotated, false, upper, lower, true);
- if (ErrorIn(status)) { return status; }
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Determines whether or not temporary matrices are needed
+ auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ a_rotated == false && a_conjugate == false;
+ auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ a_rotated == false && b_conjugate == false;
+
+ // Creates the temporary matrices
+ auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+
+ // Convert the arguments to complex versions
+ auto complex_alpha = T{alpha, static_cast<U>(0.0)};
+ auto complex_beta = T{beta, static_cast<U>(0.0)};
+
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
+ // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+ // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+ // case nothing has to be done, these kernels can be skipped. Two copies are created.
+ if (!a_no_temp) {
+ auto eventProcessA = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+ ConstantOne<T>(), program,
+ true, a_rotated, a_conjugate);
+ eventWaitList.push_back(eventProcessA);
+ }
+ if (!b_no_temp) {
+ auto eventProcessB = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
+ ConstantOne<T>(), program,
+ true, a_rotated, b_conjugate);
+ eventWaitList.push_back(eventProcessB);
+ }
+
+ // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+ // modify the other triangle.
+ auto eventProcessC = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
+ eventWaitList.push_back(eventProcessC);
+
+ // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n_ceiled));
+ kernel.SetArgument(1, static_cast<int>(k_ceiled));
+ kernel.SetArgument(2, GetRealArg(complex_alpha));
+ kernel.SetArgument(3, GetRealArg(complex_beta));
+ kernel.SetArgument(4, a_temp());
+ kernel.SetArgument(5, b_temp());
+ kernel.SetArgument(6, c_temp());
+
+ // Computes the global and local thread sizes
+ auto global = std::vector<size_t>{
+ (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+ (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+ };
+ auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ auto eventKernel = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel);
+
+ // Runs the post-processing kernel
+ auto upper = (triangle == Triangle::kUpper);
+ auto lower = (triangle == Triangle::kLower);
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ n, n, c_ld, c_offset, c_buffer,
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, true);
}
// =================================================================================================
diff --git a/src/routines/level3/xherk.hpp b/src/routines/level3/xherk.hpp
index 3f156a1b..51f29d7e 100644
--- a/src/routines/level3/xherk.hpp
+++ b/src/routines/level3/xherk.hpp
@@ -30,12 +30,12 @@ class Xherk: public Routine {
Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
// Templated-precision implementation of the routine
- StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const U alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const U beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const size_t n, const size_t k,
+ const U alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const U beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp
index 04e4b718..d7f771d1 100644
--- a/src/routines/level3/xsymm.cpp
+++ b/src/routines/level3/xsymm.cpp
@@ -29,7 +29,7 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
+void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -38,15 +38,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
// left) or B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the squared A matrix
- auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(k, k, a_buffer, a_offset, a_ld);
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
@@ -55,73 +54,68 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
// Temporary buffer for a copy of the symmetric matrix
- try {
- auto temp_symm = Buffer<T>(context_, k*k);
-
- // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
- // routine afterwards
+ auto temp_symm = Buffer<T>(context_, k*k);
+
+ // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
+ // routine afterwards
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the arguments for the symmetric-to-squared kernel
+ kernel.SetArgument(0, static_cast<int>(k));
+ kernel.SetArgument(1, static_cast<int>(a_ld));
+ kernel.SetArgument(2, static_cast<int>(a_offset));
+ kernel.SetArgument(3, a_buffer());
+ kernel.SetArgument(4, static_cast<int>(k));
+ kernel.SetArgument(5, static_cast<int>(k));
+ kernel.SetArgument(6, static_cast<int>(0));
+ kernel.SetArgument(7, temp_symm());
+
+ // Uses the common padding kernel's thread configuration. This is allowed, since the
+ // symmetric-to-squared kernel uses the same parameters.
+ auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+ Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+ auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
+
+ // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+ kernelEvent.WaitForCompletion();
+
+ // Runs the regular Xgemm code with either "C := AB+C" or ...
+ if (side == Side::kLeft) {
+ DoGemm(layout, Transpose::kNo, Transpose::kNo,
+ m, n, k,
+ alpha,
+ temp_symm, 0, k,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld);
+ }
+
+ // ... with "C := BA+C". Note that A and B are now reversed.
+ else {
try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the arguments for the symmetric-to-squared kernel
- kernel.SetArgument(0, static_cast<int>(k));
- kernel.SetArgument(1, static_cast<int>(a_ld));
- kernel.SetArgument(2, static_cast<int>(a_offset));
- kernel.SetArgument(3, a_buffer());
- kernel.SetArgument(4, static_cast<int>(k));
- kernel.SetArgument(5, static_cast<int>(k));
- kernel.SetArgument(6, static_cast<int>(0));
- kernel.SetArgument(7, temp_symm());
-
- // Uses the common padding kernel's thread configuration. This is allowed, since the
- // symmetric-to-squared kernel uses the same parameters.
- auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
- Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
- auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
-
- // Synchronize now: 'DoGemm' does not accept a list of events to wait for
- kernelEvent.WaitForCompletion();
-
- // Runs the regular Xgemm code with either "C := AB+C" or ...
- if (side == Side::kLeft) {
- status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
- m, n, k,
- alpha,
- temp_symm, 0, k,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld);
- }
-
- // ... with "C := BA+C". Note that A and B are now reversed.
- else {
- status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
- m, n, k,
- alpha,
- b_buffer, b_offset, b_ld,
- temp_symm, 0, k,
- beta,
- c_buffer, c_offset, c_ld);
-
- // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
- switch(status) {
- case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
- case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
- case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
- case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
- case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
- case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
- }
+ DoGemm(layout, Transpose::kNo, Transpose::kNo,
+ m, n, k,
+ alpha,
+ b_buffer, b_offset, b_ld,
+ temp_symm, 0, k,
+ beta,
+ c_buffer, c_offset, c_ld);
+ } catch (BLASError &e) {
+ // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+ switch(e.status()) {
+ case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details());
+ case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details());
+ case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details());
+ case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details());
+ case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details());
+ case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details());
+ default: throw;
}
-
- // Return the status of the Xgemm routine
- return status;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ }
+ }
}
// =================================================================================================
diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp
index 428f78ef..ee965364 100644
--- a/src/routines/level3/xsymm.hpp
+++ b/src/routines/level3/xsymm.hpp
@@ -39,13 +39,13 @@ class Xsymm: public Xgemm<T> {
Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");
// Templated-precision implementation of the routine
- StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoSymm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp
index 158cd9e5..cb0e0461 100644
--- a/src/routines/level3/xsyr2k.cpp
+++ b/src/routines/level3/xsyr2k.cpp
@@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@@ -32,14 +31,14 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -48,7 +47,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
@@ -67,128 +66,110 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
- auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
+ TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
+ TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
- auto n_ceiled = Ceil(n, db_["NWG"]);
+ auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
- try {
-
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
- // Determines whether or not temporary matrices are needed
- auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- ab_rotated == false;
- auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
- ab_rotated == false;
-
- // Creates the temporary matrices
- auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
-
- // Events of all kernels (including pre/post processing kernels)
- auto eventWaitList = std::vector<Event>();
- auto emptyEventList = std::vector<Event>();
-
- // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
- // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
- // case nothing has to be done, these kernels can be skipped.
- if (!a_no_temp) {
- auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
- ab_one, ab_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessA);
- }
- if (!b_no_temp) {
- auto eventProcessB = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
- ab_one, ab_two, b_ld, b_offset, b_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessB);
- }
-
- // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
- // modify the other triangle.
- auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
- n, n, c_ld, c_offset, c_buffer,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
- true, c_rotated, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessC);
-
- // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n_ceiled));
- kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, GetRealArg(alpha));
- kernel.SetArgument(3, GetRealArg(beta));
- kernel.SetArgument(4, a_temp());
- kernel.SetArgument(5, b_temp());
- kernel.SetArgument(6, c_temp());
-
- // Computes the global and local thread sizes
- auto global = std::vector<size_t>{
- (n_ceiled * db_["MDIMC"]) / db_["MWG"],
- (n_ceiled * db_["NDIMC"]) / db_["NWG"]
- };
- auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
- // Launches the kernel
- auto eventKernel1 = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel1);
-
- // Swaps the arguments for matrices A and B, and sets 'beta' to 1
- auto one = static_cast<T>(1);
- kernel.SetArgument(3, GetRealArg(one));
- kernel.SetArgument(4, b_temp());
- kernel.SetArgument(5, a_temp());
-
- // Runs the kernel again
- auto eventKernel2 = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel2);
-
- // Runs the post-processing kernel
- auto upper = (triangle == Triangle::kUpper);
- auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
- false, c_rotated, false, upper, lower, false);
- if (ErrorIn(status)) { return status; }
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Determines whether or not temporary matrices are needed
+ auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ ab_rotated == false;
+ auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+ ab_rotated == false;
+
+ // Creates the temporary matrices
+ auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
+ // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros
+ // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+ // case nothing has to be done, these kernels can be skipped.
+ if (!a_no_temp) {
+ auto eventProcessA = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+ ab_one, ab_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, false);
+ eventWaitList.push_back(eventProcessA);
+ }
+ if (!b_no_temp) {
+ auto eventProcessB = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
+ ab_one, ab_two, b_ld, b_offset, b_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, false);
+ eventWaitList.push_back(eventProcessB);
+ }
+
+ // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+ // modify the other triangle.
+ auto eventProcessC = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
+ eventWaitList.push_back(eventProcessC);
+
+ // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n_ceiled));
+ kernel.SetArgument(1, static_cast<int>(k_ceiled));
+ kernel.SetArgument(2, GetRealArg(alpha));
+ kernel.SetArgument(3, GetRealArg(beta));
+ kernel.SetArgument(4, a_temp());
+ kernel.SetArgument(5, b_temp());
+ kernel.SetArgument(6, c_temp());
+
+ // Computes the global and local thread sizes
+ auto global = std::vector<size_t>{
+ (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+ (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+ };
+ auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ auto eventKernel1 = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel1);
+
+ // Swaps the arguments for matrices A and B, and sets 'beta' to 1
+ auto one = static_cast<T>(1);
+ kernel.SetArgument(3, GetRealArg(one));
+ kernel.SetArgument(4, b_temp());
+ kernel.SetArgument(5, a_temp());
+
+ // Runs the kernel again
+ auto eventKernel2 = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel2);
+
+ // Runs the post-processing kernel
+ auto upper = (triangle == Triangle::kUpper);
+ auto lower = (triangle == Triangle::kLower);
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ n, n, c_ld, c_offset, c_buffer,
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, false);
}
// =================================================================================================
diff --git a/src/routines/level3/xsyr2k.hpp b/src/routines/level3/xsyr2k.hpp
index 56185653..a02c6e16 100644
--- a/src/routines/level3/xsyr2k.hpp
+++ b/src/routines/level3/xsyr2k.hpp
@@ -30,13 +30,13 @@ class Xsyr2k: public Routine {
Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
// Templated-precision implementation of the routine
- StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp
index e1a72ef6..bd6c4b25 100644
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@@ -32,14 +31,14 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -47,7 +46,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
@@ -65,102 +64,86 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+ TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
- auto n_ceiled = Ceil(n, db_["NWG"]);
+ auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
- try {
-
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
- // Determines whether or not temporary matrices are needed
- auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- a_rotated == false;
-
- // Creates the temporary matrices
- auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
-
- // Events of all kernels (including pre/post processing kernels)
- auto eventWaitList = std::vector<Event>();
- auto emptyEventList = std::vector<Event>();
-
- // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
- // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
- // case nothing has to be done, these kernels can be skipped.
- if (!a_no_temp) {
- auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
- a_one, a_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- ConstantOne<T>(), program,
- true, a_rotated, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessA);
- }
-
- // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
- // modify the other triangle.
- auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
- n, n, c_ld, c_offset, c_buffer,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
- true, c_rotated, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessC);
-
- // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n_ceiled));
- kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, GetRealArg(alpha));
- kernel.SetArgument(3, GetRealArg(beta));
- kernel.SetArgument(4, a_temp());
- kernel.SetArgument(5, a_temp());
- kernel.SetArgument(6, c_temp());
-
- // Computes the global and local thread sizes
- auto global = std::vector<size_t>{
- (n_ceiled * db_["MDIMC"]) / db_["MWG"],
- (n_ceiled * db_["NDIMC"]) / db_["NWG"]
- };
- auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
- // Launches the kernel
- auto eventKernel = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel);
-
- // Runs the post-processing kernel
- auto upper = (triangle == Triangle::kUpper);
- auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
- false, c_rotated, false, upper, lower, false);
- if (ErrorIn(status)) { return status; }
-
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Determines whether or not temporary matrices are needed
+ auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ a_rotated == false;
+
+ // Creates the temporary matrices
+ auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
+ // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+ // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+ // case nothing has to be done, these kernels can be skipped.
+ if (!a_no_temp) {
+ auto eventProcessA = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+ ConstantOne<T>(), program,
+ true, a_rotated, false);
+ eventWaitList.push_back(eventProcessA);
+ }
+
+ // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+ // modify the other triangle.
+ auto eventProcessC = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
+ eventWaitList.push_back(eventProcessC);
+
+ // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n_ceiled));
+ kernel.SetArgument(1, static_cast<int>(k_ceiled));
+ kernel.SetArgument(2, GetRealArg(alpha));
+ kernel.SetArgument(3, GetRealArg(beta));
+ kernel.SetArgument(4, a_temp());
+ kernel.SetArgument(5, a_temp());
+ kernel.SetArgument(6, c_temp());
+
+ // Computes the global and local thread sizes
+ auto global = std::vector<size_t>{
+ (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+ (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+ };
+ auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ auto eventKernel = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel);
+
+ // Runs the post-processing kernel
+ auto upper = (triangle == Triangle::kUpper);
+ auto lower = (triangle == Triangle::kLower);
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ n, n, c_ld, c_offset, c_buffer,
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, false);
}
// =================================================================================================
diff --git a/src/routines/level3/xsyrk.hpp b/src/routines/level3/xsyrk.hpp
index 7c075c26..de42b824 100644
--- a/src/routines/level3/xsyrk.hpp
+++ b/src/routines/level3/xsyrk.hpp
@@ -32,12 +32,12 @@ class Xsyrk: public Routine {
Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
// Templated-precision implementation of the routine
- StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp
index 74a82822..6bf77cfa 100644
--- a/src/routines/level3/xtrmm.cpp
+++ b/src/routines/level3/xtrmm.cpp
@@ -29,7 +29,7 @@ Xtrmm<T>::Xtrmm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
@@ -37,15 +37,14 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
// Makes sure all dimensions are larger than zero
- if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+ if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes the k dimension. This is based on whether or not matrix is A (on the left)
// or B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the triangular A matrix
- auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(k, k, a_buffer, a_offset, a_ld);
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
@@ -57,74 +56,69 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
// Temporary buffer for a copy of the triangular matrix
- try {
- auto temp_triangular = Buffer<T>(context_, k*k);
-
- // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
- // routine afterwards
+ auto temp_triangular = Buffer<T>(context_, k*k);
+
+ // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
+ // routine afterwards
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the arguments for the triangular-to-squared kernel
+ kernel.SetArgument(0, static_cast<int>(k));
+ kernel.SetArgument(1, static_cast<int>(a_ld));
+ kernel.SetArgument(2, static_cast<int>(a_offset));
+ kernel.SetArgument(3, a_buffer());
+ kernel.SetArgument(4, static_cast<int>(k));
+ kernel.SetArgument(5, static_cast<int>(k));
+ kernel.SetArgument(6, static_cast<int>(0));
+ kernel.SetArgument(7, temp_triangular());
+ kernel.SetArgument(8, static_cast<int>(unit_diagonal));
+
+ // Uses the common padding kernel's thread configuration. This is allowed, since the
+ // triangular-to-squared kernel uses the same parameters.
+ auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+ Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+ auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
+
+ // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+ kernelEvent.WaitForCompletion();
+
+ // Runs the regular Xgemm code with either "B := alpha*A*B" or ...
+ if (side == Side::kLeft) {
+ DoGemm(layout, a_transpose, Transpose::kNo,
+ m, n, k,
+ alpha,
+ temp_triangular, 0, k,
+ b_buffer, b_offset, b_ld,
+ static_cast<T>(0.0),
+ b_buffer, b_offset, b_ld);
+ }
+
+ // ... with "B := alpha*B*A". Note that A and B are now reversed.
+ else {
try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the arguments for the triangular-to-squared kernel
- kernel.SetArgument(0, static_cast<int>(k));
- kernel.SetArgument(1, static_cast<int>(a_ld));
- kernel.SetArgument(2, static_cast<int>(a_offset));
- kernel.SetArgument(3, a_buffer());
- kernel.SetArgument(4, static_cast<int>(k));
- kernel.SetArgument(5, static_cast<int>(k));
- kernel.SetArgument(6, static_cast<int>(0));
- kernel.SetArgument(7, temp_triangular());
- kernel.SetArgument(8, static_cast<int>(unit_diagonal));
-
- // Uses the common padding kernel's thread configuration. This is allowed, since the
- // triangular-to-squared kernel uses the same parameters.
- auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
- Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
- auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
-
- // Synchronize now: 'DoGemm' does not accept a list of events to wait for
- kernelEvent.WaitForCompletion();
-
- // Runs the regular Xgemm code with either "B := alpha*A*B" or ...
- if (side == Side::kLeft) {
- status = DoGemm(layout, a_transpose, Transpose::kNo,
- m, n, k,
- alpha,
- temp_triangular, 0, k,
- b_buffer, b_offset, b_ld,
- static_cast<T>(0.0),
- b_buffer, b_offset, b_ld);
- }
-
- // ... with "B := alpha*B*A". Note that A and B are now reversed.
- else {
- status = DoGemm(layout, Transpose::kNo, a_transpose,
- m, n, k,
- alpha,
- b_buffer, b_offset, b_ld,
- temp_triangular, 0, k,
- static_cast<T>(0.0),
- b_buffer, b_offset, b_ld);
-
- // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
- switch(status) {
- case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
- case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
- case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
- case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
- case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
- case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
- }
+ DoGemm(layout, Transpose::kNo, a_transpose,
+ m, n, k,
+ alpha,
+ b_buffer, b_offset, b_ld,
+ temp_triangular, 0, k,
+ static_cast<T>(0.0),
+ b_buffer, b_offset, b_ld);
+ } catch (BLASError &e) {
+ // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+ switch(e.status()) {
+ case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details());
+ case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details());
+ case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details());
+ case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details());
+ case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details());
+ case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details());
+ default: throw;
}
-
- // Return the status of the Xgemm routine
- return status;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ }
+ }
}
// =================================================================================================
diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp
index 186a120e..967bf132 100644
--- a/src/routines/level3/xtrmm.hpp
+++ b/src/routines/level3/xtrmm.hpp
@@ -38,12 +38,12 @@ class Xtrmm: public Xgemm<T> {
Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM");
// Templated-precision implementation of the routine
- StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+ void DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
};
// =================================================================================================
diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp
index af9080af..875ca7d2 100644
--- a/src/routines/levelx/xomatcopy.cpp
+++ b/src/routines/levelx/xomatcopy.cpp
@@ -22,27 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
#include "../../kernels/level3/transpose_fast.opencl"
#include "../../kernels/level3/transpose_pad.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+void Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
// Makes sure all dimensions are larger than zero
- if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+ if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Determines whether to transpose the matrix A
const auto transpose = (a_transpose != Transpose::kNo);
@@ -63,22 +62,17 @@ StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_trans
// Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than M when not-rotated
// matrix B cannot be less than M when rotated, or less than N when not-rotated
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+ TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto emptyEventList = std::vector<Event>();
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList,
- a_one, a_two, a_ld, a_offset, a_buffer,
- b_one, b_two, b_ld, b_offset, b_buffer,
- alpha, program, false, transpose, conjugate);
- if (ErrorIn(status)) { return status; }
-
- return StatusCode::kSuccess;
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ b_one, b_two, b_ld, b_offset, b_buffer,
+ alpha, program, false, transpose, conjugate);
}
// =================================================================================================
diff --git a/src/routines/levelx/xomatcopy.hpp b/src/routines/levelx/xomatcopy.hpp
index 0e580230..2da66693 100644
--- a/src/routines/levelx/xomatcopy.hpp
+++ b/src/routines/levelx/xomatcopy.hpp
@@ -28,10 +28,10 @@ class Xomatcopy: public Routine {
Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY");
// Templated-precision implementation of the routine
- StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+ void DoOmatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
};
// =================================================================================================
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index c57aab39..1651d299 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -14,7 +14,7 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 9486ee8d..5be58369 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -14,7 +14,7 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 2d9d5e49..01d9b46a 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -14,7 +14,7 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index d364dabe..4e830faa 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -14,7 +14,7 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 403ee9e4..44538b4b 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -14,7 +14,7 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp
index f8416761..2ba7d91d 100644
--- a/src/tuning/kernels/xdot.cpp
+++ b/src/tuning/kernels/xdot.cpp
@@ -15,7 +15,7 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 0eb1875b..dc9040b0 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -16,7 +16,7 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp
index 204e0be4..b1d5fbc4 100644
--- a/src/tuning/kernels/xgemm_direct.cpp
+++ b/src/tuning/kernels/xgemm_direct.cpp
@@ -16,7 +16,7 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index f332f52a..1ae641a7 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -17,7 +17,7 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index c3d0c7dd..8a3ede98 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -14,7 +14,7 @@
#include <string>
#include <vector>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index afb092bc..c4ee0da0 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -20,7 +20,7 @@
#include <cltune.h>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp
new file mode 100644
index 00000000..9a23e0b7
--- /dev/null
+++ b/src/utilities/buffer_test.hpp
@@ -0,0 +1,113 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
+// templated and thus header-only.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_BUFFER_TEST_H_
+#define CLBLAST_BUFFER_TEST_H_
+
+#include "clblast.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Tests matrix 'A' for validity: throws a BLASError if 'ld' < 'one', if the buffer is smaller than required, or if the size query fails
+template <typename T>
+void TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
+ const size_t offset, const size_t ld) {
+ if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); } // leading dimension must cover 'one'
+ try {
+ const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); // NOTE(review): 'two - 1' wraps for two == 0 (size_t); presumably callers reject zero sizes first - confirm
+ if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); } // buffer too small
+ } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixA, e.what()); } // rethrown as an invalid-matrix error, keeping the original message
+}
+
+// Tests matrix 'B' for validity: throws a BLASError if 'ld' < 'one', if the buffer is smaller than required, or if the size query fails
+template <typename T>
+void TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
+ const size_t offset, const size_t ld) {
+ if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); } // leading dimension must cover 'one'
+ try {
+ const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); // NOTE(review): 'two - 1' wraps for two == 0 (size_t); presumably callers reject zero sizes first - confirm
+ if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryB); } // buffer too small
+ } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixB, e.what()); } // rethrown as an invalid-matrix error, keeping the original message
+}
+
+// Tests matrix 'C' for validity: throws a BLASError if 'ld' < 'one', if the buffer is smaller than required, or if the size query fails
+template <typename T>
+void TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
+ const size_t offset, const size_t ld) {
+ if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimC); } // leading dimension must cover 'one'
+ try {
+ const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); // NOTE(review): 'two - 1' wraps for two == 0 (size_t); presumably callers reject zero sizes first - confirm
+ if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryC); } // buffer too small
+ } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixC, e.what()); } // rethrown as an invalid-matrix error, keeping the original message
+}
+
+// Tests matrix 'AP' for validity: only the buffer size is checked (no leading dimension); failures use the matrix 'A' status codes
+template <typename T>
+void TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+ try {
+ const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T); // n*(n+1)/2 elements plus the offset, scaled by sizeof(T)
+ if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); } // buffer too small
+ } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixA, e.what()); } // rethrown as an invalid-matrix error, keeping the original message
+}
+
+// =================================================================================================
+
+// Tests vector 'X' for validity: throws a BLASError if 'inc' is zero, if the buffer is smaller than required, or if the size query fails
+template <typename T>
+void TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
+ const size_t inc) {
+ if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementX); } // a zero increment is rejected
+ try {
+ const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); // NOTE(review): 'n - 1' wraps for n == 0 (size_t); presumably callers reject n == 0 first - confirm
+ if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryX); } // buffer too small
+ } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorX, e.what()); } // rethrown as an invalid-vector error, keeping the original message
+}
+
+// Tests vector 'Y' for validity: throws a BLASError if 'inc' is zero, if the buffer is smaller than required, or if the size query fails
+template <typename T>
+void TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
+ const size_t inc) {
+ if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementY); } // a zero increment is rejected
+ try {
+ const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); // NOTE(review): 'n - 1' wraps for n == 0 (size_t); presumably callers reject n == 0 first - confirm
+ if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryY); } // buffer too small
+ } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorY, e.what()); } // rethrown as an invalid-vector error, keeping the original message
+}
+
+// =================================================================================================
+
+// Tests vector 'scalar' for validity: throws a BLASError if the buffer holds fewer than 'n + offset' elements or if the size query fails
+template <typename T>
+void TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+ try {
+ const auto required_size = (n + offset) * sizeof(T); // 'n' elements after the offset, scaled by sizeof(T)
+ if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); } // buffer too small
+ } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorScalar, e.what()); } // rethrown as an invalid-vector error, keeping the original message
+}
+
+// Tests vector 'index' for validity: same size check as TestVectorScalar, and failures are reported with the same 'Scalar' status codes
+template <typename T>
+void TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+ try {
+ const auto required_size = (n + offset) * sizeof(T); // 'n' elements after the offset, scaled by sizeof(T)
+ if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); } // NOTE(review): 'Scalar' code used for the index vector - confirm this is intended
+ } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorScalar, e.what()); } // rethrown with the 'Scalar' invalid-vector code, keeping the original message
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_BUFFER_TEST_H_
+#endif
diff --git a/src/utilities/clblast_exceptions.cpp b/src/utilities/clblast_exceptions.cpp
new file mode 100644
index 00000000..96f10860
--- /dev/null
+++ b/src/utilities/clblast_exceptions.cpp
@@ -0,0 +1,95 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Ivan Shapovalov <intelfx@intelfx.name>
+//
+// This file implements the exception hierarchy for CLBlast. It contains classes for exceptions
+// generated by different parts of CLBlast (e.g. OpenCL API calls, internal logic, semantic BLAS
+// errors).
+//
+// =================================================================================================
+
+#include "utilities/clblast_exceptions.hpp"
+
+namespace {
+// =================================================================================================
+
+std::string MakeReason(const std::string &reason, const std::string &subreason) { // "reason (subreason)"
+ std::string r = reason; // start from a copy of the main reason
+ if (!subreason.empty()) { // the parenthesised suffix is only appended for a non-empty subreason
+ r += " (" + subreason + ")";
+ }
+ return r;
+}
+
+} // anonymous namespace
+
+namespace clblast {
+// =================================================================================================
+
+BLASError::BLASError(StatusCode status, const std::string &subreason): // forwards to the ErrorCode base
+ ErrorCode(status,
+ subreason,
+ "BLAS error: " + MakeReason(std::to_string(static_cast<int>(status)), subreason)) { // "BLAS error: <int>[ (sub)]"
+}
+
+RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreason): // forwards to ErrorCode
+ ErrorCode(status,
+ subreason,
+ MakeReason(std::to_string(static_cast<int>(status)), subreason)) { // text: "<int>[ (sub)]", no prefix
+}
+
+// =================================================================================================
+
+StatusCode DispatchException() // maps the exception currently being handled to a StatusCode
+{
+ const char *message = nullptr; // stays null when nothing should be printed
+ StatusCode status; // assigned by every handler below; any other exception type propagates out
+
+ try {
+ throw; // rethrows the active exception, so this must be called from inside a catch block
+ } catch (BLASError &e) {
+ // no message is printed for invalid argument errors
+ status = e.status();
+ } catch (CLError &e) {
+ message = e.what();
+ status = static_cast<StatusCode>(e.status()); // NOTE(review): assumes CLError codes map onto StatusCode - confirm
+ } catch (RuntimeErrorCode &e) {
+ message = e.what();
+ status = e.status();
+ } catch (Error<std::runtime_error> &e) {
+ message = e.what();
+ status = StatusCode::kUnknownError; // generic runtime errors are reported as kUnknownError
+ }
+
+ if (message) { // only the handlers that stored a message produce output on stderr
+ fprintf(stderr, "CLBlast: %s\n", message);
+ }
+ return status;
+}
+
+// =================================================================================================
+
+StatusCode DispatchExceptionForC() // catch-all: logs the active exception and returns kUnexpectedError
+{
+ const char *message = nullptr;
+
+ try {
+ throw; // rethrows the active exception, so this must be called from inside a catch block
+ } catch (std::exception &e) {
+ message = e.what();
+ } catch (...) {
+ message = "unknown exception"; // fallback text for anything not derived from std::exception
+ }
+
+ fprintf (stderr, "CLBlast (unexpected): %s\n", message); // always prints, unlike DispatchException
+ return StatusCode::kUnexpectedError; // the same status is returned whatever was caught
+}
+
+// =================================================================================================
+
+} // namespace clblast
diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp
new file mode 100644
index 00000000..f3c7b9a3
--- /dev/null
+++ b/src/utilities/clblast_exceptions.hpp
@@ -0,0 +1,50 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Ivan Shapovalov <intelfx@intelfx.name>
+//
+// This file implements the exception hierarchy for CLBlast. It contains classes for exceptions
+// generated by different parts of CLBlast (e.g. OpenCL API calls, internal logic, semantic BLAS
+// errors).
+//
+// =================================================================================================
+
+#ifndef CLBLAST_EXCEPTIONS_H_
+#define CLBLAST_EXCEPTIONS_H_
+
+#include "clblast.h"
+#include "clpp11.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Represents a semantic error in BLAS function arguments (based on Error<std::invalid_argument>)
+class BLASError : public ErrorCode<Error<std::invalid_argument>, StatusCode> {
+ public:
+ explicit BLASError(StatusCode status, const std::string &subreason = std::string{}); // subreason is optional
+};
+// =================================================================================================
+
+// Represents a runtime error generated by internal logic (a RuntimeError carrying a StatusCode)
+class RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> {
+ public:
+ explicit RuntimeErrorCode(StatusCode status, const std::string &subreason = std::string{}); // subreason is optional
+};
+
+// =================================================================================================
+
+// Handles (most of the) runtime exceptions and converts them to StatusCode; call from a catch block
+StatusCode DispatchException();
+
+// Handles remaining exceptions and converts them to StatusCode::kUnexpectedError
+StatusCode DispatchExceptionForC();
+
+// =================================================================================================
+
+} // namespace clblast
+
+#endif // CLBLAST_EXCEPTIONS_H_
diff --git a/src/msvc.hpp b/src/utilities/msvc.hpp
index a45105df..a45105df 100644
--- a/src/msvc.hpp
+++ b/src/utilities/msvc.hpp
diff --git a/src/utilities.cpp b/src/utilities/utilities.cpp
index 86cc2d13..b4a18311 100644
--- a/src/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
#include <string>
#include <vector>
diff --git a/src/utilities.hpp b/src/utilities/utilities.hpp
index 038a8a96..9bc7401a 100644
--- a/src/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -24,8 +24,8 @@
#include "clblast.h"
#include "clblast_half.h"
#include "clpp11.hpp"
-
-#include "msvc.hpp"
+#include "utilities/clblast_exceptions.hpp"
+#include "utilities/msvc.hpp"
namespace clblast {
// =================================================================================================
@@ -207,11 +207,6 @@ bool CheckArgument(const int argc, char *argv[], std::string &help, const std::s
// =================================================================================================
-// Helper function to check for errors in the status code
-inline bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
-
-// =================================================================================================
-
// Returns a random number to be used as a seed
unsigned int GetRandomSeed();
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index fc908b9e..ce3b0e07 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -19,7 +19,24 @@
namespace clblast {
// =================================================================================================
-// The transpose-options to test with (data-type dependent)
+// Test settings for the regular test. Append to these lists in case more tests are required.
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVectorDims = { 7, 93, 4096 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kIncrements = { 1, 2, 7 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixDims = { 7, 64 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixVectorDims = { 61, 512 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kBandSizes = { 4, 19 };
+
+// Test settings for the invalid tests (sizes are derived from kBufferSize)
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kInvalidIncrements = { 0, 1 };
+template <typename T, typename U> const size_t TestBlas<T,U>::kBufferSize = 64;
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize}; // zero, one below, exactly kBufferSize squared
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVecSizes = {0, kBufferSize - 1, kBufferSize}; // zero, one below, exactly kBufferSize
+
+// The layout/triangle/side/diagonal options to test with
+template <typename T, typename U> const std::vector<Layout> TestBlas<T,U>::kLayouts = {Layout::kRowMajor, Layout::kColMajor};
+template <typename T, typename U> const std::vector<Triangle> TestBlas<T,U>::kTriangles = {Triangle::kUpper, Triangle::kLower};
+template <typename T, typename U> const std::vector<Side> TestBlas<T,U>::kSides = {Side::kLeft, Side::kRight};
+template <typename T, typename U> const std::vector<Diagonal> TestBlas<T,U>::kDiagonals = {Diagonal::kUnit, Diagonal::kNonUnit};
template <> const std::vector<Transpose> TestBlas<half,half>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> TestBlas<float,float>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> TestBlas<double,double>::kTransposes = {Transpose::kNo, Transpose::kYes};
@@ -39,6 +56,9 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
const ResultGet get_result, const ResultIndex get_index,
const ResultIterator get_id1, const ResultIterator get_id2):
Tester<T,U>(argc, argv, silent, name, options),
+ kOffsets(GetOffsets()),
+ kAlphaValues(GetExampleScalars<U>(full_test_)),
+ kBetaValues(GetExampleScalars<U>(full_test_)),
run_routine_(run_routine),
get_result_(get_result),
get_index_(get_index),
diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp
index 4b773801..da572e01 100644
--- a/test/correctness/testblas.hpp
+++ b/test/correctness/testblas.hpp
@@ -51,26 +51,26 @@ class TestBlas: public Tester<T,U> {
using Tester<T,U>::GetSizesString;
// Test settings for the regular test. Append to these lists in case more tests are required.
- const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
- const std::vector<size_t> kIncrements = { 1, 2, 7 };
- const std::vector<size_t> kMatrixDims = { 7, 64 };
- const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
- const std::vector<size_t> kBandSizes = { 4, 19 };
- const std::vector<size_t> kOffsets = GetOffsets();
- const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
- const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
+ static const std::vector<size_t> kVectorDims;
+ static const std::vector<size_t> kIncrements;
+ static const std::vector<size_t> kMatrixDims;
+ static const std::vector<size_t> kMatrixVectorDims;
+ static const std::vector<size_t> kBandSizes;
+ const std::vector<size_t> kOffsets;
+ const std::vector<U> kAlphaValues;
+ const std::vector<U> kBetaValues;
// Test settings for the invalid tests
- const std::vector<size_t> kInvalidIncrements = { 0, 1 };
- const size_t kBufferSize = 64;
- const std::vector<size_t> kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
- const std::vector<size_t> kVecSizes = {0, kBufferSize - 1, kBufferSize};
+ static const std::vector<size_t> kInvalidIncrements;
+ static const size_t kBufferSize;
+ static const std::vector<size_t> kMatSizes;
+ static const std::vector<size_t> kVecSizes;
// The layout/transpose/triangle options to test with
- const std::vector<Layout> kLayouts = {Layout::kRowMajor, Layout::kColMajor};
- const std::vector<Triangle> kTriangles = {Triangle::kUpper, Triangle::kLower};
- const std::vector<Side> kSides = {Side::kLeft, Side::kRight};
- const std::vector<Diagonal> kDiagonals = {Diagonal::kUnit, Diagonal::kNonUnit};
+ static const std::vector<Layout> kLayouts;
+ static const std::vector<Triangle> kTriangles;
+ static const std::vector<Side> kSides;
+ static const std::vector<Diagonal> kDiagonals;
static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
// Shorthand for the routine-specific functions passed to the tester
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index 41e457b6..5c4435f2 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -22,6 +22,30 @@
namespace clblast {
// =================================================================================================
+// Maximum number of test results printed on a single line
+template <typename T, typename U> const size_t Tester<T,U>::kResultsPerLine = size_t{64};
+
+// Error percentage is not applicable: error was caused by an incorrect status
+template <typename T, typename U> const float Tester<T,U>::kStatusError = -1.0f;
+
+// Constants holding start and end strings for terminal-output in colour (ANSI escape sequences)
+template <typename T, typename U> const std::string Tester<T,U>::kPrintError = "\x1b[31m"; // red
+template <typename T, typename U> const std::string Tester<T,U>::kPrintSuccess = "\x1b[32m"; // green
+template <typename T, typename U> const std::string Tester<T,U>::kPrintWarning = "\x1b[35m"; // magenta
+template <typename T, typename U> const std::string Tester<T,U>::kPrintMessage = "\x1b[1m"; // bold
+template <typename T, typename U> const std::string Tester<T,U>::kPrintEnd = "\x1b[0m"; // reset attributes
+
+// Sets the output error coding. NOTE(review): built from the colour strings above - confirm init order
+template <typename T, typename U> const std::string Tester<T,U>::kSuccessData = kPrintSuccess + ":" + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kSuccessStatus = kPrintSuccess + "." + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kErrorData = kPrintError + "X" + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kErrorStatus = kPrintError + "/" + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kSkippedCompilation = kPrintWarning + "\\" + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kUnsupportedPrecision = kPrintWarning + "o" + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kUnsupportedReference = kPrintWarning + "-" + kPrintEnd;
+
+// =================================================================================================
+
// General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
// the clBLAS library for reference.
template <typename T, typename U>
@@ -41,8 +65,8 @@ Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
print_count_{0},
tests_passed_{0},
tests_skipped_{0},
- tests_failed_{0},
- options_{options} {
+ tests_failed_{0} {
+ options_ = options;
// Determines which reference to test against
#if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS)
@@ -224,7 +248,7 @@ void Tester<T,U>::TestErrorCodes(const StatusCode clblas_status, const StatusCod
}
// Could not compile the CLBlast kernel properly
- else if (clblast_status == StatusCode::kBuildProgramFailure ||
+ else if (clblast_status == StatusCode::kOpenCLBuildProgramFailure ||
clblast_status == StatusCode::kNotImplemented) {
PrintTestResult(kSkippedCompilation);
ReportSkipped();
diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp
index 422da9ed..c7fb4407 100644
--- a/test/correctness/tester.hpp
+++ b/test/correctness/tester.hpp
@@ -28,7 +28,7 @@
#endif
#include "clblast.h"
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
@@ -39,26 +39,26 @@ class Tester {
public:
// Maximum number of test results printed on a single line
- static constexpr auto kResultsPerLine = size_t{64};
+ static const size_t kResultsPerLine;
// Error percentage is not applicable: error was caused by an incorrect status
- static constexpr auto kStatusError = -1.0f;
+ static const float kStatusError;
// Constants holding start and end strings for terminal-output in colour
- const std::string kPrintError{"\x1b[31m"};
- const std::string kPrintSuccess{"\x1b[32m"};
- const std::string kPrintWarning{"\x1b[35m"};
- const std::string kPrintMessage{"\x1b[1m"};
- const std::string kPrintEnd{"\x1b[0m"};
+ static const std::string kPrintError;
+ static const std::string kPrintSuccess;
+ static const std::string kPrintWarning;
+ static const std::string kPrintMessage;
+ static const std::string kPrintEnd;
// Sets the output error coding
- const std::string kSuccessData{kPrintSuccess + ":" + kPrintEnd};
- const std::string kSuccessStatus{kPrintSuccess + "." + kPrintEnd};
- const std::string kErrorData{kPrintError + "X" + kPrintEnd};
- const std::string kErrorStatus{kPrintError + "/" + kPrintEnd};
- const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
- const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
- const std::string kUnsupportedReference{kPrintWarning + "-" + kPrintEnd};
+ static const std::string kSuccessData;
+ static const std::string kSuccessStatus;
+ static const std::string kErrorData;
+ static const std::string kErrorStatus;
+ static const std::string kSkippedCompilation;
+ static const std::string kUnsupportedPrecision;
+ static const std::string kUnsupportedReference;
// This structure combines the above log-entry with a status code an error percentage
struct ErrorLogEntry {
diff --git a/test/performance/client.hpp b/test/performance/client.hpp
index 381ba158..4554c67f 100644
--- a/test/performance/client.hpp
+++ b/test/performance/client.hpp
@@ -31,7 +31,7 @@
#endif
#include "clblast.h"
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
diff --git a/test/wrapper_cblas.hpp b/test/wrapper_cblas.hpp
index 7bc674ab..5f1db54e 100644
--- a/test/wrapper_cblas.hpp
+++ b/test/wrapper_cblas.hpp
@@ -20,7 +20,7 @@ extern "C"
#include <cblas.h>
}
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
namespace clblast {
diff --git a/test/wrapper_clblas.hpp b/test/wrapper_clblas.hpp
index 3f33890a..f1923784 100644
--- a/test/wrapper_clblas.hpp
+++ b/test/wrapper_clblas.hpp
@@ -17,7 +17,7 @@
#include <clBLAS.h>
-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
namespace clblast {