diff options
Diffstat (limited to 'external/clBLAS/src/client/clfunc_xgemm.hpp')
-rw-r--r-- | external/clBLAS/src/client/clfunc_xgemm.hpp | 1092 |
1 files changed, 0 insertions, 1092 deletions
diff --git a/external/clBLAS/src/client/clfunc_xgemm.hpp b/external/clBLAS/src/client/clfunc_xgemm.hpp deleted file mode 100644 index fcd40a79..00000000 --- a/external/clBLAS/src/client/clfunc_xgemm.hpp +++ /dev/null @@ -1,1092 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - - -// $Id - -#ifndef CLBLAS_BENCHMARK_XGEMM_HXX__ -#define CLBLAS_BENCHMARK_XGEMM_HXX__ - -#include "clfunc_common.hpp" - -template <typename T> -struct xGemmBuffer -{ - clblasOrder order_; - size_t m_; - size_t n_; - size_t k_; - size_t lda_; - size_t ldb_; - size_t ldc_; - size_t offA_; - size_t offB_; - size_t offC_; - size_t a_num_vectors_; - size_t b_num_vectors_; - size_t c_num_vectors_; - clblasTranspose trans_a_; - clblasTranspose trans_b_; - T* a_; - T* b_; - T* c_; - cl_mem buf_a_; - cl_mem buf_b_; - cl_mem buf_c_; - T alpha_; - T beta_; -}; // struct buffer - -template <typename T> -class xGemm : public clblasFunc -{ -public: - xGemm(StatisticalTimer& timer, cl_device_type devType) : - clblasFunc(timer, devType) - { - timer.getUniqueID("clGemm", 0); - } - - ~xGemm() - { - } - - void call_func() - { - timer.Start(timer_id); - xGemm_Function(true); - timer.Stop(timer_id); - } - - double gflops() - { - return (2.0*buffer_.m_*buffer_.n_*buffer_.k_)/time_in_ns(); - } - - std::string gflops_formula() - { - return "2.0*M*N*K/time"; - } - - void setup_buffer(int order_option, int side_option, int uplo_option, - int diag_option, int transA_option, int transB_option, - size_t M, size_t N, size_t K, size_t lda, size_t ldb, - size_t ldc, size_t offA, size_t offBX, size_t offCY, - double alpha, double beta) - { - DUMMY_ARGS_USAGE_3(side_option, uplo_option, diag_option); - - initialize_scalars(alpha, beta); - - buffer_.m_ = M; - buffer_.n_ = N; - buffer_.k_ = K; - buffer_.offA_ = offA; - buffer_.offB_ = offBX; - buffer_.offC_ = offCY; - - if (order_option == 0) - { - order_ = clblasRowMajor; - if (transA_option == 0) - { - buffer_.trans_a_ = clblasNoTrans; - buffer_.a_num_vectors_ = M; - if (lda == 0) - { - buffer_.lda_ = K; - } - else if (lda < K) - { - std::cerr << "lda:wrong size\n"; - exit(1); - } - else - { - buffer_.lda_ = lda; - } - } - else - { - buffer_.a_num_vectors_ = K; - if (transA_option == 1) - { - buffer_.trans_a_ = clblasTrans; - } - else if (transA_option == 2) - { - buffer_.trans_a_ = clblasConjTrans; - } - if (lda == 0) - { - buffer_.lda_ = M; - } - else if (lda < M) - { - std::cerr << "lda:wrong size\n"; - exit(1); - } - else - { - buffer_.lda_ = lda; - } - } - - if (transB_option == 0) - { - buffer_.b_num_vectors_ = K; - buffer_.trans_b_ = clblasNoTrans; - if (ldb == 0) - { - buffer_.ldb_ = N; - } - else if (ldb < N) - { - std::cerr << "ldb:wrong size\n"; - exit(1); - } - else - { - buffer_.ldb_ = ldb; - } - } - else - { - buffer_.b_num_vectors_ = N; - if (transB_option == 1) - { - buffer_.trans_b_ = clblasTrans; - } - else if (transB_option == 2) - { - buffer_.trans_b_ = clblasConjTrans; - } - - if (ldb == 0) - { - buffer_.ldb_ = K; - } - else if (ldb < K) - { - std::cerr << "ldb:wrong size\n"; - exit(1); - } - else - { - buffer_.ldb_ = ldb; - } - } - - if (ldc == 0) - { - buffer_.ldc_ = N; - } - else if (ldc < N) - { - std::cerr << "ldc:wrong size\n"; - } - else - { - buffer_.ldc_ = ldc; - } - buffer_.c_num_vectors_ = M; - } - else - { - order_ = clblasColumnMajor; - if (transA_option == 0) - { - buffer_.a_num_vectors_ = K; - buffer_.trans_a_ = clblasNoTrans; - if (lda == 0) - { - buffer_.lda_ = M; - } - else if (lda < M) - { - std::cerr << "lda:wrong size\n"; - exit(1); - } - else - { - buffer_.lda_ = lda; - } - } - else - { - buffer_.a_num_vectors_ = M; - if (transA_option == 1) - { - buffer_.trans_a_ = clblasTrans; - } - else if (transA_option == 2) - { - buffer_.trans_a_ = clblasConjTrans; - } - - - if (lda == 0) - { - buffer_.lda_ = K; - } - else if (lda < K) - { - std::cerr << "lda:wrong size\n"; - exit(1); - } - else - { - buffer_.lda_ = lda; - } - } - - if (transB_option == 0) - { - buffer_.b_num_vectors_ = N; - buffer_.trans_b_ = clblasNoTrans; - - if (ldb == 0) - { - buffer_.ldb_ = K; - } - else if (ldb < K) - { - std::cerr << "ldb:wrong size\n"; - exit(1); - } - else - { - buffer_.ldb_ = ldb; - } - } - else - { - buffer_.b_num_vectors_ = K; - if (transB_option == 1) - { - buffer_.trans_b_ = clblasTrans; - } - else if (transB_option == 2) - { - buffer_.trans_b_ = clblasConjTrans; - } - - if (ldb == 0) - { - buffer_.ldb_ = N; - } - else if (ldb < N) - { - std::cerr << "ldb:wrong size\n"; - exit(1); - } - else - { - buffer_.ldb_ = ldb; - } - } - - if (ldc == 0) - { - buffer_.ldc_ = M; - } - else if (ldc < M) - { - std::cerr << "ldc:wrong size\n"; - } - else - { - buffer_.ldc_ = ldc; - } - buffer_.c_num_vectors_ = N; - } - buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; - buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; - buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ]; - - cl_int err; - buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, - (buffer_.lda_*buffer_.a_num_vectors_ + - buffer_.offA_) * sizeof(T), - NULL, &err); - - buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, - (buffer_.ldb_ * buffer_.b_num_vectors_ + - buffer_.offB_) * sizeof(T), - NULL, &err); - - buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, - (buffer_.ldc_ * buffer_.c_num_vectors_ + - buffer_.offC_) * sizeof(T), - NULL, &err); - - } - - void initialize_cpu_buffer() - { - srand(10); - for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) - { - for (size_t j = 0; j < buffer_.lda_; ++j) - { - buffer_.a_[i*buffer_.lda_+j] = random<T>(UPPER_BOUND<T>()) / - randomScale<T>(); - } - } - - for (size_t i = 0; i < buffer_.b_num_vectors_; ++i) - { - for (size_t j = 0; j < buffer_.ldb_; ++j) - { - buffer_.b_[i*buffer_.ldb_+j] = random<T>(UPPER_BOUND<T>()) / - randomScale<T>(); - } - } - - for (size_t i = 0; i < buffer_.c_num_vectors_; ++i) - { - for (size_t j = 0; j < buffer_.ldc_; ++j) - { - buffer_.c_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) / - randomScale<T>(); - } - } - } - - void initialize_gpu_buffer() - { - - cl_int err; - - err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE, - buffer_.offA_ * sizeof(T), - buffer_.lda_ * buffer_.a_num_vectors_ * - sizeof(T), - buffer_.a_, 0, NULL, NULL); - - err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(T), - buffer_.ldb_ * buffer_.b_num_vectors_ * - sizeof(T), - buffer_.b_, 0, NULL, NULL); - - err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), - buffer_.ldc_ * buffer_.c_num_vectors_ * - sizeof(T), - buffer_.c_, 0, NULL, NULL); - - - } - - void reset_gpu_write_buffer() - { - cl_int err; - err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), - buffer_.ldc_ * buffer_.c_num_vectors_ * - sizeof(T), - buffer_.c_, 0, NULL, NULL); - } - - void read_gpu_buffer() - { - cl_int err; - err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * - sizeof(T), - buffer_.c_, 0, NULL, NULL); - } - - void roundtrip_func() - { - timer.Start(timer_id); - cl_int err; - buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, - (buffer_.lda_*buffer_.a_num_vectors_ + - buffer_.offA_) * sizeof(T), - NULL, &err); - - buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, - (buffer_.ldb_ * buffer_.b_num_vectors_ + - buffer_.offB_) * sizeof(T), - NULL, &err); - - buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, - (buffer_.ldc_ * buffer_.c_num_vectors_ + - buffer_.offC_) * sizeof(T), - NULL, &err); - err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE, - buffer_.offA_ * sizeof(T), - buffer_.lda_ * buffer_.a_num_vectors_ * - sizeof(T), - buffer_.a_, 0, NULL, NULL); - - err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(T), - buffer_.ldb_ * buffer_.b_num_vectors_ * - sizeof(T), - buffer_.b_, 0, NULL, NULL); - - err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), - buffer_.ldc_ * buffer_.c_num_vectors_ * - sizeof(T), - buffer_.c_, 0, NULL, NULL); - xGemm_Function(false); - err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * - sizeof(T), - buffer_.c_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); - } - void roundtrip_func_rect() - { - timer.Start(timer_id); - cl_int err; - //rect - size_t a_buffer_origin[3] = {0,0,0}; - size_t a_host_origin[3] = {0,0,0}; - size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1}; - size_t a_buffer_row_pitch=0*sizeof(T);//lda - size_t a_buffer_slice_pitch=0; - size_t a_host_row_pitch=buffer_.lda_*sizeof(T); - size_t a_host_slice_pitch=0; - - size_t b_buffer_origin[3] = {0,0,0}; - size_t b_host_origin[3] = {0,0,0}; - size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1}; - size_t b_buffer_row_pitch=0*sizeof(T);//ldb - size_t b_buffer_slice_pitch=0; - size_t b_host_row_pitch=buffer_.ldb_*sizeof(T); - size_t b_host_slice_pitch=0; - - size_t c_buffer_origin[3] = {0,0,0}; - size_t c_host_origin[3] = {0,0,0}; - size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1}; - size_t c_buffer_row_pitch=0*sizeof(T);//ldc - size_t c_buffer_slice_pitch=0; - size_t c_host_row_pitch=buffer_.ldc_*sizeof(T); - size_t c_host_slice_pitch=0; - - buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, - (buffer_.k_*buffer_.m_ + - buffer_.offA_) * sizeof(T), - NULL, &err); - - buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, - (buffer_.k_ * buffer_.n_ + - buffer_.offB_) * sizeof(T), - NULL, &err); - - buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, - (buffer_.m_ * buffer_.n_ + - buffer_.offC_) * sizeof(T), - NULL, &err); - /* - err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE, - buffer_.offA_ * sizeof(T), - buffer_.lda_ * buffer_.a_num_vectors_ * - sizeof(T), - buffer_.a_, 0, NULL, NULL); - - err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(T), - buffer_.ldb_ * buffer_.b_num_vectors_ * - sizeof(T), - buffer_.b_, 0, NULL, NULL); - - err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), - buffer_.ldc_ * buffer_.c_num_vectors_ * - sizeof(T), - buffer_.c_, 0, NULL, NULL);*/ - err = clEnqueueWriteBufferRect(queue_, buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch, - a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL); - err = clEnqueueWriteBufferRect(queue_, buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch, - b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL); - err = clEnqueueWriteBufferRect(queue_, buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, - c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL); - - if(buffer_.trans_a_==clblasNoTrans) - { - buffer_.lda_=buffer_.m_; - } - else - { - buffer_.lda_=buffer_.k_; - } - if(buffer_.trans_b_==clblasNoTrans) - { - buffer_.ldb_=buffer_.k_; - } - else - { - buffer_.ldb_=buffer_.n_; - } - buffer_.ldc_=buffer_.m_; - xGemm_Function(false); - /* - err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * - sizeof(T), - buffer_.c_, 0, NULL, &event_); - */ - err = ::clEnqueueReadBufferRect(queue_, buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, - c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); - } - void allochostptr_roundtrip_func() - { - timer.Start(timer_id); - - cl_int err; - // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy - buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - (buffer_.lda_*buffer_.a_num_vectors_ + - buffer_.offA_) * sizeof(T), - NULL, &err); - - buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - (buffer_.ldb_ * buffer_.b_num_vectors_ + - buffer_.offB_) * sizeof(T), - NULL, &err); - - buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - (buffer_.ldc_ * buffer_.c_num_vectors_ + - buffer_.offC_) * sizeof(T), - NULL, &err); - - // map the buffers to pointers at host device - T *map_a,*map_b,*map_c; - map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.a_num_vectors_ + - buffer_.offA_) * sizeof(T), - 0, NULL, NULL, &err); - map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.ldb_*buffer_.b_num_vectors_ + - buffer_.offB_) * sizeof(T), - 0, NULL, NULL, &err); - map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + - buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - // memcpy the input A, B, C to the host pointers - memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); - memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - // unmap the buffers - clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL); - clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL); - clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, NULL); - // calling clBLAS - xGemm_Function(false); - // map the C buffer again to read output - map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + - buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, &event_); - clWaitForEvents(1, &event_); - - timer.Stop(timer_id); - } - void usehostptr_roundtrip_func() - { - timer.Start(timer_id); - cl_int err; - buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, - (buffer_.lda_*buffer_.a_num_vectors_ + - buffer_.offA_) * sizeof(T), - buffer_.a_, &err); - - buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, - (buffer_.ldb_ * buffer_.b_num_vectors_ + - buffer_.offB_) * sizeof(T), - buffer_.b_, &err); - - buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, - (buffer_.ldc_ * buffer_.c_num_vectors_ + - buffer_.offC_) * sizeof(T), - buffer_.c_, &err); - xGemm_Function(true); - timer.Stop(timer_id); - } - void copyhostptr_roundtrip_func() - { - timer.Start(timer_id); - cl_int err; - buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, - (buffer_.lda_*buffer_.a_num_vectors_ + - buffer_.offA_) * sizeof(T), - buffer_.a_, &err); - - buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, - (buffer_.ldb_ * buffer_.b_num_vectors_ + - buffer_.offB_) * sizeof(T), - buffer_.b_, &err); - - buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - (buffer_.ldc_ * buffer_.c_num_vectors_ + - buffer_.offC_) * sizeof(T), - buffer_.c_, &err); - xGemm_Function(false); - err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * - sizeof(T), - buffer_.c_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); - } - void usepersismem_roundtrip_func() - { -#if defined(CL_MEM_USE_PERSISTENT_MEM_AMD) - timer.Start(timer_id); - - cl_int err; - - buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD, - (buffer_.lda_*buffer_.a_num_vectors_ + - buffer_.offA_) * sizeof(T), - NULL, &err); - - buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD, - (buffer_.ldb_ * buffer_.b_num_vectors_ + - buffer_.offB_) * sizeof(T), - NULL, &err); - - buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD, - (buffer_.ldc_ * buffer_.c_num_vectors_ + - buffer_.offC_) * sizeof(T), - NULL, &err); - - // map the buffers to pointers at host devices - T *map_a,*map_b,*map_c; - map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.a_num_vectors_ + - buffer_.offA_) * sizeof(T), - 0, NULL, NULL, &err); - map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.ldb_*buffer_.b_num_vectors_ + - buffer_.offB_) * sizeof(T), - 0, NULL, NULL, &err); - map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + - buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - // memcpy the input A, B, C to the host pointers - memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); - memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - // unmap the buffers - clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL); - clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL); - clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, NULL); - // calling clBLAS - xGemm_Function(false); - // map the C buffer again to read output - map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + - buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, &event_); - clWaitForEvents(1, &event_); - - timer.Stop(timer_id); -#else - std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl; -#endif - - } - void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, - int diag_option, int transA_option, int transB_option, - size_t M, size_t N, size_t K, size_t lda, size_t ldb, - size_t ldc, size_t offA, size_t offBX, size_t offCY, - double alpha, double beta) - { - DUMMY_ARGS_USAGE_3(side_option, uplo_option, diag_option); - - initialize_scalars(alpha, beta); - - buffer_.m_ = M; - buffer_.n_ = N; - buffer_.k_ = K; - buffer_.offA_ = offA; - buffer_.offB_ = offBX; - buffer_.offC_ = offCY; - - if (order_option == 0) - { - order_ = clblasRowMajor; - if (transA_option == 0) - { - buffer_.trans_a_ = clblasNoTrans; - buffer_.a_num_vectors_ = M; - if (lda == 0) - { - buffer_.lda_ = K; - } - else if (lda < K) - { - std::cerr << "lda:wrong size\n"; - exit(1); - } - else - { - buffer_.lda_ = lda; - } - } - else - { - buffer_.a_num_vectors_ = K; - if (transA_option == 1) - { - buffer_.trans_a_ = clblasTrans; - } - else if (transA_option == 2) - { - buffer_.trans_a_ = clblasConjTrans; - } - if (lda == 0) - { - buffer_.lda_ = M; - } - else if (lda < M) - { - std::cerr << "lda:wrong size\n"; - exit(1); - } - else - { - buffer_.lda_ = lda; - } - } - - if (transB_option == 0) - { - buffer_.b_num_vectors_ = K; - buffer_.trans_b_ = clblasNoTrans; - if (ldb == 0) - { - buffer_.ldb_ = N; - } - else if (ldb < N) - { - std::cerr << "ldb:wrong size\n"; - exit(1); - } - else - { - buffer_.ldb_ = ldb; - } - } - else - { - buffer_.b_num_vectors_ = N; - if (transB_option == 1) - { - buffer_.trans_b_ = clblasTrans; - } - else if (transB_option == 2) - { - buffer_.trans_b_ = clblasConjTrans; - } - - if (ldb == 0) - { - buffer_.ldb_ = K; - } - else if (ldb < K) - { - std::cerr << "ldb:wrong size\n"; - exit(1); - } - else - { - buffer_.ldb_ = ldb; - } - } - - if (ldc == 0) - { - buffer_.ldc_ = N; - } - else if (ldc < N) - { - std::cerr << "ldc:wrong size\n"; - } - else - { - buffer_.ldc_ = ldc; - } - buffer_.c_num_vectors_ = M; - } - else - { - order_ = clblasColumnMajor; - if (transA_option == 0) - { - buffer_.a_num_vectors_ = K; - buffer_.trans_a_ = clblasNoTrans; - if (lda == 0) - { - buffer_.lda_ = M; - } - else if (lda < M) - { - std::cerr << "lda:wrong size\n"; - exit(1); - } - else - { - buffer_.lda_ = lda; - } - } - else - { - buffer_.a_num_vectors_ = M; - if (transA_option == 1) - { - buffer_.trans_a_ = clblasTrans; - } - else if (transA_option == 2) - { - buffer_.trans_a_ = clblasConjTrans; - } - - - if (lda == 0) - { - buffer_.lda_ = K; - } - else if (lda < K) - { - std::cerr << "lda:wrong size\n"; - exit(1); - } - else - { - buffer_.lda_ = lda; - } - } - - if (transB_option == 0) - { - buffer_.b_num_vectors_ = N; - buffer_.trans_b_ = clblasNoTrans; - - if (ldb == 0) - { - buffer_.ldb_ = K; - } - else if (ldb < K) - { - std::cerr << "ldb:wrong size\n"; - exit(1); - } - else - { - buffer_.ldb_ = ldb; - } - } - else - { - buffer_.b_num_vectors_ = K; - if (transB_option == 1) - { - buffer_.trans_b_ = clblasTrans; - } - else if (transB_option == 2) - { - buffer_.trans_b_ = clblasConjTrans; - } - - if (ldb == 0) - { - buffer_.ldb_ = N; - } - else if (ldb < N) - { - std::cerr << "ldb:wrong size\n"; - exit(1); - } - else - { - buffer_.ldb_ = ldb; - } - } - - if (ldc == 0) - { - buffer_.ldc_ = M; - } - else if (ldc < M) - { - std::cerr << "ldc:wrong size\n"; - } - else - { - buffer_.ldc_ = ldc; - } - buffer_.c_num_vectors_ = N; - } - buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; - buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; - buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ]; - - } - void releaseGPUBuffer_deleteCPUBuffer() - { - //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) - //need to do this before we eventually hit the destructor - delete buffer_.a_; - delete buffer_.b_; - delete buffer_.c_; - OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), - "releasing buffer A"); - OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_), - "releasing buffer B"); - OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_), - "releasing buffer C"); - } - -protected: - void initialize_scalars(double alpha, double beta) - { - buffer_.alpha_ = makeScalar<T>(alpha); - buffer_.beta_ = makeScalar<T>(beta); - } - -private: - xGemmBuffer<T> buffer_; - void xGemm_Function(bool flush); - - -}; // class xgemm - -template<> -void -xGemm<cl_float>:: -xGemm_Function(bool flush) -{ - clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_, - buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, - buffer_.buf_a_, buffer_.offA_, buffer_.lda_, - buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, - buffer_.beta_, buffer_.buf_c_, buffer_.offC_, - buffer_.ldc_, 1, &queue_, 0, NULL, &event_); - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { - clWaitForEvents(1, &event_); - } -} - -template<> -void -xGemm<cl_double>:: -xGemm_Function(bool flush) -{ - clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_, - buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, - buffer_.buf_a_, buffer_.offA_, buffer_.lda_, - buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, - buffer_.beta_, buffer_.buf_c_, buffer_.offC_, - buffer_.ldc_, 1, &queue_, 0, NULL, &event_); - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { - clWaitForEvents(1, &event_); - } -} - -template<> -void -xGemm<cl_float2>:: -xGemm_Function(bool flush) -{ - clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_, - buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, - buffer_.buf_a_, buffer_.offA_, buffer_.lda_, - buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, - buffer_.beta_, buffer_.buf_c_, buffer_.offC_, - buffer_.ldc_, 1, &queue_, 0, NULL, &event_); - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { - clWaitForEvents(1, &event_); - } -} - -template<> -void -xGemm<cl_double2>:: -xGemm_Function(bool flush) -{ - clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_, - buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, - buffer_.buf_a_, buffer_.offA_, buffer_.lda_, - buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, - buffer_.beta_, buffer_.buf_c_, buffer_.offC_, - buffer_.ldc_, 1, &queue_, 0, NULL, &event_); - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { - clWaitForEvents(1, &event_); - } -} - -template<> -double -xGemm<cl_float2>:: -gflops() -{ - return (8.0*buffer_.m_*buffer_.n_*buffer_.k_)/time_in_ns(); -} - -template<> -double -xGemm<cl_double2>:: -gflops() -{ - return (8.0*buffer_.m_*buffer_.n_*buffer_.k_)/time_in_ns(); -} - -template<> -std::string -xGemm<cl_float2>:: -gflops_formula() -{ - return "8.0*M*N*K/time"; -} - -template<> -std::string -xGemm<cl_double2>:: -gflops_formula() -{ - return "8.0*M*N*K/time"; -} - -#endif // ifndef CLBLAS_BENCHMARK_XGEMM_HXX__ |