From b84d2296b87ac212474af855d916b12adf96bdb7 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 1 Apr 2017 13:36:24 +0200 Subject: Separated host-device and device-host memory copies from execution of the CBLAS reference code; for fair timing and code de-duplication --- test/routines/levelx/xaxpybatched.hpp | 13 ++++---- test/routines/levelx/xgemmbatched.hpp | 17 ++++------- test/routines/levelx/xinvert.hpp | 56 ++++++++++++++++------------------- test/routines/levelx/xomatcopy.hpp | 43 +++++++++++++-------------- 4 files changed, 57 insertions(+), 72 deletions(-) (limited to 'test/routines/levelx') diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp index ee15ff92..05141bbb 100644 --- a/test/routines/levelx/xaxpybatched.hpp +++ b/test/routines/levelx/xaxpybatched.hpp @@ -45,6 +45,8 @@ class TestXaxpyBatched { kArgXInc, kArgYInc, kArgBatchCount, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Helper for the sizes per batch static size_t PerBatchSizeX(const Arguments &args) { return args.n * args.x_inc; } @@ -123,17 +125,12 @@ class TestXaxpyBatched { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { cblasXaxpy(args.n, args.alphas[batch], - x_vec_cpu, args.x_offsets[batch], args.x_inc, - y_vec_cpu, args.y_offsets[batch], args.y_inc); + buffers_host.x_vec, args.x_offsets[batch], args.x_inc, + buffers_host.y_vec, args.y_offsets[batch], args.y_inc); } - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/levelx/xgemmbatched.hpp b/test/routines/levelx/xgemmbatched.hpp index 80a30e4d..ab5f20c5 100644 --- a/test/routines/levelx/xgemmbatched.hpp +++ b/test/routines/levelx/xgemmbatched.hpp @@ -45,6 +45,8 @@ class TestXgemmBatched { kArgAOffset, kArgBOffset, kArgCOffset, kArgBatchCount, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector BuffersOut() { return {kBufMatC}; } // Helper for the sizes per batch static size_t PerBatchSizeA(const Arguments &args) { @@ -152,23 +154,16 @@ class TestXgemmBatched { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector b_mat_cpu(args.b_size, static_cast(0)); - std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { cblasXgemm(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), convertToCBLAS(args.b_transpose), args.m, args.n, args.k, args.alphas[batch], - a_mat_cpu, args.a_offsets[batch], args.a_ld, - b_mat_cpu, args.b_offsets[batch], args.b_ld, args.betas[batch], - c_mat_cpu, args.c_offsets[batch], args.c_ld); + buffers_host.a_mat, args.a_offsets[batch], args.a_ld, + buffers_host.b_mat, args.b_offsets[batch], args.b_ld, args.betas[batch], + buffers_host.c_mat, args.c_offsets[batch], args.c_ld); } - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp index b470dbf3..ffb484b0 100644 --- a/test/routines/levelx/xinvert.hpp +++ b/test/routines/levelx/xinvert.hpp @@ -25,17 +25,10 @@ namespace clblast { // ================================================================================================= template -StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &queue) { +StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) || (args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor)); - // Data transfer from OpenCL to std::vector - std::vector a_mat_cpu(args.a_size, T{0.0}); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - - // Creates the output buffer - std::vector b_mat_cpu(args.b_size, T{0.0}); - // Helper variables const auto block_size = args.m; const auto num_blocks = CeilDiv(args.n, block_size); @@ -60,11 +53,11 @@ StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &qu auto a_value = T{1.0}; if (args.diagonal == Diagonal::kNonUnit) { if (i + block_id * block_size < args.n) { - if (a_mat_cpu[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; } - a_value = T{1.0} / a_mat_cpu[i * a_ld + i + a_offset]; + if (buffers_host.a_mat[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; } + a_value = T{1.0} / buffers_host.a_mat[i * a_ld + i + a_offset]; } } - b_mat_cpu[i * b_ld + i + b_offset] = a_value; + buffers_host.b_mat[i * b_ld + i + b_offset] = a_value; } // Inverts the upper triangle row by row @@ -75,11 +68,11 @@ StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &qu for (auto k = i + 1; k <= j; ++k) { auto a_value = T{0.0}; if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) { - a_value = a_mat_cpu[k * a_ld + i + a_offset]; + a_value = buffers_host.a_mat[k * a_ld + i + a_offset]; } - sum += a_value * b_mat_cpu[j * b_ld + k + b_offset]; + sum += a_value * buffers_host.b_mat[j * b_ld + k + b_offset]; } - b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset]; + buffers_host.b_mat[j * b_ld + i + b_offset] = - sum * buffers_host.b_mat[i * b_ld + i + b_offset]; } } } @@ -92,35 +85,32 @@ StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &qu for (auto k = j; k < i; ++k) { auto a_value = T{0.0}; if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) { - a_value = a_mat_cpu[k * a_ld + i + a_offset]; + a_value = buffers_host.a_mat[k * a_ld + i + a_offset]; } - sum += a_value * b_mat_cpu[j * b_ld + k + b_offset]; + sum += a_value * buffers_host.b_mat[j * b_ld + k + b_offset]; } - b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset]; + buffers_host.b_mat[j * b_ld + i + b_offset] = - sum * buffers_host.b_mat[i * b_ld + i + b_offset]; } } } } - - // Data transfer back to OpenCL - buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); return StatusCode::kSuccess; } // Half-precision version calling the above reference implementation after conversions template <> -StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &queue) { - auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue()); - auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue()); - auto dummy = clblast::Buffer(0); - auto buffers2 = Buffers{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; +StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { + auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat); + auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat); + auto dummy = std::vector(0); + auto buffers2 = BuffersHost{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; auto args2 = Arguments(); args2.a_size = args.a_size; args2.b_size = args.b_size; args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n; args2.a_offset = args.a_offset; args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal; - auto status = RunReference(args2, buffers2, queue); - FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue()); + auto status = RunReference(args2, buffers2); + FloatToHalfBuffer(buffers_host.b_mat, b_buffer2); return status; } @@ -140,6 +130,8 @@ class TestXinvert { kArgLayout, kArgTriangle, kArgDiagonal, kArgALeadDim, kArgAOffset}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB}; } + static std::vector BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { @@ -190,11 +182,15 @@ class TestXinvert { // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + auto buffers_host = BuffersHost(); + DeviceToHost(args, buffers, buffers_host, queue, BuffersIn()); + const auto status = RunReference(args, buffers_host); + HostToDevice(args, buffers, buffers_host, queue, BuffersOut()); + return status; } - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { + return RunReference(args, buffers_host); } // Describes how to download the results of the computation (more importantly: which buffer) diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp index d1064d0c..d5973b4c 100644 --- a/test/routines/levelx/xomatcopy.hpp +++ b/test/routines/levelx/xomatcopy.hpp @@ -23,13 +23,7 @@ namespace clblast { // ================================================================================================= template -StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &queue) { - - // Data transfer from OpenCL to std::vector - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector b_mat_cpu(args.b_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); +StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { // Checking for invalid arguments const auto a_rotated = (args.layout == Layout::kRowMajor); @@ -40,8 +34,8 @@ StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &qu if ((args.m == 0) || (args.n == 0)) { return StatusCode::kInvalidDimension; } if ((args.a_ld < args.m && !a_rotated) || (args.a_ld < args.n && a_rotated)) { return StatusCode::kInvalidLeadDimA; } if ((args.b_ld < args.m && !b_rotated) || (args.b_ld < args.n && b_rotated)) { return StatusCode::kInvalidLeadDimB; } - if (buffers.a_mat.GetSize() < (a_base + args.a_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryA; } - if (buffers.b_mat.GetSize() < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; } + if (buffers_host.a_mat.size() * sizeof(T) < (a_base + args.a_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryA; } + if (buffers_host.b_mat.size() * sizeof(T) < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; } // Matrix copy, scaling, and/or transpose for (auto id1 = size_t{0}; id1 < args.m; ++id1) { @@ -52,30 +46,27 @@ StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &qu const auto b_two = (b_rotated) ? id1 : id2; const auto a_index = a_two * args.a_ld + a_one + args.a_offset; const auto b_index = b_two * args.b_ld + b_one + args.b_offset; - b_mat_cpu[b_index] = args.alpha * a_mat_cpu[a_index]; + buffers_host.b_mat[b_index] = args.alpha * buffers_host.a_mat[a_index]; } } - - // Data transfer back to OpenCL - buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); return StatusCode::kSuccess; } // Half-precision version calling the above reference implementation after conversions template <> -StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &queue) { - auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue()); - auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue()); - auto dummy = clblast::Buffer(0); - auto buffers2 = Buffers{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; +StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { + auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat); + auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat); + auto dummy = std::vector(0); + auto buffers2 = BuffersHost{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; auto args2 = Arguments(); args2.a_size = args.a_size; args2.b_size = args.b_size; args2.a_ld = args.a_ld; args2.b_ld = args.b_ld; args2.m = args.m; args2.n = args.n; args2.a_offset = args.a_offset; args2.b_offset = args.b_offset; args2.layout = args.layout; args2.a_transpose = args.a_transpose; args2.alpha = HalfToFloat(args.alpha); - auto status = RunReference(args2, buffers2, queue); - FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue()); + auto status = RunReference(args2, buffers2); + FloatToHalfBuffer(buffers_host.b_mat, b_buffer2); return status; } @@ -97,6 +88,8 @@ class TestXomatcopy { kArgAOffset, kArgBOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB}; } + static std::vector BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { @@ -148,11 +141,15 @@ class TestXomatcopy { // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + auto buffers_host = BuffersHost(); + DeviceToHost(args, buffers, buffers_host, queue, BuffersIn()); + const auto status = RunReference(args, buffers_host); + HostToDevice(args, buffers, buffers_host, queue, BuffersOut()); + return status; } - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { + return RunReference(args, buffers_host); } // Describes how to download the results of the computation (more importantly: which buffer) -- cgit v1.2.3