diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-03-08 20:10:20 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-03-08 20:10:20 +0100 |
commit | fa0a9c689fc21a2a24aeadf82ae0acdf6d8bf831 (patch) | |
tree | 404e85900a4c9038d407addb38798d06bb48868c /test/routines/levelx/xaxpybatched.hpp | |
parent | 6aba0bbae71702c4eebd88d0fe17739b509185c1 (diff) |
Make batched routines based on offsets instead of a vector of cl_mem objects - undoing many earlier changes
Diffstat (limited to 'test/routines/levelx/xaxpybatched.hpp')
-rw-r--r-- | test/routines/levelx/xaxpybatched.hpp | 66 |
1 files changed, 36 insertions, 30 deletions
diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp index 7922359d..8f6a5985 100644 --- a/test/routines/levelx/xaxpybatched.hpp +++ b/test/routines/levelx/xaxpybatched.hpp @@ -51,18 +51,28 @@ class TestXaxpyBatched { return alpha_base + Constant<T>(batch_id); } - // Describes how to obtain the sizes of the buffers (per item, not for the full batch) + // Helper for the sizes per batch + static size_t PerBatchSizeX(const Arguments<T> &args) { return args.n * args.x_inc; } + static size_t PerBatchSizeY(const Arguments<T> &args) { return args.n * args.y_inc; } + + // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { - return args.n * args.x_inc; + return PerBatchSizeX(args) * args.batch_count + args.x_offset; } static size_t GetSizeY(const Arguments<T> &args) { - return args.n * args.y_inc; + return PerBatchSizeY(args) * args.batch_count + args.y_offset; } - // Describes how to set the sizes of all the buffers (per item, not for the full batch) + // Describes how to set the sizes of all the buffers static void SetSizes(Arguments<T> &args) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); + args.x_offsets = std::vector<size_t>(args.batch_count); + args.y_offsets = std::vector<size_t>(args.batch_count); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + args.x_offsets[batch] = batch * PerBatchSizeX(args) + args.x_offset; + args.y_offsets[batch] = batch * PerBatchSizeY(args) + args.y_offset; + } } // Describes what the default values of the leading dimensions of the matrices are @@ -81,20 +91,16 @@ class TestXaxpyBatched { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alphas = std::vector<T>(); - auto x_buffers = std::vector<cl_mem>(); - auto y_buffers = std::vector<cl_mem>(); for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { alphas.push_back(GetAlpha(args.alpha, batch)); - x_buffers.push_back(buffers[batch].x_vec()); - y_buffers.push_back(buffers[batch].y_vec()); } auto status = AxpyBatched(args.n, alphas.data(), - x_buffers.data(), args.x_inc, - y_buffers.data(), args.y_inc, + buffers.x_vec(), args.x_offsets.data(), args.x_inc, + buffers.y_vec(), args.y_offsets.data(), args.y_inc, args.batch_count, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } @@ -103,13 +109,13 @@ class TestXaxpyBatched { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { auto event = cl_event{}; auto status = clblasXaxpy(args.n, GetAlpha(args.alpha, batch), - buffers[batch].x_vec, 0, args.x_inc, - buffers[batch].y_vec, 0, args.y_inc, + buffers.x_vec, args.x_offsets[batch], args.x_inc, + buffers.y_vec, args.y_offsets[batch], args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); if (static_cast<StatusCode>(status) != StatusCode::kSuccess) { @@ -122,41 +128,41 @@ class TestXaxpyBatched { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers[batch].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[batch].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXaxpy(args.n, GetAlpha(args.alpha, batch), - x_vec_cpu, 0, args.x_inc, - y_vec_cpu, 0, args.y_inc); - buffers[batch].y_vec.Write(queue, args.y_size, y_vec_cpu); + x_vec_cpu, args.x_offsets[batch], args.x_inc, + y_vec_cpu, args.y_offsets[batch], args.y_inc); } + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif - // Describes how to download the results of the computation (per item, not for the full batch) + // Describes how to download the results of the computation static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); buffers.y_vec.Read(queue, args.y_size, result); return result; } - // Describes how to compute the indices of the result buffer (per item, not for the full batch) + // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments<T> &args) { return args.n; } - static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine - static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) { - return id1 * args.y_inc; + static size_t ResultID2(const Arguments<T> &args) { return args.batch_count; } + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) { + return (id1 * args.y_inc) + args.y_offsets[id2]; } - // Describes how to compute performance metrics (per item, not for the full batch) + // Describes how to compute performance metrics static size_t GetFlops(const Arguments<T> &args) { - return 2 * args.n; + return args.batch_count * (2 * args.n); } static size_t GetBytes(const Arguments<T> &args) { - return (3 * args.n) * sizeof(T); + return args.batch_count * (3 * args.n) * sizeof(T); } }; |