diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2020-03-15 11:34:31 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-03-15 11:34:31 +0100 |
commit | 78300ccbeacdfe9688f04639b2e15fb2f24e2a16 (patch) | |
tree | 6341a093b6026cf8b8732efb324600658121a3a1 | |
parent | e3ce88154ab4a6f9a8efe77f551a4b3d0710bcfb (diff) | |
parent | 5f97d645053948df6f54bddb11c3fb3bf5486408 (diff) |
Merge pull request #378 from CNugteren/CLBlast-377-fix-amax-amin
Change amax/amin behaviour
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | doc/api.md | 8 | ||||
-rw-r--r-- | samples/samax.c | 102 | ||||
-rwxr-xr-x | scripts/generator/generator.py | 8 | ||||
-rw-r--r-- | src/kernels/level1/xamax.opencl | 8 |
6 files changed, 116 insertions, 13 deletions
@@ -1,4 +1,5 @@ Development version (next version) +- Changed XAMAX/XAMIN to more likely return first rather than last min/max index, updated API docs - Various minor fixes and enhancements Version 1.5.1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 09d93d88..7b7d45a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -228,7 +228,7 @@ set(PRECISIONS 32 64 3232 6464 16) # Sample programs if(OPENCL) set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched dtrsm tuning_api) - set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) + set(SAMPLE_PROGRAMS_C sasum samax dgemv sgemm haxpy cache) if(NETLIB) set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) endif() @@ -511,7 +511,7 @@ Arguments to SUM: xAMAX: Index of absolute maximum value in a vector ------------- -Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. +Finds the index of a maximum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. C++ API: ``` @@ -562,7 +562,7 @@ Arguments to AMAX: xAMIN: Index of absolute minimum value in a vector (non-BLAS function) ------------- -Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. +Finds the index of a minimum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. C++ API: ``` @@ -613,7 +613,7 @@ Arguments to AMIN: xMAX: Index of maximum value in a vector (non-BLAS function) ------------- -Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine. +Finds the index of a maximum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine. C++ API: ``` @@ -664,7 +664,7 @@ Arguments to MAX: xMIN: Index of minimum value in a vector (non-BLAS function) ------------- -Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine. +Finds the index of a minimum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine. C++ API: ``` diff --git a/samples/samax.c b/samples/samax.c new file mode 100644 index 00000000..36e78846 --- /dev/null +++ b/samples/samax.c @@ -0,0 +1,102 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the iSAMAX routine. It is pure C99 and demonstrates the use of +// the C API to the CLBlast library. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#define CL_TARGET_OPENCL_VERSION 110 +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + +// Includes the CLBlast library (C interface) +#include <clblast_c.h> + +// ================================================================================================= + +// Example use of the single-precision routine iSAMAX +int main(void) { + + // OpenCL platform/device settings + const size_t platform_id = 0; + const size_t device_id = 0; + + // Example iSAMAX arguments + const size_t n = 1000; + + // Initializes the OpenCL platform + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + cl_platform_id platform = platforms[platform_id]; + + // Initializes the OpenCL device + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + cl_device_id device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); + cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); + cl_event event = NULL; + + // Populate host data structures with some example data + float* host_input = (float*)malloc(sizeof(float)*n); + unsigned int* host_output = (unsigned int*)malloc(sizeof(unsigned int)*1); + for (size_t i=0; i<n; ++i) { host_input[i] = (float)(i % 10); } // staircase modulo 10 + for (size_t i=0; i<1; ++i) { host_output[i] = 77; } // some temp value to be overwritten later + + // Copy the data-structures to the device + cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL); + cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(unsigned int), NULL, NULL); + clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(unsigned int), host_output, 0, NULL, NULL); + + // Call the iSAMAX routine. + CLBlastStatusCode status = CLBlastiSamax(n, + device_output, 0, + device_input, 0, 1, + &queue, &event); + + // Wait for completion + if (status == CLBlastSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } + + // Copies the result back to the host + clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(unsigned int), host_output, 0, NULL, NULL); + + // Example completed. See "clblast_c.h" for status codes (0 -> success). + printf("Completed iSAMAX with status %d: array of %d values with staircases from 0..9 repeated, max at index %zu with value %.0lf\n", + status, n, host_output[0], host_input[host_output[0]]); + + // Clean-up + free(platforms); + free(devices); + free(host_input); + free(host_output); + clReleaseMemObject(device_input); + clReleaseMemObject(device_output); + clReleaseCommandQueue(queue); + clReleaseContext(context); + return 0; +} + +// ================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 76c5dc1c..798cd535 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -132,10 +132,10 @@ ROUTINES = [ Routine(True, True, 0, False, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), Routine(True, True, 0, False, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), Routine(True, False, 0, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, 0, False, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, 0, False, "1", "amin", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of absolute minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.", []), - Routine(True, False, 0, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, 0, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(True, True, 0, False, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of a maximum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, 0, False, "1", "amin", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of absolute minimum value in a vector (non-BLAS function)", "Finds the index of a minimum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.", []), + Routine(True, False, 0, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of a maximum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, 0, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of a minimum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector Routine(True, True, 0, False, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index e65acf17..85cbdc86 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -64,7 +64,7 @@ void Xamax(const int n, #else x = fabs(x); #endif - if (x >= max) { + if (x > max) { max = x; imax = id*x_inc + x_offset; } @@ -77,7 +77,7 @@ void Xamax(const int n, // Performs reduction in local memory for (int s=WGS1/2; s>0; s=s>>1) { if (lid < s) { - if (maxlm[lid + s] >= maxlm[lid]) { + if (maxlm[lid + s] > maxlm[lid]) { maxlm[lid] = maxlm[lid + s]; imaxlm[lid] = imaxlm[lid + s]; } @@ -105,7 +105,7 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm, const int lid = get_local_id(0); // Performs the first step of the reduction while loading the data - if (maxgm[lid + WGS2] >= maxgm[lid]) { + if (maxgm[lid + WGS2] > maxgm[lid]) { maxlm[lid] = maxgm[lid + WGS2]; imaxlm[lid] = imaxgm[lid + WGS2]; } @@ -118,7 +118,7 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm, // Performs reduction in local memory for (int s=WGS2/2; s>0; s=s>>1) { if (lid < s) { - if (maxlm[lid + s] >= maxlm[lid]) { + if (maxlm[lid + s] > maxlm[lid]) { maxlm[lid] = maxlm[lid + s]; imaxlm[lid] = imaxlm[lid + s]; } |