Merge pull request #378 from CNugteren/CLBlast-377-fix-amax-amin

Change amax/amin behaviour
author: Cedric Nugteren <web@cedricnugteren.nl> 2020-03-15 11:34:31 +0100
committer: GitHub <noreply@github.com> 2020-03-15 11:34:31 +0100
commit: 78300ccbeacdfe9688f04639b2e15fb2f24e2a16 (patch)
tree: 6341a093b6026cf8b8732efb324600658121a3a1
parent: e3ce88154ab4a6f9a8efe77f551a4b3d0710bcfb (diff)
parent: 5f97d645053948df6f54bddb11c3fb3bf5486408 (diff)
6 files changed, 116 insertions, 13 deletions
diff --git a/CHANGELOG b/CHANGELOG
index b13c9381..3ceb234e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,5 @@
 Development version (next version)
+- Changed XAMAX/XAMIN to more likely return first rather than last min/max index, updated API docs
 - Various minor fixes and enhancements
 
 Version 1.5.1
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 09d93d88..7b7d45a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -228,7 +228,7 @@ set(PRECISIONS 32 64 3232 6464 16)
 # Sample programs
 if(OPENCL)
   set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched dtrsm tuning_api)
-  set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
+  set(SAMPLE_PROGRAMS_C sasum samax dgemv sgemm haxpy cache)
   if(NETLIB)
     set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
   endif()
diff --git a/doc/api.md b/doc/api.md
index 996505f1..9d4bedfa 100644
--- a/doc/api.md
+++ b/doc/api.md
@@ -511,7 +511,7 @@ Arguments to SUM:
 xAMAX: Index of absolute maximum value in a vector
 -------------
 
-Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.
+Finds the index of a maximum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.
 
 C++ API:
 ```
@@ -562,7 +562,7 @@ Arguments to AMAX:
 xAMIN: Index of absolute minimum value in a vector (non-BLAS function)
 -------------
 
-Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.
+Finds the index of a minimum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.
 
 C++ API:
 ```
@@ -613,7 +613,7 @@ Arguments to AMIN:
 xMAX: Index of maximum value in a vector (non-BLAS function)
 -------------
 
-Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.
+Finds the index of a maximum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.
 
 C++ API:
 ```
@@ -664,7 +664,7 @@ Arguments to MAX:
 xMIN: Index of minimum value in a vector (non-BLAS function)
 -------------
 
-Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.
+Finds the index of a minimum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.
 
 C++ API:
 ```
diff --git a/samples/samax.c b/samples/samax.c
new file mode 100644
index 00000000..36e78846
--- /dev/null
+++ b/samples/samax.c
@@ -0,0 +1,102 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the iSAMAX routine. It is pure C99 and demonstrates the use of
+// the C API to the CLBlast library.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#define CL_TARGET_OPENCL_VERSION 110
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the CLBlast library (C interface)
+#include <clblast_c.h>
+
+// =================================================================================================
+
+// Example use of the single-precision routine iSAMAX
+int main(void) {
+
+  // OpenCL platform/device settings
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Example iSAMAX arguments
+  const size_t n = 1000;
+
+  // Initializes the OpenCL platform
+  cl_uint num_platforms;
+  clGetPlatformIDs(0, NULL, &num_platforms);
+  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+  clGetPlatformIDs(num_platforms, platforms, NULL);
+  cl_platform_id platform = platforms[platform_id];
+
+  // Initializes the OpenCL device
+  cl_uint num_devices;
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+  cl_device_id device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
+  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
+  cl_event event = NULL;
+
+  // Populate host data structures with some example data
+  float* host_input = (float*)malloc(sizeof(float)*n);
+  unsigned int* host_output = (unsigned int*)malloc(sizeof(unsigned int)*1);
+  for (size_t i=0; i<n; ++i) { host_input[i] = (float)(i % 10); } // staircase modulo 10
+  for (size_t i=0; i<1; ++i) { host_output[i] = 77; }  // some temp value to be overwritten later
+
+  // Copy the data-structures to the device
+  cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
+  cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(unsigned int), NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(unsigned int), host_output, 0, NULL, NULL);
+
+  // Call the iSAMAX routine.
+  CLBlastStatusCode status = CLBlastiSamax(n,
+                                           device_output, 0,
+                                           device_input, 0, 1,
+                                           &queue, &event);
+
+  // Wait for completion
+  if (status == CLBlastSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+
+  // Copies the result back to the host
+  clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(unsigned int), host_output, 0, NULL, NULL);
+
+  // Example completed. See "clblast_c.h" for status codes (0 -> success).
+  printf("Completed iSAMAX with status %d: array of %zu values with staircases from 0..9 repeated, max at index %u with value %.0f\n",
+         (int)status, n, host_output[0], host_input[host_output[0]]);  // %zu: size_t n, %u: unsigned int index
+
+  // Clean-up
+  free(platforms);
+  free(devices);
+  free(host_input);
+  free(host_output);
+  clReleaseMemObject(device_input);
+  clReleaseMemObject(device_output);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 76c5dc1c..798cd535 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -132,10 +132,10 @@ ROUTINES = [
   Routine(True,  True,  0, False, "1", "nrm2",  T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["nrm2"],                     [xn,"1"],      [],           "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
   Routine(True,  True,  0, False, "1", "asum",  T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["asum"],                     [xn,"1"],      [],           "n",   "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
   Routine(True,  False, 0, False, "1", "sum",   T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["sum"],                      [xn,"1"],      [],           "n",   "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
-  Routine(True,  True,  0, False, "1", "amax",  T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [xn,"1"],      [],           "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
-  Routine(True,  False, 0, False, "1", "amin",  T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imin"],                     [xn,"1"],      [],           "2*n", "Index of absolute minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.", []),
-  Routine(True,  False, 0, False, "1", "max",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [xn,"1"],      [],           "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
-  Routine(True,  False, 0, False, "1", "min",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imin"],                     [xn,"1"],      [],           "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
+  Routine(True,  True,  0, False, "1", "amax",  T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [xn,"1"],      [],           "2*n", "Index of absolute maximum value in a vector", "Finds the index of a maximum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
+  Routine(True,  False, 0, False, "1", "amin",  T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imin"],                     [xn,"1"],      [],           "2*n", "Index of absolute minimum value in a vector (non-BLAS function)", "Finds the index of a minimum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.", []),
+  Routine(True,  False, 0, False, "1", "max",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [xn,"1"],      [],           "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of a maximum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
+  Routine(True,  False, 0, False, "1", "min",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imin"],                     [xn,"1"],      [],           "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of a minimum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
 ],
 [  # Level 2: matrix-vector
   Routine(True,  True,  0, False, "2a", "gemv",  T,  [S,D,C,Z,H],    ["m","n"],           ["layout","a_transpose"],                               ["a","x"],  ["y"],                        [amn,xmn,ynm], ["alpha","beta"], "",    "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index e65acf17..85cbdc86 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -64,7 +64,7 @@ void Xamax(const int n,
     #else
       x = fabs(x);
     #endif
-    if (x >= max) {
+    if (x > max) {
       max = x;
       imax = id*x_inc + x_offset;
     }
@@ -77,7 +77,7 @@ void Xamax(const int n,
   // Performs reduction in local memory
   for (int s=WGS1/2; s>0; s=s>>1) {
     if (lid < s) {
-      if (maxlm[lid + s] >= maxlm[lid]) {
+      if (maxlm[lid + s] > maxlm[lid]) {
         maxlm[lid] = maxlm[lid + s];
         imaxlm[lid] = imaxlm[lid + s];
       }
@@ -105,7 +105,7 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm,
   const int lid = get_local_id(0);
 
   // Performs the first step of the reduction while loading the data
-  if (maxgm[lid + WGS2] >= maxgm[lid]) {
+  if (maxgm[lid + WGS2] > maxgm[lid]) {
     maxlm[lid] = maxgm[lid + WGS2];
     imaxlm[lid] = imaxgm[lid + WGS2];
   }
@@ -118,7 +118,7 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm,
   // Performs reduction in local memory
   for (int s=WGS2/2; s>0; s=s>>1) {
     if (lid < s) {
-      if (maxlm[lid + s] >= maxlm[lid]) {
+      if (maxlm[lid + s] > maxlm[lid]) {
         maxlm[lid] = maxlm[lid + s];
         imaxlm[lid] = imaxlm[lid + s];
       }
author	Cedric Nugteren <web@cedricnugteren.nl>	2020-03-15 11:34:31 +0100
committer	GitHub <noreply@github.com>	2020-03-15 11:34:31 +0100
commit	78300ccbeacdfe9688f04639b2e15fb2f24e2a16 (patch)
tree	6341a093b6026cf8b8732efb324600658121a3a1
parent	e3ce88154ab4a6f9a8efe77f551a4b3d0710bcfb (diff)
parent	5f97d645053948df6f54bddb11c3fb3bf5486408 (diff)