// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xhad kernel. It contains one fast vectorized version in case of unit
// strides (incx=incy=incz=1) and no offsets (offx=offy=offz=0). Another version is more general,
// but doesn't support vector data-types. Based on the XAXPY kernels.
//
// This kernel uses the level-1 BLAS common tuning parameters.
//
// =================================================================================================

// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(

// =================================================================================================

// A vector-vector multiply function. See also level1.opencl for a vector-scalar version
INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV bvec) {
  #if VW == 1
    Multiply(cvec, aval, bvec);
  #elif VW == 2
    Multiply(cvec.x, aval.x, bvec.x);
    Multiply(cvec.y, aval.y, bvec.y);
  #elif VW == 4
    Multiply(cvec.x, aval.x, bvec.x);
    Multiply(cvec.y, aval.y, bvec.y);
    Multiply(cvec.z, aval.z, bvec.z);
    Multiply(cvec.w, aval.w, bvec.w);
  #elif VW == 8
    Multiply(cvec.s0, aval.s0, bvec.s0);
    Multiply(cvec.s1, aval.s1, bvec.s1);
    Multiply(cvec.s2, aval.s2, bvec.s2);
    Multiply(cvec.s3, aval.s3, bvec.s3);
    Multiply(cvec.s4, aval.s4, bvec.s4);
    Multiply(cvec.s5, aval.s5, bvec.s5);
    Multiply(cvec.s6, aval.s6, bvec.s6);
    Multiply(cvec.s7, aval.s7, bvec.s7);
  #elif VW == 16
    Multiply(cvec.s0, aval.s0, bvec.s0);
    Multiply(cvec.s1, aval.s1, bvec.s1);
    Multiply(cvec.s2, aval.s2, bvec.s2);
    Multiply(cvec.s3, aval.s3, bvec.s3);
    Multiply(cvec.s4, aval.s4, bvec.s4);
    Multiply(cvec.s5, aval.s5, bvec.s5);
    Multiply(cvec.s6, aval.s6, bvec.s6);
    Multiply(cvec.s7, aval.s7, bvec.s7);
    Multiply(cvec.s8, aval.s8, bvec.s8);
    Multiply(cvec.s9, aval.s9, bvec.s9);
    Multiply(cvec.sA, aval.sA, bvec.sA);
    Multiply(cvec.sB, aval.sB, bvec.sB);
    Multiply(cvec.sC, aval.sC, bvec.sC);
    Multiply(cvec.sD, aval.sD, bvec.sD);
    Multiply(cvec.sE, aval.sE, bvec.sE);
    Multiply(cvec.sF, aval.sF, bvec.sF);
  #endif
  return cvec;
}

// =================================================================================================

// Full version of the kernel with offsets and strided accesses
#if RELAX_WORKGROUP_SIZE == 1
  __kernel
#elif
  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
#endif
void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
          const __global real* restrict xgm, const int x_offset, const int x_inc,
          const __global real* restrict ygm, const int y_offset, const int y_inc,
          __global real* zgm, const int z_offset, const int z_inc) {
  const real alpha = GetRealArg(arg_alpha);
  const real beta = GetRealArg(arg_beta);

  // Loops over the work that needs to be done (allows for an arbitrary number of threads)
  for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
    real xvalue = xgm[id*x_inc + x_offset];
    real yvalue = ygm[id*y_inc + y_offset];
    real zvalue = zgm[id*z_inc + z_offset];
    real result;
    real alpha_times_x;
    Multiply(alpha_times_x, alpha, xvalue);
    Multiply(result, alpha_times_x, yvalue);
    MultiplyAdd(result, beta, zvalue);
    zgm[id*z_inc + z_offset] = result;
  }
}

// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
// assumes that 'n' is dividable by 'VW' and 'WPT'.
#if RELAX_WORKGROUP_SIZE == 1
  __kernel
#elif
  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
#endif
void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
                const __global realV* restrict xgm, const __global realV* restrict ygm,
                __global realV* zgm) {
  const real alpha = GetRealArg(arg_alpha);
  const real beta = GetRealArg(arg_beta);

  const int num_desired_threads = n / (VW * WPT);

  if (get_global_id(0) < num_desired_threads) {
    #pragma unroll
    for (int _w = 0; _w < WPT; _w += 1) {
      const int id = _w * num_desired_threads + get_global_id(0);
      realV xvalue = xgm[id];
      realV yvalue = ygm[id];
      realV zvalue = zgm[id];
      realV result;
      realV alpha_times_x;
      alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
      result = MultiplyVectorVector(result, alpha_times_x, yvalue);
      zgm[id] = MultiplyAddVector(result, beta, zvalue);
    }
  }
}

// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
#if RELAX_WORKGROUP_SIZE == 1
  __kernel
#elif
  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
#endif
void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
                 const __global realV* restrict xgm, const __global realV* restrict ygm,
                 __global realV* zgm) {
  const real alpha = GetRealArg(arg_alpha);
  const real beta = GetRealArg(arg_beta);

  #pragma unroll
  for (int _w = 0; _w < WPT; _w += 1) {
    const int id = _w*get_global_size(0) + get_global_id(0);
    realV xvalue = xgm[id];
    realV yvalue = ygm[id];
    realV zvalue = zgm[id];
    realV result;
    realV alpha_times_x;
    alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
    result = MultiplyVectorVector(result, alpha_times_x, yvalue);
    zgm[id] = MultiplyAddVector(result, beta, zvalue);
  }
}


// =================================================================================================

// End of the C++11 raw string literal
)"

// =================================================================================================