summaryrefslogtreecommitdiff
path: root/include/clblast_c.h
blob: c779c556e5def41ccaacb4886cffc765766f0ca6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the plain C interface to the CLBlast BLAS routines, the counter-part of the
// normal 'clblast.h' C++ header.
//
// =================================================================================================

#ifndef CLBLAST_CLBLAST_C_H_
#define CLBLAST_CLBLAST_C_H_

// Includes the normal OpenCL C header
#if defined(__APPLE__) || defined(__MACOSX)
  #include <OpenCL/opencl.h>
#else
  #include <CL/opencl.h>
#endif

// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#if defined(_WIN32) && defined(CLBLAST_DLL)
  #if defined(COMPILING_DLL)
    #define PUBLIC_API __declspec(dllexport)
  #else
    #define PUBLIC_API __declspec(dllimport)
  #endif
#else
  #define PUBLIC_API
#endif

// Version numbering (v1.6.0)
#define CLBLAST_VERSION_MAJOR 1
#define CLBLAST_VERSION_MINOR 6
#define CLBLAST_VERSION_PATCH 0

// The C interface
#ifdef __cplusplus
extern "C" {
#endif

// =================================================================================================

// Status codes. These codes can be returned by functions declared in this header file. The error
// codes match either the standard OpenCL error codes or the clBLAS error codes. 
typedef enum CLBlastStatusCode_ {

  // Status codes in common with the OpenCL standard
  CLBlastSuccess                   =   0, // CL_SUCCESS
  CLBlastOpenCLCompilerNotAvailable=  -3, // CL_COMPILER_NOT_AVAILABLE
  CLBlastTempBufferAllocFailure    =  -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
  CLBlastOpenCLOutOfResources      =  -5, // CL_OUT_OF_RESOURCES
  CLBlastOpenCLOutOfHostMemory     =  -6, // CL_OUT_OF_HOST_MEMORY
  CLBlastOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
  CLBlastInvalidValue              = -30, // CL_INVALID_VALUE
  CLBlastInvalidCommandQueue       = -36, // CL_INVALID_COMMAND_QUEUE
  CLBlastInvalidMemObject          = -38, // CL_INVALID_MEM_OBJECT
  CLBlastInvalidBinary             = -42, // CL_INVALID_BINARY
  CLBlastInvalidBuildOptions       = -43, // CL_INVALID_BUILD_OPTIONS
  CLBlastInvalidProgram            = -44, // CL_INVALID_PROGRAM
  CLBlastInvalidProgramExecutable  = -45, // CL_INVALID_PROGRAM_EXECUTABLE
  CLBlastInvalidKernelName         = -46, // CL_INVALID_KERNEL_NAME
  CLBlastInvalidKernelDefinition   = -47, // CL_INVALID_KERNEL_DEFINITION
  CLBlastInvalidKernel             = -48, // CL_INVALID_KERNEL
  CLBlastInvalidArgIndex           = -49, // CL_INVALID_ARG_INDEX
  CLBlastInvalidArgValue           = -50, // CL_INVALID_ARG_VALUE
  CLBlastInvalidArgSize            = -51, // CL_INVALID_ARG_SIZE
  CLBlastInvalidKernelArgs         = -52, // CL_INVALID_KERNEL_ARGS
  CLBlastInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
  CLBlastInvalidLocalThreadsTotal  = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
  CLBlastInvalidLocalThreadsDim    = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
  CLBlastInvalidGlobalOffset       = -56, // CL_INVALID_GLOBAL_OFFSET
  CLBlastInvalidEventWaitList      = -57, // CL_INVALID_EVENT_WAIT_LIST
  CLBlastInvalidEvent              = -58, // CL_INVALID_EVENT
  CLBlastInvalidOperation          = -59, // CL_INVALID_OPERATION
  CLBlastInvalidBufferSize         = -61, // CL_INVALID_BUFFER_SIZE
  CLBlastInvalidGlobalWorkSize     = -63, // CL_INVALID_GLOBAL_WORK_SIZE

  // Status codes in common with the clBLAS library
  CLBlastNotImplemented            = -1024, // Routine or functionality not implemented yet
  CLBlastInvalidMatrixA            = -1022, // Matrix A is not a valid OpenCL buffer
  CLBlastInvalidMatrixB            = -1021, // Matrix B is not a valid OpenCL buffer
  CLBlastInvalidMatrixC            = -1020, // Matrix C is not a valid OpenCL buffer
  CLBlastInvalidVectorX            = -1019, // Vector X is not a valid OpenCL buffer
  CLBlastInvalidVectorY            = -1018, // Vector Y is not a valid OpenCL buffer
  CLBlastInvalidDimension          = -1017, // Dimensions M, N, and K have to be larger than zero
  CLBlastInvalidLeadDimA           = -1016, // LD of A is smaller than the matrix's first dimension
  CLBlastInvalidLeadDimB           = -1015, // LD of B is smaller than the matrix's first dimension
  CLBlastInvalidLeadDimC           = -1014, // LD of C is smaller than the matrix's first dimension
  CLBlastInvalidIncrementX         = -1013, // Increment of vector X cannot be zero
  CLBlastInvalidIncrementY         = -1012, // Increment of vector Y cannot be zero
  CLBlastInsufficientMemoryA       = -1011, // Matrix A's OpenCL buffer is too small
  CLBlastInsufficientMemoryB       = -1010, // Matrix B's OpenCL buffer is too small
  CLBlastInsufficientMemoryC       = -1009, // Matrix C's OpenCL buffer is too small
  CLBlastInsufficientMemoryX       = -1008, // Vector X's OpenCL buffer is too small
  CLBlastInsufficientMemoryY       = -1007, // Vector Y's OpenCL buffer is too small

  // Custom additional status codes for CLBlast
  CLBlastInsufficientMemoryTemp    = -2050, // Temporary buffer provided to GEMM routine is too small
  CLBlastInvalidBatchCount         = -2049, // The batch count needs to be positive
  CLBlastInvalidOverrideKernel     = -2048, // Trying to override parameters for an invalid kernel
  CLBlastMissingOverrideParameter  = -2047, // Missing override parameter(s) for the target kernel
  CLBlastInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
  CLBlastNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
  CLBlastNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
  CLBlastInvalidVectorScalar       = -2043, // The unit-sized vector is not a valid OpenCL buffer
  CLBlastInsufficientMemoryScalar  = -2042, // The unit-sized vector's OpenCL buffer is too small
  CLBlastDatabaseError             = -2041, // Entry for the device was not found in the database
  CLBlastUnknownError              = -2040, // A catch-all error code representing an unspecified error
  CLBlastUnexpectedError           = -2039, // A catch-all error code representing an unexpected exception
} CLBlastStatusCode;

// Matrix layout and transpose types
typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101,
                              CLBlastLayoutColMajor = 102 } CLBlastLayout;
typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112,
                                 CLBlastTransposeConjugate = 113 } CLBlastTranspose;
typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121,
                                CLBlastTriangleLower = 122 } CLBlastTriangle;
typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
                                CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
typedef enum CLBlastKernelMode_ { CLBlastKernelModeCrossCorrelation = 151, CLBlastKernelModeConvolution = 152 } CLBlastKernelMode;

// Precision enum (values in bits)
typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32,
                                 CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232,
                                 CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision;

// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================

// Generate givens plane rotation: SROTG/DROTG
CLBlastStatusCode PUBLIC_API CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset,
                                          cl_mem sb_buffer, const size_t sb_offset,
                                          cl_mem sc_buffer, const size_t sc_offset,
                                          cl_mem ss_buffer, const size_t ss_offset,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
                                          cl_mem sb_buffer, const size_t sb_offset,
                                          cl_mem sc_buffer, const size_t sc_offset,
                                          cl_mem ss_buffer, const size_t ss_offset,
                                          cl_command_queue* queue, cl_event* event);

// Generate modified givens plane rotation: SROTMG/DROTMG
CLBlastStatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                                           cl_mem sd2_buffer, const size_t sd2_offset,
                                           cl_mem sx1_buffer, const size_t sx1_offset,
                                           const cl_mem sy1_buffer, const size_t sy1_offset,
                                           cl_mem sparam_buffer, const size_t sparam_offset,
                                           cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                                           cl_mem sd2_buffer, const size_t sd2_offset,
                                           cl_mem sx1_buffer, const size_t sx1_offset,
                                           const cl_mem sy1_buffer, const size_t sy1_offset,
                                           cl_mem sparam_buffer, const size_t sparam_offset,
                                           cl_command_queue* queue, cl_event* event);

// Apply givens plane rotation: SROT/DROT
CLBlastStatusCode PUBLIC_API CLBlastSrot(const size_t n,
                                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         const float cos,
                                         const float sin,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDrot(const size_t n,
                                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         const double cos,
                                         const double sin,
                                         cl_command_queue* queue, cl_event* event);

// Apply modified givens plane rotation: SROTM/DROTM
CLBlastStatusCode PUBLIC_API CLBlastSrotm(const size_t n,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem sparam_buffer, const size_t sparam_offset,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDrotm(const size_t n,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem sparam_buffer, const size_t sparam_offset,
                                          cl_command_queue* queue, cl_event* event);

// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
CLBlastStatusCode PUBLIC_API CLBlastSswap(const size_t n,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDswap(const size_t n,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCswap(const size_t n,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZswap(const size_t n,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHswap(const size_t n,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
CLBlastStatusCode PUBLIC_API CLBlastSscal(const size_t n,
                                          const float alpha,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDscal(const size_t n,
                                          const double alpha,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCscal(const size_t n,
                                          const cl_float2 alpha,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZscal(const size_t n,
                                          const cl_double2 alpha,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHscal(const size_t n,
                                          const cl_half alpha,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
CLBlastStatusCode PUBLIC_API CLBlastScopy(const size_t n,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDcopy(const size_t n,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCcopy(const size_t n,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZcopy(const size_t n,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHcopy(const size_t n,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
CLBlastStatusCode PUBLIC_API CLBlastSaxpy(const size_t n,
                                          const float alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDaxpy(const size_t n,
                                          const double alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCaxpy(const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZaxpy(const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHaxpy(const size_t n,
                                          const cl_half alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Dot product of two vectors: SDOT/DDOT/HDOT
CLBlastStatusCode PUBLIC_API CLBlastSdot(const size_t n,
                                         cl_mem dot_buffer, const size_t dot_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDdot(const size_t n,
                                         cl_mem dot_buffer, const size_t dot_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHdot(const size_t n,
                                         cl_mem dot_buffer, const size_t dot_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         cl_command_queue* queue, cl_event* event);

// Dot product of two complex vectors: CDOTU/ZDOTU
CLBlastStatusCode PUBLIC_API CLBlastCdotu(const size_t n,
                                          cl_mem dot_buffer, const size_t dot_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZdotu(const size_t n,
                                          cl_mem dot_buffer, const size_t dot_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
CLBlastStatusCode PUBLIC_API CLBlastCdotc(const size_t n,
                                          cl_mem dot_buffer, const size_t dot_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZdotc(const size_t n,
                                          cl_mem dot_buffer, const size_t dot_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
CLBlastStatusCode PUBLIC_API CLBlastSnrm2(const size_t n,
                                          cl_mem nrm2_buffer, const size_t nrm2_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDnrm2(const size_t n,
                                          cl_mem nrm2_buffer, const size_t nrm2_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastScnrm2(const size_t n,
                                          cl_mem nrm2_buffer, const size_t nrm2_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDznrm2(const size_t n,
                                          cl_mem nrm2_buffer, const size_t nrm2_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHnrm2(const size_t n,
                                          cl_mem nrm2_buffer, const size_t nrm2_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
CLBlastStatusCode PUBLIC_API CLBlastSasum(const size_t n,
                                          cl_mem asum_buffer, const size_t asum_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDasum(const size_t n,
                                          cl_mem asum_buffer, const size_t asum_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastScasum(const size_t n,
                                          cl_mem asum_buffer, const size_t asum_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDzasum(const size_t n,
                                          cl_mem asum_buffer, const size_t asum_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHasum(const size_t n,
                                          cl_mem asum_buffer, const size_t asum_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
CLBlastStatusCode PUBLIC_API CLBlastSsum(const size_t n,
                                         cl_mem sum_buffer, const size_t sum_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDsum(const size_t n,
                                         cl_mem sum_buffer, const size_t sum_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastScsum(const size_t n,
                                         cl_mem sum_buffer, const size_t sum_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDzsum(const size_t n,
                                         cl_mem sum_buffer, const size_t sum_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHsum(const size_t n,
                                         cl_mem sum_buffer, const size_t sum_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);

// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
CLBlastStatusCode PUBLIC_API CLBlastiSamax(const size_t n,
                                          cl_mem imax_buffer, const size_t imax_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiDamax(const size_t n,
                                          cl_mem imax_buffer, const size_t imax_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiCamax(const size_t n,
                                          cl_mem imax_buffer, const size_t imax_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiZamax(const size_t n,
                                          cl_mem imax_buffer, const size_t imax_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiHamax(const size_t n,
                                          cl_mem imax_buffer, const size_t imax_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN
CLBlastStatusCode PUBLIC_API CLBlastiSamin(const size_t n,
                                          cl_mem imin_buffer, const size_t imin_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiDamin(const size_t n,
                                          cl_mem imin_buffer, const size_t imin_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiCamin(const size_t n,
                                          cl_mem imin_buffer, const size_t imin_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiZamin(const size_t n,
                                          cl_mem imin_buffer, const size_t imin_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiHamin(const size_t n,
                                          cl_mem imin_buffer, const size_t imin_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
CLBlastStatusCode PUBLIC_API CLBlastiSmax(const size_t n,
                                         cl_mem imax_buffer, const size_t imax_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiDmax(const size_t n,
                                         cl_mem imax_buffer, const size_t imax_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiCmax(const size_t n,
                                         cl_mem imax_buffer, const size_t imax_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiZmax(const size_t n,
                                         cl_mem imax_buffer, const size_t imax_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiHmax(const size_t n,
                                         cl_mem imax_buffer, const size_t imax_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);

// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
CLBlastStatusCode PUBLIC_API CLBlastiSmin(const size_t n,
                                         cl_mem imin_buffer, const size_t imin_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiDmin(const size_t n,
                                         cl_mem imin_buffer, const size_t imin_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiCmin(const size_t n,
                                         cl_mem imin_buffer, const size_t imin_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiZmin(const size_t n,
                                         cl_mem imin_buffer, const size_t imin_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiHmin(const size_t n,
                                         cl_mem imin_buffer, const size_t imin_offset,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_command_queue* queue, cl_event* event);

// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================

// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
CLBlastStatusCode PUBLIC_API CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                          const size_t m, const size_t n,
                                          const float alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const float beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                          const size_t m, const size_t n,
                                          const double alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const double beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                          const size_t m, const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_float2 beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                          const size_t m, const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_double2 beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                          const size_t m, const size_t n,
                                          const cl_half alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_half beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
CLBlastStatusCode PUBLIC_API CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                          const size_t m, const size_t n, const size_t kl, const size_t ku,
                                          const float alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const float beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                          const size_t m, const size_t n, const size_t kl, const size_t ku,
                                          const double alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const double beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                          const size_t m, const size_t n, const size_t kl, const size_t ku,
                                          const cl_float2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_float2 beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                          const size_t m, const size_t n, const size_t kl, const size_t ku,
                                          const cl_double2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_double2 beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                          const size_t m, const size_t n, const size_t kl, const size_t ku,
                                          const cl_half alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_half beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
CLBlastStatusCode PUBLIC_API CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_float2 beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_double2 beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
CLBlastStatusCode PUBLIC_API CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n, const size_t k,
                                          const cl_float2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_float2 beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n, const size_t k,
                                          const cl_double2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_double2 beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
CLBlastStatusCode PUBLIC_API CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_float2 beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_double2 beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
CLBlastStatusCode PUBLIC_API CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const float alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const float beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const double alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const double beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_half alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_half beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
CLBlastStatusCode PUBLIC_API CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n, const size_t k,
                                          const float alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const float beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n, const size_t k,
                                          const double alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const double beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n, const size_t k,
                                          const cl_half alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_half beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
CLBlastStatusCode PUBLIC_API CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const float alpha,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const float beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const double alpha,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const double beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_half alpha,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_half beta,
                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_command_queue* queue, cl_event* event);

// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
CLBlastStatusCode PUBLIC_API CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
CLBlastStatusCode PUBLIC_API CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n, const size_t k,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n, const size_t k,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n, const size_t k,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n, const size_t k,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n, const size_t k,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
CLBlastStatusCode PUBLIC_API CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
CLBlastStatusCode PUBLIC_API CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
CLBlastStatusCode PUBLIC_API CLBlastStbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n, const size_t k,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n, const size_t k,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n, const size_t k,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n, const size_t k,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
CLBlastStatusCode PUBLIC_API CLBlastStpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t n,
                                          const cl_mem ap_buffer, const size_t ap_offset,
                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          cl_command_queue* queue, cl_event* event);

// General rank-1 matrix update: SGER/DGER/HGER
CLBlastStatusCode PUBLIC_API CLBlastSger(const CLBlastLayout layout,
                                         const size_t m, const size_t n,
                                         const float alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDger(const CLBlastLayout layout,
                                         const size_t m, const size_t n,
                                         const double alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHger(const CLBlastLayout layout,
                                         const size_t m, const size_t n,
                                         const cl_half alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                         cl_command_queue* queue, cl_event* event);

// General rank-1 complex matrix update: CGERU/ZGERU
CLBlastStatusCode PUBLIC_API CLBlastCgeru(const CLBlastLayout layout,
                                          const size_t m, const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZgeru(const CLBlastLayout layout,
                                          const size_t m, const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_command_queue* queue, cl_event* event);

// General rank-1 complex conjugated matrix update: CGERC/ZGERC
CLBlastStatusCode PUBLIC_API CLBlastCgerc(const CLBlastLayout layout,
                                          const size_t m, const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZgerc(const CLBlastLayout layout,
                                          const size_t m, const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_command_queue* queue, cl_event* event);

// Hermitian rank-1 matrix update: CHER/ZHER
CLBlastStatusCode PUBLIC_API CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                         const size_t n,
                                         const float alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                         const size_t n,
                                         const double alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                         cl_command_queue* queue, cl_event* event);

// Hermitian packed rank-1 matrix update: CHPR/ZHPR
CLBlastStatusCode PUBLIC_API CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                         const size_t n,
                                         const float alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem ap_buffer, const size_t ap_offset,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                         const size_t n,
                                         const double alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem ap_buffer, const size_t ap_offset,
                                         cl_command_queue* queue, cl_event* event);

// Hermitian rank-2 matrix update: CHER2/ZHER2
CLBlastStatusCode PUBLIC_API CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_command_queue* queue, cl_event* event);

// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
CLBlastStatusCode PUBLIC_API CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem ap_buffer, const size_t ap_offset,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem ap_buffer, const size_t ap_offset,
                                          cl_command_queue* queue, cl_event* event);

// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
CLBlastStatusCode PUBLIC_API CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                         const size_t n,
                                         const float alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                         const size_t n,
                                         const double alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                         const size_t n,
                                         const cl_half alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                         cl_command_queue* queue, cl_event* event);

// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
CLBlastStatusCode PUBLIC_API CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                         const size_t n,
                                         const float alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem ap_buffer, const size_t ap_offset,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                         const size_t n,
                                         const double alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem ap_buffer, const size_t ap_offset,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                         const size_t n,
                                         const cl_half alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         cl_mem ap_buffer, const size_t ap_offset,
                                         cl_command_queue* queue, cl_event* event);

// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
CLBlastStatusCode PUBLIC_API CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const float alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const double alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_half alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_command_queue* queue, cl_event* event);

// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
CLBlastStatusCode PUBLIC_API CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const float alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem ap_buffer, const size_t ap_offset,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const double alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem ap_buffer, const size_t ap_offset,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                                          const size_t n,
                                          const cl_half alpha,
                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                          cl_mem ap_buffer, const size_t ap_offset,
                                          cl_command_queue* queue, cl_event* event);

// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================

// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
CLBlastStatusCode PUBLIC_API CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                          const size_t m, const size_t n, const size_t k,
                                          const float alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const float beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                          const size_t m, const size_t n, const size_t k,
                                          const double alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const double beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                          const size_t m, const size_t n, const size_t k,
                                          const cl_float2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const cl_float2 beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                          const size_t m, const size_t n, const size_t k,
                                          const cl_double2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const cl_double2 beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                          const size_t m, const size_t n, const size_t k,
                                          const cl_half alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const cl_half beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);

// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
CLBlastStatusCode PUBLIC_API CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                                          const size_t m, const size_t n,
                                          const float alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const float beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                                          const size_t m, const size_t n,
                                          const double alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const double beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                                          const size_t m, const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const cl_float2 beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                                          const size_t m, const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const cl_double2 beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                                          const size_t m, const size_t n,
                                          const cl_half alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const cl_half beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);

// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
CLBlastStatusCode PUBLIC_API CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                                          const size_t m, const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const cl_float2 beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                                          const size_t m, const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          const cl_double2 beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);

// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
CLBlastStatusCode PUBLIC_API CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                                          const size_t n, const size_t k,
                                          const float alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const float beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                                          const size_t n, const size_t k,
                                          const double alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const double beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                                          const size_t n, const size_t k,
                                          const cl_float2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_float2 beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                                          const size_t n, const size_t k,
                                          const cl_double2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_double2 beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                                          const size_t n, const size_t k,
                                          const cl_half alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const cl_half beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);

// Rank-K update of a hermitian matrix: CHERK/ZHERK
CLBlastStatusCode PUBLIC_API CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                                          const size_t n, const size_t k,
                                          const float alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const float beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                                          const size_t n, const size_t k,
                                          const double alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          const double beta,
                                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                          cl_command_queue* queue, cl_event* event);

// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
CLBlastStatusCode PUBLIC_API CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                           const size_t n, const size_t k,
                                           const float alpha,
                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                           const float beta,
                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                           cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                           const size_t n, const size_t k,
                                           const double alpha,
                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                           const double beta,
                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                           cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                           const size_t n, const size_t k,
                                           const cl_float2 alpha,
                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                           const cl_float2 beta,
                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                           cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                           const size_t n, const size_t k,
                                           const cl_double2 alpha,
                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                           const cl_double2 beta,
                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                           cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                           const size_t n, const size_t k,
                                           const cl_half alpha,
                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                           const cl_half beta,
                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                           cl_command_queue* queue, cl_event* event);

// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
CLBlastStatusCode PUBLIC_API CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                           const size_t n, const size_t k,
                                           const cl_float2 alpha,
                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                           const float beta,
                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                           cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                           const size_t n, const size_t k,
                                           const cl_double2 alpha,
                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                           const double beta,
                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                           cl_command_queue* queue, cl_event* event);

// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
CLBlastStatusCode PUBLIC_API CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t m, const size_t n,
                                          const float alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t m, const size_t n,
                                          const double alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t m, const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t m, const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t m, const size_t n,
                                          const cl_half alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          cl_command_queue* queue, cl_event* event);

// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t m, const size_t n,
                                          const float alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t m, const size_t n,
                                          const double alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t m, const size_t n,
                                          const cl_float2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                          const size_t m, const size_t n,
                                          const cl_double2 alpha,
                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                          cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                          cl_command_queue* queue, cl_event* event);

// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================

// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
CLBlastStatusCode PUBLIC_API CLBlastShad(const size_t n,
                                         const float alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         const float beta,
                                         cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDhad(const size_t n,
                                         const double alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         const double beta,
                                         cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastChad(const size_t n,
                                         const cl_float2 alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         const cl_float2 beta,
                                         cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZhad(const size_t n,
                                         const cl_double2 alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         const cl_double2 beta,
                                         cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
                                         cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHhad(const size_t n,
                                         const cl_half alpha,
                                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                         const cl_half beta,
                                         cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
                                         cl_command_queue* queue, cl_event* event);

// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
CLBlastStatusCode PUBLIC_API CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                              const size_t m, const size_t n,
                                              const float alpha,
                                              const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                              cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                              cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                              const size_t m, const size_t n,
                                              const double alpha,
                                              const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                              cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                              cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                              const size_t m, const size_t n,
                                              const cl_float2 alpha,
                                              const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                              cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                              cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                              const size_t m, const size_t n,
                                              const cl_double2 alpha,
                                              const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                              cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                              cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                              const size_t m, const size_t n,
                                              const cl_half alpha,
                                              const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                              cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                              cl_command_queue* queue, cl_event* event);

// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
CLBlastStatusCode PUBLIC_API CLBlastSim2col(const CLBlastKernelMode kernel_mode,
                                            const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                            const cl_mem im_buffer, const size_t im_offset,
                                            cl_mem col_buffer, const size_t col_offset,
                                            cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDim2col(const CLBlastKernelMode kernel_mode,
                                            const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                            const cl_mem im_buffer, const size_t im_offset,
                                            cl_mem col_buffer, const size_t col_offset,
                                            cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCim2col(const CLBlastKernelMode kernel_mode,
                                            const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                            const cl_mem im_buffer, const size_t im_offset,
                                            cl_mem col_buffer, const size_t col_offset,
                                            cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZim2col(const CLBlastKernelMode kernel_mode,
                                            const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                            const cl_mem im_buffer, const size_t im_offset,
                                            cl_mem col_buffer, const size_t col_offset,
                                            cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHim2col(const CLBlastKernelMode kernel_mode,
                                            const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                            const cl_mem im_buffer, const size_t im_offset,
                                            cl_mem col_buffer, const size_t col_offset,
                                            cl_command_queue* queue, cl_event* event);

// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
CLBlastStatusCode PUBLIC_API CLBlastScol2im(const CLBlastKernelMode kernel_mode,
                                            const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                            const cl_mem col_buffer, const size_t col_offset,
                                            cl_mem im_buffer, const size_t im_offset,
                                            cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDcol2im(const CLBlastKernelMode kernel_mode,
                                            const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                            const cl_mem col_buffer, const size_t col_offset,
                                            cl_mem im_buffer, const size_t im_offset,
                                            cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCcol2im(const CLBlastKernelMode kernel_mode,
                                            const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                            const cl_mem col_buffer, const size_t col_offset,
                                            cl_mem im_buffer, const size_t im_offset,
                                            cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZcol2im(const CLBlastKernelMode kernel_mode,
                                            const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                            const cl_mem col_buffer, const size_t col_offset,
                                            cl_mem im_buffer, const size_t im_offset,
                                            cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHcol2im(const CLBlastKernelMode kernel_mode,
                                            const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                            const cl_mem col_buffer, const size_t col_offset,
                                            cl_mem im_buffer, const size_t im_offset,
                                            cl_command_queue* queue, cl_event* event);

// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
CLBlastStatusCode PUBLIC_API CLBlastSconvgemm(const CLBlastKernelMode kernel_mode,
                                              const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
                                              const cl_mem im_buffer, const size_t im_offset,
                                              const cl_mem kernel_buffer, const size_t kernel_offset,
                                              cl_mem result_buffer, const size_t result_offset,
                                              cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDconvgemm(const CLBlastKernelMode kernel_mode,
                                              const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
                                              const cl_mem im_buffer, const size_t im_offset,
                                              const cl_mem kernel_buffer, const size_t kernel_offset,
                                              cl_mem result_buffer, const size_t result_offset,
                                              cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHconvgemm(const CLBlastKernelMode kernel_mode,
                                              const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
                                              const cl_mem im_buffer, const size_t im_offset,
                                              const cl_mem kernel_buffer, const size_t kernel_offset,
                                              cl_mem result_buffer, const size_t result_offset,
                                              cl_command_queue* queue, cl_event* event);

// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n,
                                                 const float *alphas,
                                                 const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
                                                 cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
                                                 const size_t batch_count,
                                                 cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n,
                                                 const double *alphas,
                                                 const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
                                                 cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
                                                 const size_t batch_count,
                                                 cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n,
                                                 const cl_float2 *alphas,
                                                 const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
                                                 cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
                                                 const size_t batch_count,
                                                 cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n,
                                                 const cl_double2 *alphas,
                                                 const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
                                                 cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
                                                 const size_t batch_count,
                                                 cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n,
                                                 const cl_half *alphas,
                                                 const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
                                                 cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
                                                 const size_t batch_count,
                                                 cl_command_queue* queue, cl_event* event);

// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
CLBlastStatusCode PUBLIC_API CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                 const size_t m, const size_t n, const size_t k,
                                                 const float *alphas,
                                                 const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                                                 const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
                                                 const float *betas,
                                                 cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                                                 const size_t batch_count,
                                                 cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                 const size_t m, const size_t n, const size_t k,
                                                 const double *alphas,
                                                 const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                                                 const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
                                                 const double *betas,
                                                 cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                                                 const size_t batch_count,
                                                 cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                 const size_t m, const size_t n, const size_t k,
                                                 const cl_float2 *alphas,
                                                 const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                                                 const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
                                                 const cl_float2 *betas,
                                                 cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                                                 const size_t batch_count,
                                                 cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                 const size_t m, const size_t n, const size_t k,
                                                 const cl_double2 *alphas,
                                                 const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                                                 const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
                                                 const cl_double2 *betas,
                                                 cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                                                 const size_t batch_count,
                                                 cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                 const size_t m, const size_t n, const size_t k,
                                                 const cl_half *alphas,
                                                 const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                                                 const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
                                                 const cl_half *betas,
                                                 cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                                                 const size_t batch_count,
                                                 cl_command_queue* queue, cl_event* event);

// StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED
CLBlastStatusCode PUBLIC_API CLBlastSgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const float alpha,
                                                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                                                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
                                                        const float beta,
                                                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                                                        const size_t batch_count,
                                                        cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const double alpha,
                                                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                                                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
                                                        const double beta,
                                                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                                                        const size_t batch_count,
                                                        cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const cl_float2 alpha,
                                                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                                                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
                                                        const cl_float2 beta,
                                                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                                                        const size_t batch_count,
                                                        cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const cl_double2 alpha,
                                                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                                                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
                                                        const cl_double2 beta,
                                                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                                                        const size_t batch_count,
                                                        cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const cl_half alpha,
                                                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                                                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
                                                        const cl_half beta,
                                                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                                                        const size_t batch_count,
                                                        cl_command_queue* queue, cl_event* event);

// =================================================================================================
// General matrix-matrix multiplication with temporary buffer from user (optional, for advanced users): SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
CLBlastStatusCode PUBLIC_API CLBlastSgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const float alpha,
                                                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                                        const float beta,
                                                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                                        cl_command_queue* queue, cl_event* event, cl_mem temp_buffer);
CLBlastStatusCode PUBLIC_API CLBlastDgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const double alpha,
                                                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                                        const double beta,
                                                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                                        cl_command_queue* queue, cl_event* event, cl_mem temp_buffer);
CLBlastStatusCode PUBLIC_API CLBlastCgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const cl_float2 alpha,
                                                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                                        const cl_float2 beta,
                                                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                                        cl_command_queue* queue, cl_event* event, cl_mem temp_buffer);
CLBlastStatusCode PUBLIC_API CLBlastZgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const cl_double2 alpha,
                                                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                                        const cl_double2 beta,
                                                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                                        cl_command_queue* queue, cl_event* event, cl_mem temp_buffer);
CLBlastStatusCode PUBLIC_API CLBlastHgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const cl_half alpha,
                                                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                                        const cl_half beta,
                                                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                                        cl_command_queue* queue, cl_event* event, cl_mem temp_buffer);

// =================================================================================================
// Retrieves the required size of the temporary buffer for the GEMM kernel: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM (optional)
CLBlastStatusCode PUBLIC_API CLBlastSGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const size_t a_offset, const size_t a_ld,
                                                        const size_t b_offset, const size_t b_ld,
                                                        const size_t c_offset, const size_t c_ld,
                                                        cl_command_queue* queue,
                                                        size_t* temp_buffer_size);

CLBlastStatusCode PUBLIC_API CLBlastDGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const size_t a_offset, const size_t a_ld,
                                                        const size_t b_offset, const size_t b_ld,
                                                        const size_t c_offset, const size_t c_ld,
                                                        cl_command_queue* queue,
                                                        size_t* temp_buffer_size);

CLBlastStatusCode PUBLIC_API CLBlastCGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const size_t a_offset, const size_t a_ld,
                                                        const size_t b_offset, const size_t b_ld,
                                                        const size_t c_offset, const size_t c_ld,
                                                        cl_command_queue* queue,
                                                        size_t* temp_buffer_size);

CLBlastStatusCode PUBLIC_API CLBlastZGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const size_t a_offset, const size_t a_ld,
                                                        const size_t b_offset, const size_t b_ld,
                                                        const size_t c_offset, const size_t c_ld,
                                                        cl_command_queue* queue,
                                                        size_t* temp_buffer_size);

CLBlastStatusCode PUBLIC_API CLBlastHGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                                        const size_t m, const size_t n, const size_t k,
                                                        const size_t a_offset, const size_t a_ld,
                                                        const size_t b_offset, const size_t b_ld,
                                                        const size_t c_offset, const size_t c_ld,
                                                        cl_command_queue* queue,
                                                        size_t* temp_buffer_size);

// =================================================================================================

// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
CLBlastStatusCode PUBLIC_API CLBlastClearCache();

// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
// Further CLBlast routine calls will then run at maximum speed.
CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device);

// =================================================================================================

// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
// the target routine is called it will re-compile and use the new parameters from then on.
CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name,
                                                       const CLBlastPrecision precision, const size_t num_parameters,
                                                       const char** parameters_names, const size_t* parameters_values);

// =================================================================================================

#ifdef __cplusplus
} // extern "C"
#endif

// CLBLAST_CLBLAST_C_H_
#endif