diff options
68 files changed, 2406 insertions, 2004 deletions
diff --git a/.appveyor.yml b/.appveyor.yml index 8597e43e..eb7f1c97 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,8 +1,8 @@ environment: global: - CLBLAST_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\clblast" + CLBLAST_ROOT: "%APPVEYOR_BUILD_FOLDER%\\..\\bin\\clblast" OPENCL_REGISTRY: "https://www.khronos.org/registry/cl" - OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\opencl" + OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\..\\bin\\opencl" platform: - x64 @@ -2,5 +2,6 @@ build stash .* *.pyc -*.db +database.json +database_best.json cl.hpp
\ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 8e1a80db..0465afa4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,49 +17,21 @@ addons: - kubuntu-backports packages: - cmake + - ocl-icd-opencl-dev env: global: - CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast - - OPENCL_REGISTRY=https://www.khronos.org/registry/cl - - OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl before_install: - cmake --version; - ${CC} --version; - ${CXX} --version; -install: - # The following linux logic is necessary because of Travis's move to the GCE platform, which does not - # currently contain packages for fglrx: https://github.com/travis-ci/travis-ci/issues/5221 - # We build our own linkable .so file - - if [ ${TRAVIS_OS_NAME} == "linux" ]; then - mkdir -p ${OPENCL_ROOT}; - pushd ${OPENCL_ROOT}; - travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git; - mv ./OpenCL-ICD-Loader/* .; - travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL; - pushd inc/CL; - travis_retry wget -w 1 -np -nd -nv -A h,hpp ${OPENCL_REGISTRY}/api/2.1/cl.hpp; - popd; - mkdir -p lib; - pushd lib; - cmake -G "Unix Makefiles" ..; - make; - cp ./bin/libOpenCL.so .; - popd; - pushd inc/CL; - travis_retry git fetch origin opencl12:opencl12; - git checkout opencl12; - popd; - mv inc/ include/; - popd; - fi - before_script: - mkdir -p ${CLBLAST_ROOT} - pushd ${CLBLAST_ROOT} - - cmake -DOPENCL_ROOT=${OPENCL_ROOT} -DTESTS=ON -DCLIENTS=ON ${TRAVIS_BUILD_DIR} + - cmake -DTESTS=ON -DCLIENTS=ON ${TRAVIS_BUILD_DIR} script: - make @@ -1,13 +1,20 @@ Development version (next release) +- It is now possible to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS + +Version 0.9.0 - Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header +- Improved performance significantly of rotated GEMV computations +- Improved performance of unseen/un-tuned devices by a better default tuning parameter selection - Fixed proper MSVC dllimport and dllexport declarations - Fixed memory leaks related to events not being released - Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems - Fixed a bug related to the cache and retrieval of programs based on the OpenCL context - Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels +- Fixed a bug in the OpenCL kernels: now placing __kernel before __attribute__ +- Fixed a bug in level-3 routines when beta is zero and matrix C contains NaNs - Added an option (-warm_up) to do a warm-up run before timing in the performance clients -- Improved performance significantly of rotated GEMV computations +- Various minor fixes and enhancements - Added tuned parameters for various devices (see README) Version 0.8.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 95d1d500..178ac9bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla # CMake project details project("clblast" C CXX) set(clblast_VERSION_MAJOR 0) -set(clblast_VERSION_MINOR 8) +set(clblast_VERSION_MINOR 9) set(clblast_VERSION_PATCH 0) # Options and their default values @@ -75,6 +75,12 @@ else() if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0) set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable") endif() + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0.0) + # GCC does not support attributes on template arguments + # in particular we hit this with the alignment attributes on cl_XXX types + # which are then used to instantiate various templates in CLBlast + set(FLAGS "${FLAGS} -Wno-ignored-attributes") + endif() elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang) set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch") @@ -127,11 +133,6 @@ endif() # ================================================================================================== -# Includes directories: CLBlast and OpenCL -include_directories(${clblast_SOURCE_DIR}/include ${clblast_SOURCE_DIR}/src ${OPENCL_INCLUDE_DIRS}) - -# ================================================================================================== - # Sets the supported routines and the used kernels. New routines and kernels should be added here. set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) @@ -173,26 +174,36 @@ endforeach() add_library(clblast SHARED ${SOURCES}) target_link_libraries(clblast ${OPENCL_LIBRARIES}) +# Includes directories: CLBlast and OpenCL +target_include_directories(clblast PUBLIC + $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include> + $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src> + $<INSTALL_INTERFACE:include> + ${OPENCL_INCLUDE_DIRS}) + # Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built if(MSVC) target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11 endif() # Installs the library -install(TARGETS clblast DESTINATION lib) +install(TARGETS clblast EXPORT CLBlast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) +# Installs the config for find_package in dependent projects +install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake) + # ================================================================================================== -# Sets a default platform ($DEVICEPLATFORM) and device ($DEFAULT_DEVICE) to run tuners and tests on +# Sets a default platform ($DEVICEPLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests on set(DEVICEPLATFORM ) -if(DEFINED ENV{DEFAULT_DEVICE}) - set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{DEFAULT_DEVICE}) +if(DEFINED ENV{CLBLAST_DEVICE}) + set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{CLBLAST_DEVICE}) endif() -if(DEFINED ENV{DEFAULT_PLATFORM}) - set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{DEFAULT_PLATFORM}) +if(DEFINED ENV{CLBLAST_PLATFORM}) + set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{CLBLAST_PLATFORM}) endif() # ================================================================================================== @@ -225,9 +236,6 @@ endif() # the CLTune library (not included as part of the source). if(TUNERS) - # Includes CLTune - include_directories(${CLTUNE_INCLUDE_DIRS}) - # Visual Studio requires the sources of non-exported objects/libraries set(TUNERS_COMMON ) if(MSVC) @@ -238,6 +246,7 @@ if(TUNERS) foreach(KERNEL ${KERNELS}) add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp) target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES}) + target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS}) install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin) endforeach() @@ -281,9 +290,6 @@ if(CLIENTS OR TESTS) endif() endif() - # Sets the include directories - include_directories(${clblast_SOURCE_DIR} ${REF_INCLUDES}) - endif() # ================================================================================================== @@ -299,6 +305,11 @@ if(CLIENTS) else() # Creates the common performance-tests objects (requires CMake 2.8.8) add_library(test_performance_common OBJECT test/performance/client.cpp) + + # Adds CLBlast's interface include paths because we can't link to CLBlast here + target_include_directories(test_performance_common PRIVATE + $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> + ${clblast_SOURCE_DIR}) set(CLIENTS_COMMON ${CLIENTS_COMMON} $<TARGET_OBJECTS:test_performance_common>) endif() @@ -321,6 +332,7 @@ if(CLIENTS) endforeach() foreach(ROUTINE ${ROUTINES}) target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) install(TARGETS clblast_client_${ROUTINE} DESTINATION bin) endforeach() @@ -342,6 +354,9 @@ if(TESTS) # Creates the common correctness-tests objects (requires CMake 2.8.8) add_library(test_correctness_common OBJECT test/correctness/tester.cpp test/correctness/testblas.cpp) + target_include_directories(test_correctness_common PUBLIC + $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> + ${clblast_SOURCE_DIR}) set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>) endif() @@ -365,6 +380,7 @@ if(TESTS) foreach(ROUTINE ${ROUTINES}) target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_test_${ROUTINE} DESTINATION bin) + target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM}) endforeach() @@ -90,6 +90,8 @@ Afterwards, any of CLBlast's routines can be called directly: there is no need t cmake -DSAMPLES=ON .. +Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler. + Using the tuners (optional) ------------- @@ -118,6 +120,7 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC * Intel GPUs: - HD Graphics 530 - HD Graphics Haswell Ultrabook GT2 Mobile + - HD Graphics 5500 BroadWell U-Processor GT2 - HD Graphics Skylake ULT GT2 - Iris - Iris Pro @@ -135,7 +138,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher). -Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake. +Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake. The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl). @@ -167,7 +170,7 @@ To build these tests, another BLAS library is needed to serve as a reference. Th Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further. -All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake. +All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake. Compiling the performance tests/clients (optional) @@ -283,9 +286,11 @@ The contributing authors (code, pull requests, testing) so far are: * [Cedric Nugteren](http://www.cedricnugteren.nl) - main author * [Anton Lokhmotov](https://github.com/psyhtest) * [Dragan Djuric](https://github.com/blueberry) -* [Marco Hutter](https://github.com/gpus) +* [Marco Hutter](http://marco-hutter.de/) * [Hugh Perkins](https://github.com/hughperkins) * [Gian-Carlo Pascutto](https://github.com/gcp) +* [Ivan Shapovalov](https://github.com/intelfx) +* [Dimitri Van Assche](https://github.com/dvasschemacq) Tuning and testing on a variety of OpenCL devices was made possible by: diff --git a/scripts/database/database.py b/scripts/database/database.py index e115d68c..f758a2b7 100755 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -11,8 +11,6 @@ import os.path import glob import argparse -import pandas as pd - import database.io as io import database.db as db import database.clblast as clblast @@ -20,15 +18,15 @@ import database.bests as bests import database.defaults as defaults # Server storing a copy of the database -DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db" +DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.json" # OpenCL vendor names and their short name -VENDOR_TRANSLATION_TABLE = {"device_vendor": { +VENDOR_TRANSLATION_TABLE = { "GenuineIntel": "Intel", "Intel(R) Corporation": "Intel", "Advanced Micro Devices, Inc.": "AMD", "NVIDIA Corporation": "NVIDIA", -}} +} def main(argv): @@ -41,7 +39,8 @@ def main(argv): cl_args = parser.parse_args(argv) # Parses the path arguments - database_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database.db") + database_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database.json") + database_best_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database_best.json") json_files = os.path.join(cl_args.source_folder, "*.json") cpp_database_path = os.path.join(cl_args.clblast_root, "src", "database", "kernels") @@ -52,11 +51,6 @@ def main(argv): if len(glob.glob(json_files)) < 1: print("[database] The path '" + cl_args.source_folder + "' does not contain any JSON files") - # Pandas options - pd.set_option('display.width', 1000) - if cl_args.verbose: - print("[database] Using pandas version " + pd.__version__) - # Downloads the database if a local copy is not present if not os.path.isfile(database_filename): io.download_database(database_filename, DATABASE_SERVER_URL) @@ -68,38 +62,36 @@ def main(argv): for file_json in glob.glob(json_files): # Loads the newly imported data - sys.stdout.write("[database] Processing '"+file_json+"' ") # No newline printed - imported_data = io.load_json_to_pandas(file_json) + sys.stdout.write("[database] Processing '" + file_json + "' ") # No newline printed + imported_data = io.load_tuning_results(file_json) # Fixes the problem that some vendors use multiple different names - imported_data = db.find_and_replace(imported_data, VENDOR_TRANSLATION_TABLE) + for target in VENDOR_TRANSLATION_TABLE: + if imported_data["device_vendor"] == target: + imported_data["device_vendor"] = VENDOR_TRANSLATION_TABLE[target] # Adds the new data to the database - old_size = len(database.index) - database = db.concatenate_database(database, imported_data) - database = db.remove_duplicates(database) - new_size = len(database.index) + old_size = db.length(database) + database = db.add_section(database, imported_data) + new_size = db.length(database) print("with " + str(new_size - old_size) + " new items") # Newline printed here # Stores the modified database back to disk if len(glob.glob(json_files)) >= 1: io.save_database(database, database_filename) - # Optional: update the database here. Default is disabled, code below is just an example - if False: # TODO: Use command-line arguments to enable updates in a flexible way - database = db.update_database(database, - ((database["kernel"] == "CopyMatrixFast") & - (database["precision"] == "3232")), - "arg_alpha", "2+0.5i") - io.save_database(database, database_filename) - # Retrieves the best performing results print("[database] Calculating the best results per device/kernel...") database_best_results = bests.get_best_results(database) # Determines the defaults for other vendors and per vendor - database_defaults = defaults.calculate_defaults(database_best_results) - database_best_results = db.concatenate_database(database_best_results, database_defaults) + print("[database] Calculating the default values...") + database_defaults = defaults.calculate_defaults(database, cl_args.verbose) + database_best_results["sections"].extend(database_defaults["sections"]) + + # Optionally outputs the database to disk + if cl_args.verbose: + io.save_database(database_best_results, database_best_filename) # Outputs the database as a C++ database print("[database] Producing a C++ database in '" + cpp_database_path + "'...") diff --git a/scripts/database/database/bests.py b/scripts/database/database/bests.py index edb81733..c924efde 100644 --- a/scripts/database/database/bests.py +++ b/scripts/database/database/bests.py @@ -5,16 +5,54 @@ # Author(s): # Cedric Nugteren <www.cedricnugteren.nl> -import pandas as pd -import clblast - - -def get_best_results(df): - """Retrieves the results with the lowests execution times""" - database_bests = pd.DataFrame() - database_entries = df.groupby(clblast.ATTRIBUTES + ["kernel"]) - for name, database_entry in database_entries: - best_time = database_entry["time"].min() - best_parameters = database_entry[database_entry["time"] == best_time].iloc[0] - database_bests = database_bests.append(best_parameters, ignore_index=True) - return database_bests +import sys + + +def get_best_results(database): + """Retrieves the results with the lowest execution times""" + sections_best = [] + for section in database["sections"]: + section_best = {} + + # Stores all the section's meta data + for attribute in section.keys(): + if attribute != "results": + section_best[attribute] = section[attribute] + + # Find the best result + parameters_best = None + time_best = sys.float_info.max + for result in section["results"]: + if result["time"] < time_best: + time_best = result["time"] + parameters_best = result["parameters"] + + # Stores the best result + section_best["results"] = [{"time": time_best, "parameters": parameters_best}] + sections_best.append(section_best) + + return {"sections": sections_best} + + +def get_relative_bests(name, common_results, common_parameters, verbose=False): + """Retrieves the parameters with the relative best execution time over different devices""" + + # Helper function + def argmax(iterable): + return max(enumerate(iterable), key=lambda x: x[1])[0] + + # Computes the sum of the execution times over the different devices + performance_sums = [] + for parameters in common_parameters: + performance_sum = sum([r["relative_performance"] for r in common_results if r["parameters"] == parameters]) + performance_sums.append(performance_sum) + + # Retrieves the entry with the highest performance + best_index = argmax(performance_sums) + best_performance = performance_sums[best_index] + best_parameters = common_parameters[best_index] + + # Completed, report and return the results + if verbose: + print("[database] " + str(name) + " with performance " + str(best_performance)) + return best_parameters diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py index 46b711cc..8190f225 100644 --- a/scripts/database/database/clblast.py +++ b/scripts/database/database/clblast.py @@ -18,6 +18,7 @@ DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"] KERNEL_ATTRIBUTES = ["precision", "kernel_family"] ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"] ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES +GROUP_ATTRIBUTES = DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ["kernel"] + ARGUMENT_ATTRIBUTES def precision_to_string(precision): @@ -81,42 +82,63 @@ def print_cpp_database(database, output_dir): """Outputs the database as C++ code""" # Iterates over the kernel families - for family_name, family_database in database.groupby(["kernel_family"]): - family_database = family_database.dropna(axis=1, how='all') + kernel_families = sorted(set([s["kernel_family"] for s in database["sections"]])) + for family_name in kernel_families: + family_database = [s for s in database["sections"] if s["kernel_family"] == family_name] # Opens a new file for each kernel family - full_path = os.path.join(output_dir, family_name+'.hpp') + full_path = os.path.join(output_dir, family_name + ".hpp") with open(full_path, 'w+') as f: f.write(get_cpp_header(family_name)) # Loops over the different precision (e.g. 16, 32, 3232, 64, 6464) - for precision, precision_database in family_database.groupby(["precision"]): + precisions = sorted(set([s["precision"] for s in database["sections"]])) # Based on full database + for precision in precisions: + precision_database = [s for s in family_database if s["precision"] == precision] f.write(get_cpp_precision(family_name, precision)) - # Loops over a combination of device vendors and device types (e.g. AMD GPU) - for vendor, vendor_database in precision_database.groupby(["device_vendor"]): - for device_type, device_type_database in vendor_database.groupby(["device_type"]): + # In case there is nothing found at all (e.g. 16-bit): continue as if this was a precision of 32 but + # with the defaults only + if len(precision_database) == 0: + print("[database] No results found for %s:%s, retrieving defaults from %s:32" % + (family_name, precision, family_name)) + precision_database = [s for s in family_database if s["precision"] == "32" + and s["device_vendor"] == VENDOR_DEFAULT + and s["device_type"] == DEVICE_TYPE_DEFAULT + and s["device"] == DEVICE_NAME_DEFAULT] + + # Loops over device vendors (e.g. AMD) + device_vendors = sorted(set([s["device_vendor"] for s in precision_database])) + for vendor in device_vendors: + vendor_database = [s for s in precision_database if s["device_vendor"] == vendor] + + # Loops over device types (e.g. GPU) + device_types = sorted(set([s["device_type"] for s in vendor_database])) + for device_type in device_types: + type_database = [s for s in vendor_database if s["device_type"] == device_type] f.write(get_cpp_device_vendor(vendor, device_type)) # Loops over every device of this vendor-type combination - for device_name, device_database in device_type_database.groupby(["device"]): + devices = sorted(set([s["device"] for s in type_database])) + for device_name in devices: + device_database = [s for s in type_database if s["device"] == device_name] device_name_quoted = "\"%s\"," % device_name device_name_cpp = " { %-50s { " % device_name_quoted f.write(device_name_cpp) # Collects the parameters for this entry parameters = [] - for kernel, kernel_database in device_database.groupby(["kernel"]): - kernel_database = kernel_database.dropna(axis=1) + kernels = sorted(set([s["kernel"] for s in device_database])) + for kernel in kernels: + kernel_database = [s for s in device_database if s["kernel"] == kernel] - # Only consider the actual parameters, not the precision - def is_parameter(column): - return column.startswith('parameters.') and column != "parameters.PRECISION" - column_names = [col for col in list(kernel_database) if is_parameter(col)] + assert len(kernel_database) == 1 + results = kernel_database[0]["results"] - for p in column_names: - parameter_name = p.replace("parameters.", "") - parameter_value = int(kernel_database[p].iloc[0]) + assert len(results) == 1 + new_parameters = results[0]["parameters"] + for parameter_name in sorted(new_parameters): + parameter_value = new_parameters[parameter_name] parameters.append("{\"" + parameter_name + "\"," + str(parameter_value) + "}") # Prints the entry diff --git a/scripts/database/database/db.py b/scripts/database/database/db.py index 60cfbcfa..94948b1a 100644 --- a/scripts/database/database/db.py +++ b/scripts/database/database/db.py @@ -5,46 +5,60 @@ # Author(s): # Cedric Nugteren <www.cedricnugteren.nl> -import pandas as pd +import clblast -def get_entries_by_field(database, field, value): - """Retrieves entries from the database with a specific value for a given field""" - return database[database[field] == value] +def length(database): + """Computes the total number of tuning entries""" + num_tuning_entries = 0 + for section in database["sections"]: + num_tuning_entries += len(section["results"]) + return num_tuning_entries -def concatenate_database(database1, database2): - """Concatenates two databases row-wise and returns the result""" - return pd.concat([database1, database2]) +def add_section(database, new_section): + """Adds a new section to the database""" + for old_section in database["sections"]: + # Verify whether the sections match + equal = True + for attribute in new_section.keys(): + if attribute != "results": + if attribute not in old_section or new_section[attribute] != old_section[attribute]: + equal = False + break -def remove_duplicates(database): - """Removes duplicates from a database""" - return database.drop_duplicates() + # They match: append the new section's results to the corresponding entry in the database and return + if equal: + old_section["results"] = combine_results(old_section["results"], new_section["results"]) + return database - -def find_and_replace(database, dictionary): - """Finds and replaces entries in a database based on a dictionary. Example: - dictionary = { "key_to_edit": { find1: replace1, find2, replace2 } }""" - return database.replace(dictionary) + # No match found: append the whole new section to the database + database["sections"].append(new_section) + return database -def remove_entries_by_key_value(database, key, value): - """Removes entries in the databased which have a specific value for a given key""" - return database[database[key] != value] +def combine_results(old_results, new_results): + """Adds new results to the results JSON list""" + for new_result in new_results: + old_results = combine_result(old_results, new_result) + return old_results -def remove_entries_by_device(database, device_name): - """Shorthand for the above, specifically removes entries for a given device""" - return remove_entries_by_key_value(database, "device", device_name) +def combine_result(old_results, new_result): + """Adds a new result to the results JSON list; filters for duplicate entries and saves the best performing one""" + # Loops over all existing results to test for already existing entries with these parameters + for old_result in old_results: -def remove_entries_by_kernel_family(database, kernel_family_name): - """Shorthand for the above, specifically removes entries for a given kernel family""" - return remove_entries_by_key_value(database, "kernel_family", kernel_family_name) + # Verify whether the results match + equal = new_result["parameters"] == old_result["parameters"] + # They match: keep only the one with the minimum execution time + if equal: + old_result["time"] = min(old_result["time"], new_result["time"]) + return old_results -def update_database(database, condition, field, value): - """Updates the database by writing a specific value to a given field, given certain conditions""" - database.loc[condition, field] = value - return database + # No match found: append a new result + old_results.append(new_result) + return old_results diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index 357c3a3a..00405908 100644 --- a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -5,54 +5,176 @@ # Author(s): # Cedric Nugteren <www.cedricnugteren.nl> -import pandas as pd + import clblast +import bests -def set_default_device(database_entry): +def set_default_device(section): """Sets the device name and parameters to some default values""" - database_entry["device"] = clblast.DEVICE_NAME_DEFAULT - database_entry["device_compute_units"] = 0 - database_entry["device_core_clock"] = 0 - return database_entry - - -def set_default_time(database_entry): - """Sets the execution time to some default value""" - database_entry["time"] = 0.0 - return database_entry - - -def calculate_defaults(df): - """# Sets defaults for devices of the same type/vendor based on the smallest values of all known entries. The average - might be better for performance but some parameters might not be supported on other devices.""" - database_defaults = pd.DataFrame() - - # Defaults per combination of device vendors and device types (e.g. AMD GPU) - database_type_vendor = df.groupby(clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"] + - clblast.ARGUMENT_ATTRIBUTES) - for group_name, database_group in database_type_vendor: - default_values = database_group.min(axis=0) - default_values = set_default_device(default_values) - default_values = set_default_time(default_values) - database_defaults = database_defaults.append(default_values, ignore_index=True) - - # Checks for mis-matched arguments - groups = database_defaults.groupby(clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"]) - for group_name, database_group in groups: - if len(database_group) != 1: - description = database_group["kernel"].min() + " " + database_group["device_vendor"].min() - print("[WARNING] Entries for a single kernel with multiple argument values: " + description) - - # Defaults over all device types and vendors - groups = df.groupby(clblast.KERNEL_ATTRIBUTES + ["kernel"] + clblast.ARGUMENT_ATTRIBUTES) - for group_name, database_group in groups: - default_values = database_group.min(axis=0) - default_values["device_vendor"] = clblast.VENDOR_DEFAULT - default_values["device_type"] = clblast.DEVICE_TYPE_DEFAULT - default_values = set_default_device(default_values) - default_values = set_default_time(default_values) - database_defaults = database_defaults.append(default_values, ignore_index=True) + section["device"] = clblast.DEVICE_NAME_DEFAULT + section["device_compute_units"] = 0 + section["device_core_clock"] = 0 + return section + + +def set_identifiers(database, group_by_attributes, identifier_name): + """Sets a group-identifier based on a given set of attributes. Modifies the database but also returns a list of + unique identifiers.""" + identifiers = [] + for section in database["sections"]: + identifier = [] + for attribute in group_by_attributes: + if attribute in section: + identifier.append(section[attribute]) + section[identifier_name] = ";".join(identifier) + identifiers.append(section[identifier_name]) + return sorted(set(identifiers)) + + +def remove_identifiers(database, identifier_name): + """Removes an identifier from all sections in the database""" + for section in database["sections"]: + section.pop(identifier_name, None) + + +def get_groups_by_identifier(database, group_identifiers, identifier_name): + """Returns a list of (group, group_identifier) tuples based a previously made grouping""" + groups = [] + for group_identifier in group_identifiers: + + # Get all sections in this group + group = [] + for section in database["sections"]: + if section[identifier_name] == group_identifier: + group.append(section) + + groups.append((group, group_identifier)) + return groups + + +def calculate_defaults(database, verbose): + """Sets defaults for devices of the same type/vendor""" + + # Groups the database by kernel, vendor and device type (e.g. AMD GPU) + group_identifiers = set_identifiers(database, clblast.GROUP_ATTRIBUTES, "group_identifier") + groups = get_groups_by_identifier(database, group_identifiers, "group_identifier") + + # Loops over all groups + default_sections = {"sections": []} + for group, group_identifier in groups: + + # Computes the best parameters + default_parameters = get_common_best_parameters(group, group_identifier, verbose) + + # Stores all the section's data + assert len(group) > 0 + default_section = {} + for attribute in group[0].keys(): + if attribute != "results" and attribute != "group_identifier": + default_section[attribute] = group[0][attribute] + default_section = set_default_device(default_section) + default_section["results"] = [{"time": 0.0, "parameters": default_parameters}] + default_sections["sections"].append(default_section) + + # Groups the database by kernel, vendor and device type (e.g. AMD GPU) - but not by arguments! This is to check for + # mis-matched arguments. + attributes = clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"] + group_identifiers = set_identifiers(default_sections, attributes, "temp_identifier") + groups = get_groups_by_identifier(default_sections, group_identifiers, "temp_identifier") + for group, group_identifier in groups: + if len(group) != 1: + print("[ERROR] Entries for a single kernel with multiple argument values: " + str(group_identifier)) + assert len(group) == 1 + remove_identifiers(default_sections, "temp_identifier") + + # Groups the database by kernel only + group_identifiers = set_identifiers(database, clblast.KERNEL_ATTRIBUTES + ["kernel"], "group_identifier") + groups = get_groups_by_identifier(database, group_identifiers, "group_identifier") + + # Loops over all groups + for group, group_identifier in groups: + + # Computes the best parameters + default_parameters = get_common_best_parameters(group, group_identifier, verbose) + + # Stores all the section's data + assert len(group) > 0 + default_section = {} + for attribute in group[0].keys(): + if attribute != "results" and attribute != "group_identifier": + default_section[attribute] = group[0][attribute] + default_section = set_default_device(default_section) + default_section["device_vendor"] = clblast.VENDOR_DEFAULT + default_section["device_type"] = clblast.DEVICE_TYPE_DEFAULT + default_section["results"] = [{"time": 0.0, "parameters": default_parameters}] + default_sections["sections"].append(default_section) # Database with both types of defaults only - return database_defaults + return default_sections + + +def get_smallest_best_parameters(group): + """Sets defaults based on the smallest values of all known entries. The average might be better for performance but + some parameters might not be supported on other devices.""" + + # Counts the number of devices in this group + assert len(group) > 0 + + # Find the smallest values of the parameters + min_parameters = {} + for section in group: + assert len(section["results"]) > 0 + minimum_time = min([result["time"] for result in section["results"]]) + for result in section["results"]: + if result["time"] == minimum_time: + for parameter in result["parameters"]: + if parameter in min_parameters: + min_parameters[parameter] = min(min_parameters[parameter], result["parameters"][parameter]) + else: + min_parameters[parameter] = result["parameters"][parameter] + + return min_parameters + + +def get_common_best_parameters(group, group_identifier, verbose): + """Sets defaults based on the best values of entries supported by all devices. This might cause a problem in case + not every device was tuned with the same parameters. In that case it falls back to the above method to retrieve + the smallest best execution time""" + + # Counts the number of devices in this group + num_devices = len(group) + assert num_devices > 0 + + # Inserts the relative execution times into the database + for section in group: + assert len(section["results"]) > 0 + minimum_time = min([result["time"] for result in section["results"]]) + for result in section["results"]: + result["relative_performance"] = minimum_time / result["time"] + + # Determine which parameters are available for all devices + common_parameters = [result["parameters"] for result in group[0]["results"]] # Parameters of the first section + for i in range(1, num_devices): + section_parameters = [result["parameters"] for result in group[i]["results"]] + common_parameters = [p for p in section_parameters if p in common_parameters] # Intersection of the parameters + + # Fall back to another method in case there are no shared entries at all across devices + if len(common_parameters) == 0: + if verbose: + print("[database] No common kernels for: " + str(group_identifier) + " with devices: %d " % num_devices) + smallest_best_parameters = get_smallest_best_parameters(group) + if verbose: + print("[database] " + str(group_identifier)) + return smallest_best_parameters + + # Removes entries with parameters which are not common + common_results = [] + for section in group: + for result in section["results"]: + if result["parameters"] in common_parameters: + common_results.append(result) + + # Retrieves the entries with the highest relative performance + relative_best_parameters = bests.get_relative_bests(group_identifier, common_results, common_parameters, verbose) + return relative_best_parameters diff --git a/scripts/database/database/io.py b/scripts/database/database/io.py index ad2f7ae9..d14f1297 100644 --- a/scripts/database/database/io.py +++ b/scripts/database/database/io.py @@ -13,46 +13,48 @@ try: except ImportError: from urllib2 import urlopen # Python 2 -import pandas as pd - -import clblast - def download_database(filename, database_url): """Downloads a database and saves it to disk""" print("[database] Downloading database from '" + database_url + "'...") database = urlopen(database_url) - with open(filename, 'wb') as f: + with open(filename, "wb") as f: f.write(database.read()) def load_database(filename): """Loads a database from disk""" print("[database] Loading database from '" + filename + "'") - return pd.read_pickle(filename) + with open(filename) as f: + return json.load(f) def save_database(database, filename): """Saves a database to disk""" print("[database] Saving database to '" + filename + "'") - database.to_pickle(filename) + with open(filename, "wb") as f: + json.dump(database, f, sort_keys=True, indent=4) -def load_json_to_pandas(filename): - """Loads JSON data from file and converts it to a pandas database""" +def load_tuning_results(filename): + """Loads JSON data from file and pre-processes it""" with open(filename) as f: json_data = json.load(f) - # Gathers all results and stores them in a new database - json_database = pd.DataFrame(json_data) - new_database = pd.io.json.json_normalize(json_database["results"]) - - # Sets the common attributes to each entry in the results - for attribute in clblast.ATTRIBUTES: - if attribute == "kernel_family": - new_database[attribute] = re.sub(r'_\d+', '', json_data[attribute]) - elif attribute in json_data: - new_database[attribute] = json_data[attribute] - else: - new_database[attribute] = 0 # For example a parameters that was not used by this kernel - return new_database + # Removes the numbering following the kernel family name + json_data["kernel_family"] = re.sub(r'_\d+', '', json_data["kernel_family"]) + + # Adds the kernel name to the section instead of to the individual results + assert len(json_data["results"]) > 0 + json_data["kernel"] = json_data["results"][0]["kernel"] + for result in json_data["results"]: + assert json_data["kernel"] == result["kernel"] + result.pop("kernel", None) + + # Removes the 'PRECISION' parameter from the individual results: it is redundant + for result in json_data["results"]: + assert json_data["precision"] == str(result["parameters"]["PRECISION"]) + result["parameters"].pop("PRECISION", None) + + # All done + return json_data diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py deleted file mode 100644 index 5bff95d1..00000000 --- a/scripts/generator/datatype.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. -# -# Author(s): -# Cedric Nugteren <www.cedricnugteren.nl> -# -# This file contains the 'DataType' class, used in the generator script to generate the CLBlast API -# interface and implementation. -# -# ================================================================================================== - -# Short-hands for data-types -HLF = "half" -FLT = "float" -DBL = "double" -FLT2 = "float2" -DBL2 = "double2" - -HCL = "cl_half" -F2CL = "cl_float2" -D2CL = "cl_double2" - -# Structure holding data-type and precision information -class DataType(): - def __init__(self, precision_name, name, template, scalars, buffertype): - self.precision_name = precision_name - self.name = name - self.template = template - self.alpha_cpp = scalars[0] - self.beta_cpp = scalars[1] - self.alpha_cl = scalars[2] - self.beta_cl = scalars[3] - self.buffertype = buffertype - - # Outputs the name of the data-type (alpha/beta), possibly transforming into the right type - def UseAlpha(self): - if self.alpha_cpp in [FLT2, DBL2]: - return self.alpha_cpp+"{alpha.s[0], alpha.s[1]}" - return "alpha" - def UseBeta(self): - if self.beta_cpp in [FLT2, DBL2]: - return self.beta_cpp+"{beta.s[0], beta.s[1]}" - return "beta" - - # As above, but the transformation is in the opposite direction - def UseAlphaCL(self): - if self.alpha_cpp in [FLT2, DBL2]: - return self.alpha_cl+"{{alpha.real(), alpha.imag()}}" - return "alpha" - def UseBetaCL(self): - if self.beta_cpp in [FLT2, DBL2]: - return self.beta_cl+"{{beta.real(), beta.imag()}}" - return "beta" - - # Returns the template as used in the correctness/performance tests - def TestTemplate(self): - if self.buffertype != self.beta_cpp: - return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp - return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp - - # Current scalar is complex - def IsComplex(self, scalar): - return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or - (scalar == "beta" and self.beta_cpp in [FLT2, DBL2])) - - -# ================================================================================================== diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 6aa6fc18..d82b13a6 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -1,14 +1,13 @@ #!/usr/bin/env python -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. # # Author(s): # Cedric Nugteren <www.cedricnugteren.nl> # -# This script automatically generates the bodies of the following files, creating the full CLBlast -# API interface and implementation (C, C++, and reference BLAS wrappers): +# This script automatically generates the bodies of the following files, creating the full CLBlast API interface and +# implementation (C, C++, and reference BLAS wrappers): # clblast.h # clblast.cpp # clblast_c.h @@ -19,45 +18,20 @@ # test/correctness/routines/levelX/xYYYY.cpp # test/performance/routines/levelX/xYYYY.cpp # It also produces the API documentation found in doc/clblast.md -# -# ================================================================================================== -# System modules + import sys import os.path +import argparse -# Local files -from routine import Routine -from datatype import DataType, HLF, FLT, DBL, FLT2, DBL2, HCL, F2CL, D2CL +import generator.cpp as cpp +import generator.doc as doc +from generator.routine import Routine +from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU -# ================================================================================================== -# Regular data-types -H = DataType("H", "H", HLF, [HLF, HLF, HCL, HCL], HLF ) # half (16) -S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32) -D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64) -C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232) -Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464) - -# Special cases -Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output -Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output -iH = DataType("H", "iH", HLF, [HLF, HLF, HLF, HLF], HLF ) # As H, but with integer output -iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output -iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output -iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output -iZ = DataType("Z", "iZ", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # As Z, but with integer output -Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S -Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D -Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S -Zzd = DataType("Z", "Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D - -# C++ template data-types -T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine -Tc = DataType("Tc", "typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>") # for herk -TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k - -# ================================================================================================== +HEADER_LINES = [96, 73, 97, 22, 29, 41] +FOOTER_LINES = [17, 75, 19, 14, 6, 6] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." @@ -77,472 +51,162 @@ cld_n = "The value of `c_ld` must be at least `n`." # ================================================================================================== # Populates a list of routines -routines = [ -[ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), +ROUTINES = [ +[ # Level 1: vector-vector + Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], -[ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), +[ # Level 2: matrix-vector + Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), ], -[ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), +[ # Level 3: matrix-matrix + Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), ], -[ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), +[ # Level X: extra routines (not part of BLAS) + Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), ]] -# ================================================================================================== -# Translates an option name to a CLBlast data-type -def PrecisionToFullName(x): - return { - 'H': "Half", - 'S': "Single", - 'D': "Double", - 'C': "ComplexSingle", - 'Z': "ComplexDouble", - }[x] - -# ================================================================================================== - -# Separators for the BLAS levels -separators = [""" -// ================================================================================================= -// BLAS level-1 (vector-vector) routines -// =================================================================================================""", -""" -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// =================================================================================================""", -""" -// ================================================================================================= -// BLAS level-3 (matrix-matrix) routines -// =================================================================================================""", -""" -// ================================================================================================= -// Extra non-BLAS routines (level-X) -// ================================================================================================="""] - -# Names of the level sub-folders -levelnames = ["1", "2", "3", "x"] - -# Main header/footer for source files -header = """ -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// ================================================================================================= -""" -footer = """ -// ================================================================================================= -""" - -# ================================================================================================== - -# The C++ API header (.h) -def clblast_h(routines): - result = "" - for routine in routines: - result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" - result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n" - return result - -# The C++ API implementation (.cpp) -def clblast_cc(routines): - result = "" - for routine in routines: - indent1 = " "*(20 + routine.Length()) - result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" - if routine.implemented: - result += routine.RoutineHeaderCPP(12, "")+" {\n" - result += " auto queue_cpp = Queue(*queue);\n" - result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event);\n" - result += " auto status = routine.SetUp();\n" - result += " if (status != StatusCode::kSuccess) { return status; }\n" - result += " return routine.Do"+routine.name.capitalize()+"(" - result += (",\n"+indent1).join([a for a in routine.ArgumentsCladuc(routine.template, indent1)]) - result += ");\n" - else: - result += routine.RoutineHeaderTypeCPP(12)+" {\n" - result += " return StatusCode::kNotImplemented;\n" - result += "}\n" - for flavour in routine.flavours: - indent2 = " "*(34 + routine.Length() + len(flavour.template)) - result += "template StatusCode PUBLIC_API "+routine.name.capitalize()+"<"+flavour.template+">(" - result += (",\n"+indent2).join([a for a in routine.ArgumentsType(flavour)]) - result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n" - return result - -# ================================================================================================== - -# The C API header (.h) -def clblast_c_h(routines): - result = "" - for routine in routines: - result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" - for flavour in routine.flavours: - result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n" - return result - -# The C API implementation (.cpp) -def clblast_c_cc(routines): - result = "" - for routine in routines: - result += "\n// "+routine.name.upper()+"\n" - for flavour in routine.flavours: - template = "<"+flavour.template+">" if routine.NoScalars() else "" - indent = " "*(26 + routine.Length() + len(template)) - result += routine.RoutineHeaderC(flavour, 20, "")+" {\n" - result += " auto status = clblast::"+routine.name.capitalize()+template+"(" - result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)]) - result += ",\n"+indent+"queue, event);" - result += "\n return static_cast<StatusCode>(status);\n}\n" - return result - -# ================================================================================================== - -# The wrapper to the reference clBLAS routines (for performance/correctness testing) -def wrapper_clblas(routines): - result = "" - for routine in routines: - if routine.has_tests: - result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNamesTested()) - if routine.NoScalars(): - result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n" - for flavour in routine.flavours: - result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n" - - # There is a version available in clBLAS - if flavour.precision_name in ["S","D","C","Z"]: - indent = " "*(17 + routine.Length()) - arguments = routine.ArgumentsWrapperCL(flavour) - if routine.scratch: - result += " auto queue = Queue(queues[0]);\n" - result += " auto context = queue.GetContext();\n" - result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n" - arguments += ["scratch_buffer()"] - result += " return clblas"+flavour.name+routine.name+"(" - result += (",\n"+indent).join([a for a in arguments]) - result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" - - # There is no clBLAS available, forward the call to one of the available functions - else: # Half-precision - indent = " "*(24 + routine.Length()) - - # Convert to float (note: also integer buffers are stored as half/float) - for buf in routine.inputs + routine.outputs: - result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer, queues[0]);\n" - - # Call the float routine - result += " auto status = clblasX"+routine.name+"(" - result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()]) - result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" - result += "\n" - - # Convert back to half - for buf in routine.outputs: - result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis, queues[0]);\n" - result += " return status;" - - # Complete - result += "\n}\n" - return result - -# The wrapper to the reference CBLAS routines (for performance/correctness testing) -def wrapper_cblas(routines): - result = "" - for routine in routines: - if routine.has_tests: - result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNamesTested()) - for flavour in routine.flavours: - result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n" - - # There is a version available in CBLAS - if flavour.precision_name in ["S","D","C","Z"]: - indent = " "*(10 + routine.Length()) - arguments = routine.ArgumentsWrapperC(flavour) - # Complex scalars - for scalar in routine.scalars: - if flavour.IsComplex(scalar): - result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n" - - # Special case for scalar outputs - assignment = "" - postfix = "" - endofline = "" - extra_argument = "" - for output_buffer in routine.outputs: - if output_buffer in routine.ScalarBuffersFirst(): - if flavour in [C,Z]: - postfix += "_sub" - indent += " " - extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])" - elif output_buffer in routine.IndexBuffers(): - assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = " - indent += " "*len(assignment) - else: - assignment = output_buffer+"_buffer["+output_buffer+"_offset]" - if (flavour.name in ["Sc","Dz"]): - assignment = assignment+".real(" - endofline += ")" - else: - assignment = assignment+" = " - indent += " "*len(assignment) - - result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" - result += (",\n"+indent).join([a for a in arguments]) - result += extra_argument+endofline+");\n" - - # There is no CBLAS available, forward the call to one of the available functions - else: # Half-precision - indent = " "*(9 + routine.Length()) - - # Convert to float (note: also integer buffers are stored as half/float) - for buf in routine.inputs + routine.outputs: - result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer);\n" - - # Call the float routine - result += " cblasX"+routine.name+"(" - result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()]) - result += ");\n" - - # Convert back to half - for buf in routine.outputs: - result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis);\n" - - # Complete - result += "}\n" - return result - -# ================================================================================================== - -# Checks for the number of command-line arguments -if len(sys.argv) != 2: - print "[ERROR] Usage: generator.py <root_of_clblast>" - sys.exit() - -# Parses the command-line arguments -path_clblast = sys.argv[1] -files = [ - path_clblast+"/include/clblast.h", - path_clblast+"/src/clblast.cpp", - path_clblast+"/include/clblast_c.h", - path_clblast+"/src/clblast_c.cpp", - path_clblast+"/test/wrapper_clblas.hpp", - path_clblast+"/test/wrapper_cblas.hpp", -] -header_lines = [96, 73, 97, 22, 29, 41] -footer_lines = [17, 75, 19, 14, 6, 6] - -# Checks whether the command-line arguments are valid; exists otherwise -for f in files: - if not os.path.isfile(f): - print "[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library" - sys.exit() - -# ================================================================================================== - -# Iterates over all files to output -for i in xrange(0,len(files)): - - # Stores the header and the footer of the original file - with open(files[i]) as f: - original = f.readlines() - file_header = original[:header_lines[i]] - file_footer = original[-footer_lines[i]:] - - # Re-writes the body of the file - with open(files[i], "w") as f: - body = "" - levels = [1,2,3] if (i == 4 or i == 5) else [1,2,3,4] - for level in levels: - body += separators[level-1]+"\n" - if i == 0: - body += clblast_h(routines[level-1]) - if i == 1: - body += clblast_cc(routines[level-1]) - if i == 2: - body += clblast_c_h(routines[level-1]) - if i == 3: - body += clblast_c_cc(routines[level-1]) - if i == 4: - body += wrapper_clblas(routines[level-1]) - if i == 5: - body += wrapper_cblas(routines[level-1]) - f.write("".join(file_header)) - f.write(body) - f.write("".join(file_footer)) - -# ================================================================================================== - -# Outputs all the correctness-test implementations -for level in [1,2,3,4]: - for routine in routines[level-1]: - if routine.has_tests: - filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp" - with open(filename, "w") as f: - body = "" - body += "#include \"test/correctness/testblas.hpp\"\n" - body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n" - body += "// Shortcuts to the clblast namespace\n" - body += "using float2 = clblast::float2;\n" - body += "using double2 = clblast::double2;\n\n" - body += "// Main function (not within the clblast namespace)\n" - body += "int main(int argc, char *argv[]) {\n" - body += " auto errors = size_t{0};\n" - not_first = "false" - for flavour in routine.flavours: - body += " errors += clblast::RunTests<clblast::TestX"+routine.name+flavour.TestTemplate() - body += ">(argc, argv, "+not_first+", \""+flavour.name+routine.name.upper()+"\");\n" - not_first = "true" - body += " if (errors > 0) { return 1; } else { return 0; }\n" - body += "}\n" - f.write(header+"\n") - f.write(body) - f.write(footer) - -# Outputs all the performance-test implementations -for level in [1,2,3,4]: - for routine in routines[level-1]: - if routine.has_tests: - filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp" - with open(filename, "w") as f: - body = "" - body += "#include \"test/performance/client.hpp\"\n" - body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n" - body += "// Shortcuts to the clblast namespace\n" - body += "using float2 = clblast::float2;\n" - body += "using double2 = clblast::double2;\n\n" - body += "// Main function (not within the clblast namespace)\n" - body += "int main(int argc, char *argv[]) {\n" - default = PrecisionToFullName(routine.flavours[0].precision_name) - body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n" - for precision in ["H","S","D","C","Z"]: - body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":" - found = False - for flavour in routine.flavours: - if flavour.precision_name == precision: - body += "\n clblast::RunClient<clblast::TestX"+routine.name+flavour.TestTemplate() - body += ">(argc, argv); break;\n" - found = True - if not found: - body += " throw std::runtime_error(\"Unsupported precision mode\");\n" - body += " }\n" - body += " return 0;\n" - body += "}\n" - f.write(header+"\n") - f.write(body) - f.write(footer) - -# ================================================================================================== - -# Outputs the API documentation -filename = path_clblast+"/doc/clblast.md" -with open(filename, "w") as f: - - # Outputs the header - f.write("CLBlast: API reference\n") - f.write("================\n") - f.write("\n\n") - - # Loops over the routines - for level in [1,2,3,4]: - for routine in routines[level-1]: - if routine.implemented: - - # Routine header - f.write("x"+routine.name.upper()+": "+routine.description+"\n") - f.write("-------------\n") - f.write("\n") - f.write(routine.details+"\n") - f.write("\n") - - # Routine API - f.write("C++ API:\n") - f.write("```\n") - f.write(routine.RoutineHeaderCPP(12, "")+"\n") - f.write("```\n") - f.write("\n") - f.write("C API:\n") - f.write("```\n") - for flavour in routine.flavours: - f.write(routine.RoutineHeaderC(flavour, 20, "")+"\n") - f.write("```\n") - f.write("\n") - - # Routine arguments - f.write("Arguments to "+routine.name.upper()+":\n") - f.write("\n") - for argument in routine.ArgumentsDoc(): - f.write("* "+argument+"\n") - f.write("* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.\n") - f.write("* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.\n") - f.write("\n") - - # Routine requirements - if len(routine.RequirementsDoc()) > 0: - f.write("Requirements for "+routine.name.upper()+":\n") - f.write("\n") - for requirement in routine.RequirementsDoc(): - f.write("* "+requirement+"\n") - f.write("\n") - - # Routine footer - f.write("\n\n") - - -# ================================================================================================== +def main(argv): + + # Parses the command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument("clblast_root", help="Root of the CLBlast sources") + parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script") + cl_args = parser.parse_args(argv) + library_root = cl_args.clblast_root + + # Sets all the files the output + files = [ + library_root + "/include/clblast.h", + library_root + "/src/clblast.cpp", + library_root + "/include/clblast_c.h", + library_root + "/src/clblast_c.cpp", + library_root + "/test/wrapper_clblas.hpp", + library_root + "/test/wrapper_cblas.hpp", + ] + + # Checks whether the command-line arguments are valid; exists otherwise + for f in files: + if not os.path.isfile(f): + print("[ERROR] The path '" + library_root + "' does not point to the root of the CLBlast library") + sys.exit() + + # Iterates over all regular files to output + for i in range(0, len(files)): + + # Stores the header and the footer of the original file + with open(files[i]) as f: + original = f.readlines() + file_header = original[:HEADER_LINES[i]] + file_footer = original[-FOOTER_LINES[i]:] + + # Re-writes the body of the file + with open(files[i], "w") as f: + body = "" + levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4] + for level in levels: + body += cpp.LEVEL_SEPARATORS[level - 1] + "\n" + for routine in ROUTINES[level - 1]: + if i == 0: + body += cpp.clblast_h(routine) + if i == 1: + body += cpp.clblast_cc(routine) + if i == 2: + body += cpp.clblast_c_h(routine) + if i == 3: + body += cpp.clblast_c_cc(routine) + if i == 4: + body += cpp.wrapper_clblas(routine) + if i == 5: + body += cpp.wrapper_cblas(routine) + f.write("".join(file_header)) + f.write(body) + f.write("".join(file_footer)) + + # Outputs all the test implementations + for level in [1, 2, 3, 4]: + for routine in ROUTINES[level - 1]: + if routine.has_tests: + level_string = cpp.LEVEL_NAMES[level - 1] + routine_suffix = "level" + level_string + "/x" + routine.name + ".cpp" + + # Correctness tests + filename = library_root + "/test/correctness/routines/" + routine_suffix + with open(filename, "w") as f: + f.write(cpp.HEADER + "\n") + f.write(cpp.correctness_test(routine, level_string)) + f.write(cpp.FOOTER) + + # Performance tests + filename = library_root + "/test/performance/routines/" + routine_suffix + with open(filename, "w") as f: + f.write(cpp.HEADER + "\n") + f.write(cpp.performance_test(routine, level_string)) + f.write(cpp.FOOTER) + + # Outputs the API documentation + filename = cl_args.clblast_root + "/doc/clblast.md" + with open(filename, "w") as f: + + # Outputs the header + doc_header = doc.header() + f.write(doc_header) + + # Generates the documentation for each routine + for level in [1, 2, 3, 4]: + for routine in ROUTINES[level - 1]: + if routine.implemented: + doc_routine = doc.generate(routine) + f.write(doc_routine) + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/scripts/generator/generator/__init__.py b/scripts/generator/generator/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/scripts/generator/generator/__init__.py diff --git a/scripts/generator/generator/convert.py b/scripts/generator/generator/convert.py new file mode 100644 index 00000000..c0309ec3 --- /dev/null +++ b/scripts/generator/generator/convert.py @@ -0,0 +1,69 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren <www.cedricnugteren.nl> + + +def precision_to_full_name(x): + """Translates an option name to a CLBlast data-type""" + return { + 'H': "Half", + 'S': "Single", + 'D': "Double", + 'C': "ComplexSingle", + 'Z': "ComplexDouble", + }[x] + + +def option_to_clblast(x): + """Translates an option name to a CLBlast data-type""" + return { + 'layout': "Layout", + 'a_transpose': "Transpose", + 'b_transpose': "Transpose", + 'ab_transpose': "Transpose", + 'side': "Side", + 'triangle': "Triangle", + 'diagonal': "Diagonal", + }[x] + + +def option_to_clblas(x): + """As above, but for clBLAS data-types""" + return { + 'layout': "clblasOrder", + 'a_transpose': "clblasTranspose", + 'b_transpose': "clblasTranspose", + 'ab_transpose': "clblasTranspose", + 'side': "clblasSide", + 'triangle': "clblasUplo", + 'diagonal': "clblasDiag", + }[x] + + +def option_to_cblas(x): + """As above, but for CBLAS data-types""" + return { + 'layout': "CBLAS_ORDER", + 'a_transpose': "CBLAS_TRANSPOSE", + 'b_transpose': "CBLAS_TRANSPOSE", + 'ab_transpose': "CBLAS_TRANSPOSE", + 'side': "CBLAS_SIDE", + 'triangle': "CBLAS_UPLO", + 'diagonal': "CBLAS_DIAG", + }[x] + + +def option_to_documentation(x): + """Translates an option name to a documentation string""" + return { + 'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.", + 'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'side': "The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).", + 'triangle': "The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).", + 'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.", + }[x] diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py new file mode 100644 index 00000000..427eb180 --- /dev/null +++ b/scripts/generator/generator/cpp.py @@ -0,0 +1,257 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren <www.cedricnugteren.nl> + +import generator.datatype as datatype +import generator.convert as convert + + +NL = "\n" +SEPARATOR = "// =================================================================================================" + +# Separators for the BLAS levels +LEVEL_SEPARATORS = [ + NL + SEPARATOR + NL + "// BLAS level-1 (vector-vector) routines" + NL + SEPARATOR, + NL + SEPARATOR + NL + "// BLAS level-2 (matrix-vector) routines" + NL + SEPARATOR, + NL + SEPARATOR + NL + "// BLAS level-3 (matrix-matrix) routines" + NL + SEPARATOR, + NL + SEPARATOR + NL + "// Extra non-BLAS routines (level-X)" + NL + SEPARATOR +] + +# Names of the level sub-folders +LEVEL_NAMES = ["1", "2", "3", "x"] + +# Main header/footer for source files +FOOTER = NL + SEPARATOR + NL +HEADER = NL + SEPARATOR + """ +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +""" + SEPARATOR + NL + + +def clblast_h(routine): + """The C++ API header (.h)""" + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + result += routine.routine_header_cpp(12, " = nullptr") + ";" + NL + return result + + +def clblast_cc(routine): + """The C++ API implementation (.cpp)""" + indent1 = " " * (20 + routine.length()) + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + if routine.implemented: + result += routine.routine_header_cpp(12, "") + " {" + NL + result += " auto queue_cpp = Queue(*queue);" + NL + result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL + result += " auto status = routine.SetUp();" + NL + result += " if (status != StatusCode::kSuccess) { return status; }" + NL + result += " return routine.Do" + routine.name.capitalize() + "(" + result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()]) + result += ");" + NL + else: + result += routine.routine_header_type_cpp(12) + " {" + NL + result += " return StatusCode::kNotImplemented;" + NL + result += "}" + NL + for flavour in routine.flavours: + indent2 = " " * (34 + routine.length() + len(flavour.template)) + result += "template StatusCode PUBLIC_API " + routine.name.capitalize() + "<" + flavour.template + ">(" + result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)]) + result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL + return result + + +def clblast_c_h(routine): + """The C API header (.h)""" + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + for flavour in routine.flavours: + result += routine.routine_header_c(flavour, 31, " PUBLIC_API") + ";" + NL + return result + + +def clblast_c_cc(routine): + """The C API implementation (.cpp)""" + result = NL + "// " + routine.name.upper() + NL + for flavour in routine.flavours: + template = "<" + flavour.template + ">" if routine.no_scalars() else "" + indent = " " * (26 + routine.length() + len(template)) + result += routine.routine_header_c(flavour, 20, "") + " {" + NL + result += " auto status = clblast::" + routine.name.capitalize() + template + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) + result += "," + NL + indent + "queue, event);" + result += NL + " return static_cast<StatusCode>(status);" + NL + "}" + NL + return result + + +def wrapper_clblas(routine): + """The wrapper to the reference clBLAS routines (for performance/correctness testing)""" + result = "" + if routine.has_tests: + result += NL + "// Forwards the clBLAS calls for %s" % routine.short_names_tested() + NL + if routine.no_scalars(): + result += routine.routine_header_wrapper_clblas(routine.template, True, 21) + ";" + NL + for flavour in routine.flavours: + result += routine.routine_header_wrapper_clblas(flavour, False, 21) + " {" + NL + + # There is a version available in clBLAS + if flavour.precision_name in ["S", "D", "C", "Z"]: + indent = " " * (17 + routine.length()) + arguments = routine.arguments_wrapper_clblas(flavour) + if routine.scratch: + result += " auto queue = Queue(queues[0]);" + NL + result += " auto context = queue.GetContext();" + NL + result += " auto scratch_buffer = Buffer<" + flavour.template + ">" + result += "(context, " + routine.scratch + ");" + NL + arguments += ["scratch_buffer()"] + result += " return clblas" + flavour.name + routine.name + "(" + result += ("," + NL + indent).join([a for a in arguments]) + result += "," + NL + indent + "num_queues, queues, num_wait_events, wait_events, events);" + + # There is no clBLAS available, forward the call to one of the available functions + else: # Half-precision + indent = " " * (24 + routine.length()) + + # Convert to float (note: also integer buffers are stored as half/float) + for buf in routine.inputs + routine.outputs: + result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL + + # Call the float routine + result += " auto status = clblasX" + routine.name + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_half()]) + result += "," + NL + indent + "num_queues, queues, num_wait_events, wait_events, events);" + result += NL + + # Convert back to half + for buf in routine.outputs: + result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL + result += " return status;" + + # Complete + result += NL + "}" + NL + return result + + +def wrapper_cblas(routine): + """The wrapper to the reference CBLAS routines (for performance/correctness testing)""" + result = "" + if routine.has_tests: + result += NL + "// Forwards the Netlib BLAS calls for %s" % routine.short_names_tested() + NL + for flavour in routine.flavours: + result += routine.routine_header_wrapper_cblas(flavour, 12) + " {" + NL + + # There is a version available in CBLAS + if flavour.precision_name in ["S", "D", "C", "Z"]: + indent = " " * (10 + routine.length()) + arguments = routine.arguments_wrapper_cblas(flavour) + + # Complex scalars + for scalar in routine.scalars: + if flavour.is_complex(scalar): + result += " const auto " + scalar + "_array = std::vector<" + flavour.buffer_type[:-1] + ">" + result += "{" + scalar + ".real(), " + scalar + ".imag()};" + NL + + # Special case for scalar outputs + assignment = "" + postfix = "" + end_of_line = "" + extra_argument = "" + for output_buffer in routine.outputs: + if output_buffer in routine.scalar_buffers_first(): + if flavour in [datatype.C, datatype.Z]: + postfix += "_sub" + indent += " " + extra_argument += "," + NL + indent + extra_argument += "reinterpret_cast<return_pointer_" + flavour.buffer_type[:-1] + ">" + extra_argument += "(&" + output_buffer + "_buffer[" + output_buffer + "_offset])" + elif output_buffer in routine.index_buffers(): + assignment = "((int*)&" + output_buffer + "_buffer[0])[" + output_buffer + "_offset] = " + indent += " " * len(assignment) + else: + assignment = output_buffer + "_buffer[" + output_buffer + "_offset]" + if flavour.name in ["Sc", "Dz"]: + assignment += ".real(" + end_of_line += ")" + else: + assignment += " = " + indent += " " * len(assignment) + + result += " " + assignment + "cblas_" + flavour.name.lower() + routine.name + postfix + "(" + result += ("," + NL + indent).join([a for a in arguments]) + result += extra_argument + end_of_line + ");" + NL + + # There is no CBLAS available, forward the call to one of the available functions + else: # Half-precision + indent = " " * (9 + routine.length()) + + # Convert to float (note: also integer buffers are stored as half/float) + for buf in routine.inputs + routine.outputs: + result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL + + # Call the float routine + result += " cblasX" + routine.name + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_half()]) + result += ");" + NL + + # Convert back to half + for buf in routine.outputs: + result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL + + # Complete + result += "}" + NL + return result + + +def performance_test(routine, level_string): + """Generates the body of a performance test for a specific routine""" + result = "" + result += "#include \"test/performance/client.hpp\"" + NL + result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL + result += "// Shortcuts to the clblast namespace" + NL + result += "using float2 = clblast::float2;" + NL + result += "using double2 = clblast::double2;" + NL + NL + result += "// Main function (not within the clblast namespace)" + NL + result += "int main(int argc, char *argv[]) {" + NL + default = convert.precision_to_full_name(routine.flavours[0].precision_name) + result += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k" + default + ")) {" + NL + for precision in ["H", "S", "D", "C", "Z"]: + result += " case clblast::Precision::k" + convert.precision_to_full_name(precision) + ":" + found = False + for flavour in routine.flavours: + if flavour.precision_name == precision: + result += NL + " clblast::RunClient<clblast::TestX" + routine.name + flavour.test_template() + result += ">(argc, argv); break;" + NL + found = True + if not found: + result += " throw std::runtime_error(\"Unsupported precision mode\");" + NL + result += " }" + NL + result += " return 0;" + NL + result += "}" + NL + return result + + +def correctness_test(routine, level_string): + """Generates the body of a correctness test for a specific routine""" + result = "" + result += "#include \"test/correctness/testblas.hpp\"" + NL + result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL + result += "// Shortcuts to the clblast namespace" + NL + result += "using float2 = clblast::float2;" + NL + result += "using double2 = clblast::double2;" + NL + NL + result += "// Main function (not within the clblast namespace)" + NL + result += "int main(int argc, char *argv[]) {" + NL + result += " auto errors = size_t{0};" + NL + not_first = "false" + for flavour in routine.flavours: + result += " errors += clblast::RunTests<clblast::TestX" + routine.name + flavour.test_template() + result += ">(argc, argv, " + not_first + ", \"" + flavour.name + routine.name.upper() + "\");" + NL + not_first = "true" + result += " if (errors > 0) { return 1; } else { return 0; }" + NL + result += "}" + NL + return result diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py new file mode 100644 index 00000000..9a6c6c02 --- /dev/null +++ b/scripts/generator/generator/datatype.py @@ -0,0 +1,92 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren <www.cedricnugteren.nl> + + +# Short-hands for data-types +D_HALF = "half" +D_FLOAT = "float" +D_DOUBLE = "double" +D_FLOAT2 = "float2" +D_DOUBLE2 = "double2" +D_HALF_OPENCL = "cl_half" +D_FLOAT2_OPENCL = "cl_float2" +D_DOUBLE2_OPENCL = "cl_double2" + + +class DataType: + """Class holding data-type and precision information""" + + def __init__(self, precision_name, name, template, scalars, buffer_type): + self.precision_name = precision_name + self.name = name + self.template = template + self.alpha_cpp = scalars[0] + self.beta_cpp = scalars[1] + self.alpha_cl = scalars[2] + self.beta_cl = scalars[3] + self.buffer_type = buffer_type + + def use_alpha(self): + """Outputs the name of the data-type (alpha/beta), possibly transforming into the right type""" + if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.alpha_cpp + "{alpha.s[0], alpha.s[1]}" + return "alpha" + + def use_beta(self): + """As above, but for beta instead of alpha""" + if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.beta_cpp + "{beta.s[0], beta.s[1]}" + return "beta" + + def use_alpha_opencl(self): + """As above, but the transformation is in the opposite direction""" + if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.alpha_cl + "{{alpha.real(), alpha.imag()}}" + return "alpha" + + def use_beta_opencl(self): + """As above, but for beta instead of alpha""" + if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.beta_cl + "{{beta.real(), beta.imag()}}" + return "beta" + + def test_template(self): + """Returns the template as used in the correctness/performance tests""" + if self.buffer_type != self.beta_cpp: + return "<" + self.buffer_type + "," + self.beta_cpp + ">, " + self.buffer_type + ", " + self.beta_cpp + return "<" + self.buffer_type + ">, " + self.buffer_type + ", " + self.beta_cpp + + def is_complex(self, scalar): + """Current scalar is complex""" + return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or + (scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2])) + + +# Regular data-types +H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16) +S = DataType("S", "S", D_FLOAT, [D_FLOAT] * 4, D_FLOAT) # single (32) +D = DataType("D", "D", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE) # double (64) +C = DataType("C", "C", D_FLOAT2, [D_FLOAT2] * 2 + [D_FLOAT2_OPENCL] * 2, D_FLOAT2) # single-complex (3232) +Z = DataType("Z", "Z", D_DOUBLE2, [D_DOUBLE2] * 2 + [D_DOUBLE2_OPENCL] * 2, D_DOUBLE2) # double-complex (6464) + +# Special cases +Sc = DataType("C", "Sc", D_FLOAT2, [D_FLOAT2] * 4, D_FLOAT2) # As C, but with real output +Dz = DataType("Z", "Dz", D_DOUBLE2, [D_DOUBLE2] * 4, D_DOUBLE2) # As Z, but with real output +iH = DataType("H", "iH", D_HALF, [D_HALF] * 4, D_HALF) # As H, but with integer output +iS = DataType("S", "iS", D_FLOAT, [D_FLOAT] * 4, D_FLOAT) # As S, but with integer output +iD = DataType("D", "iD", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE) # As D, but with integer output +iC = DataType("C", "iC", D_FLOAT2, [D_FLOAT2] * 2 + [D_FLOAT2_OPENCL] * 2, D_FLOAT2) # As C, but with integer output +iZ = DataType("Z", "iZ", D_DOUBLE2, [D_DOUBLE2] * 2 + [D_DOUBLE2_OPENCL] * 2, D_DOUBLE2) # As Z, but with int output +Css = DataType("C", "C", D_FLOAT, [D_FLOAT, D_FLOAT, D_FLOAT, D_FLOAT], D_FLOAT2) # As C, but with constants from S +Zdd = DataType("Z", "Z", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE2) # As Z, but with constants from D +Ccs = DataType("C", "C", D_FLOAT2 + "," + D_FLOAT, [D_FLOAT2, D_FLOAT, D_FLOAT2_OPENCL, D_FLOAT], D_FLOAT2) # As C, but with one constant from S +Zzd = DataType("Z", "Z", D_DOUBLE2 + "," + D_DOUBLE, [D_DOUBLE2, D_DOUBLE, D_DOUBLE2_OPENCL, D_DOUBLE], D_DOUBLE2) # As Z, but with one constant from D + +# C++ template data-types +T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine +Tc = DataType("Tc", "typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>") # for herk +TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k diff --git a/scripts/generator/generator/doc.py b/scripts/generator/generator/doc.py new file mode 100644 index 00000000..8657ed0d --- /dev/null +++ b/scripts/generator/generator/doc.py @@ -0,0 +1,57 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren <www.cedricnugteren.nl> + +NL = "\n" + + +def header(): + """Generates the header for the API documentation""" + result = "CLBlast: API reference" + NL + result += "================" + NL + NL + NL + return result + + +def generate(routine): + """Generates the API documentation for a given routine""" + result = "" + + # Routine header + result += "x" + routine.name.upper() + ": " + routine.description + NL + result += "-------------" + NL + NL + result += routine.details + NL + NL + + # Routine API + result += "C++ API:" + NL + result += "```" + NL + result += routine.routine_header_cpp(12, "") + NL + result += "```" + NL + NL + result += "C API:" + NL + result += "```" + NL + for flavour in routine.flavours: + result += routine.routine_header_c(flavour, 20, "") + NL + result += "```" + NL + NL + + # Routine arguments + result += "Arguments to " + routine.name.upper() + ":" + NL + NL + for argument in routine.arguments_doc(): + result += "* " + argument + NL + result += "* `cl_command_queue* queue`: " + result += "Pointer to an OpenCL command queue associated with a context and device to execute the routine on." + NL + result += "* `cl_event* event`: " + result += "Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). " + result += "This is an optional argument." + NL + NL + + # Routine requirements + if len(routine.requirements_doc()) > 0: + result += "Requirements for " + routine.name.upper() + ":" + NL + NL + for requirement in routine.requirements_doc(): + result += "* " + requirement + NL + result += NL + + # Routine footer + result += NL + NL + return result diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py new file mode 100644 index 00000000..a4e682c2 --- /dev/null +++ b/scripts/generator/generator/routine.py @@ -0,0 +1,552 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren <www.cedricnugteren.nl> + +from itertools import chain + +import generator.convert as convert + + +class Routine: + """Class holding routine-specific information (e.g. name, which arguments, which precisions)""" + def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, + inputs, outputs, scalars, scratch, description, details, requirements): + self.implemented = implemented + self.has_tests = has_tests + self.level = level + self.name = name + self.template = template + self.flavours = flavours + self.sizes = sizes + self.options = options + self.inputs = inputs + self.outputs = outputs + self.scalars = scalars + self.scratch = scratch # Scratch buffer (e.g. for xDOT) + self.description = description + self.details = details + self.requirements = requirements + + @staticmethod + def scalar_buffers_first(): + """List of scalar buffers""" + return ["dot", "nrm2", "asum", "sum", "imax", "imin"] + + @staticmethod + def scalar_buffers_second(): + """List of scalar buffers""" + return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"] + + @staticmethod + def other_scalars(): + """List of scalars other than alpha and beta""" + return ["cos", "sin"] + + @staticmethod + def index_buffers(): + """List of buffers with unsigned int type""" + return ["imax", "imin"] + + @staticmethod + def postfix(name): + """Retrieves the postfix for a buffer""" + return "inc" if (name in ["x", "y"]) else "ld" + + @staticmethod + def buffers_vector(): + """Distinguish between vectors and matrices""" + return ["x", "y"] + + @staticmethod + def buffers_matrix(): + """Distinguish between vectors and matrices""" + return ["a", "b", "c", "ap"] + + def non_index_inputs(self): + """Lists of input/output buffers not index (integer)""" + buffers = self.inputs[:] # make a copy + for i in self.index_buffers(): + if i in buffers: + buffers.remove(i) + return buffers + + def non_index_outputs(self): + """Lists of input/output buffers not index (integer)""" + buffers = self.outputs[:] # make a copy + for i in self.index_buffers(): + if i in buffers: + buffers.remove(i) + return buffers + + def buffers_without_ld_inc(self): + """List of buffers without 'inc' or 'ld'""" + return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap"] + + def length(self): + """Retrieves the number of characters in the routine's name""" + return len(self.name) + + def no_scalars(self): + """Determines whether or not this routine has scalar arguments (alpha/beta)""" + return self.scalars == [] + + def short_names(self): + """Returns the upper-case names of these routines (all flavours)""" + return "/".join([f.name + self.name.upper() for f in self.flavours]) + + def short_names_tested(self): + """As above, but excludes some""" + names = [f.name + self.name.upper() for f in self.flavours] + if "H" + self.name.upper() in names: + names.remove("H" + self.name.upper()) + return "/".join(names) + + def buffers_first(self): + """Determines which buffers go first (between alpha and beta) and which ones go after""" + if self.level == "2b": + return ["x", "y"] + return ["ap", "a", "b", "x"] + + def buffers_second(self): + if self.level == "2b": + return ["ap", "a", "b", "c"] + return ["y", "c"] + + def buffer(self, name): + """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer"] + b = [name + "_offset"] + c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + + def buffer_bis(self, name): + """As above but with a '_bis' suffix for the buffer name""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer_bis"] + b = [name + "_offset"] + c = [name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_def(self, name): + """As above but with data-types""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + a = [prefix + "cl_mem " + name + "_buffer"] + b = ["const size_t " + name + "_offset"] + c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_def_wrapper_cl(self, name, flavour): + """As above but with data-types""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"] + b = ["const size_t " + name + "_offset"] + c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_def_vector(self, name, flavour): + """As above but as vectors""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"] + b = ["const size_t " + name + "_offset"] + c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_clcudaapi(self, name): + """As above but with CLCudaAPI buffers""" + if name in self.inputs or name in self.outputs: + buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type + a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"] + b = [name + "_offset"] + c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + + def buffer_wrapper_clblas(self, name): + """As above but with a static cast for clBLAS wrapper""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer()"] + b = [name + "_offset"] + c = [] + if name in ["x", "y"]: + c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"] + elif name in ["a", "b", "c"]: + c = [name + "_" + self.postfix(name)] + return [", ".join(a + b + c)] + return [] + + def buffer_wrapper_cblas(self, name, flavour): + """As above but with a static cast for CBLAS wrapper""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + if name == "sy1": + a = [name + "_buffer[" + name + "_offset]"] + elif flavour.precision_name in ["C", "Z"]: + a = ["reinterpret_cast<" + prefix + flavour.buffer_type[:-1] + "*>" + + "(&" + name + "_buffer[" + name + "_offset])"] + else: + a = ["&" + name + "_buffer[" + name + "_offset]"] + c = [] + if name in ["x", "y"]: + c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"] + elif name in ["a", "b", "c"]: + c = [name + "_" + self.postfix(name)] + return [", ".join(a + c)] + return [] + + def buffer_type(self, name): + """As above, but only data-types""" + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + a = [prefix + "cl_mem"] + b = ["const size_t"] + c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + + def buffer_doc(self, name): + """Retrieves the documentation of the buffers""" + prefix = "const " if (name in self.inputs) else "" + inout = "input" if (name in self.inputs) else "output" + if (name in self.inputs) or (name in self.outputs): + math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector" + inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment " + a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."] + b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."] + if name not in self.buffers_without_ld_inc(): + c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " + + inc_ld_description + "of the " + inout + " " + math_name + ". This value must be greater than 0."] + else: + c = [] + return a + b + c + return [] + + def scalar(self, name): + """Retrieves the name of a scalar (alpha/beta)""" + if name in self.scalars: + return [name] + return [] + + def scalar_half_to_float(self, name): + """As above, but converts from float to half""" + if name in self.scalars: + return ["HalfToFloat(" + name + ")"] + return [] + + def scalar_use(self, name, flavour): + """Retrieves the use of a scalar (alpha/beta)""" + if name in self.scalars: + if name == "alpha": + return [flavour.use_alpha()] + elif name == "beta": + return [flavour.use_beta()] + return [name] + return [] + + def scalar_use_wrapper(self, name, flavour): + """As above, but for the clBLAS wrapper""" + if name in self.scalars: + if name == "alpha": + return [flavour.use_alpha_opencl()] + elif name == "beta": + return [flavour.use_beta_opencl()] + return [name] + return [] + + def scalar_use_wrapper_cblas(self, name, flavour): + """As above, but for the CBLAS wrapper""" + if name in self.scalars: + if flavour.is_complex(name): + return [name + "_array.data()"] + return [name] + return [] + + def scalar_def(self, name, flavour): + """Retrieves the definition of a scalar (alpha/beta)""" + if name in self.scalars: + if name == "alpha": + return ["const " + flavour.alpha_cl + " " + name] + return ["const " + flavour.beta_cl + " " + name] + return [] + + def scalar_def_plain(self, name, flavour): + """As above, but without 'cl_' prefix""" + if name in self.scalars: + if name == "alpha": + return ["const " + flavour.alpha_cpp + " " + name] + return ["const " + flavour.beta_cpp + " " + name] + return [] + + def scalar_type(self, name, flavour): + """Retrieves the type of a scalar (alpha/beta)""" + if name in self.scalars: + if name == "alpha": + return ["const " + flavour.alpha_cpp] + return ["const " + flavour.beta_cpp] + return [] + + def scalar_doc(self, name): + """Retrieves the documentation of a scalar""" + if name in self.scalars: + if name == "alpha": + return ["`const " + self.template.alpha_cpp + " " + name + "`: Input scalar constant."] + return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."] + return [] + + def sizes_list(self): + """Retrieves a list of comma-separated sizes (m, n, k)""" + if self.sizes: + return [", ".join([s for s in self.sizes])] + return [] + + def sizes_def(self): + """Retrieves the definition of the sizes (m,n,k)""" + if self.sizes: + return [", ".join(["const size_t " + s for s in self.sizes])] + return [] + + def sizes_type(self): + """Retrieves the types of the sizes (m,n,k)""" + if self.sizes: + return [", ".join(["const size_t" for s in self.sizes])] + return [] + + def sizes_doc(self): + """# Retrieves the documentation of the sizes""" + if self.sizes: + definitions = ["`const size_t " + s + "`: Integer size argument. This value must be positive." for s in self.sizes] + return definitions + return [] + + def options_list(self): + """Retrieves a list of options""" + if self.options: + return [", ".join(self.options)] + return [] + + def options_cast(self, indent): + """As above, but now casted to CLBlast data-types""" + if self.options: + options = ["static_cast<clblast::" + convert.option_to_clblast(o) + ">(" + o + ")" for o in self.options] + return [(",\n" + indent).join(options)] + return [] + + def options_def(self): + """Retrieves the definitions of the options (layout, transpose, side, etc.)""" + if self.options: + definitions = ["const " + convert.option_to_clblast(o) + " " + o for o in self.options] + return [", ".join(definitions)] + return [] + + def options_def_wrapper_clblas(self): + """As above, but now using clBLAS data-types""" + if self.options: + definitions = ["const " + convert.option_to_clblas(o) + " " + o for o in self.options] + return [", ".join(definitions)] + return [] + + def options_def_wrapper_cblas(self): + """As above, but now using CBLAS data-types""" + if self.options: + definitions = ["const " + convert.option_to_cblas(o) + " " + o for o in self.options] + return [", ".join(definitions)] + return [] + + def options_type(self): + """Retrieves the types of the options (layout, transpose, side, etc.)""" + if self.options: + definitions = ["const " + convert.option_to_clblast(o) for o in self.options] + return [", ".join(definitions)] + return [] + + def options_doc(self): + """Retrieves the documentation of the options""" + if self.options: + definitions = ["`const " + convert.option_to_clblast(o) + " " + o + "`: " + convert.option_to_documentation(o) for o in self.options] + return definitions + return [] + + def arguments(self): + """Retrieves a combination of all the argument names (no types)""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_first()])) + + self.scalar("alpha") + + list(chain(*[self.buffer(b) for b in self.buffers_first()])) + + self.scalar("beta") + + list(chain(*[self.buffer(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + + def arguments_half(self): + """As above, but with conversions from half to float""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer_bis(b) for b in self.scalar_buffers_first()])) + + self.scalar_half_to_float("alpha") + + list(chain(*[self.buffer_bis(b) for b in self.buffers_first()])) + + self.scalar_half_to_float("beta") + + list(chain(*[self.buffer_bis(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_bis(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + + def arguments_clcudaapi(self): + """Retrieves a combination of all the argument names, with CLCudaAPI casts""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_first()])) + + self.scalar("alpha") + + list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_first()])) + + self.scalar("beta") + + list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + + def arguments_cast(self, flavour, indent): + """As above, but with CLBlast casts""" + return (self.options_cast(indent) + self.sizes_list() + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_first()])) + + self.scalar_use("alpha", flavour) + + list(chain(*[self.buffer(b) for b in self.buffers_first()])) + + self.scalar_use("beta", flavour) + + list(chain(*[self.buffer(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()]))) + + def arguments_wrapper_clblas(self, flavour): + """As above, but for the clBLAS wrapper""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.scalar_buffers_first()])) + + self.scalar_use_wrapper("alpha", flavour) + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.buffers_first()])) + + self.scalar_use_wrapper("beta", flavour) + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_use_wrapper(s, flavour) for s in self.other_scalars()]))) + + def arguments_wrapper_cblas(self, flavour): + """As above, but for the CBLAS wrapper""" + return (self.options_list() + self.sizes_list() + + self.scalar_use_wrapper_cblas("alpha", flavour) + + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_first()])) + + self.scalar_use_wrapper_cblas("beta", flavour) + + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_use_wrapper_cblas(s, flavour) for s in self.other_scalars()]))) + + def arguments_def(self, flavour): + """Retrieves a combination of all the argument definitions""" + return (self.options_def() + self.sizes_def() + + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_first()])) + + self.scalar_def("alpha", flavour) + + list(chain(*[self.buffer_def(b) for b in self.buffers_first()])) + + self.scalar_def("beta", flavour) + + list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + + def arguments_def_wrapper_clblas(self, flavour): + """As above, but clBLAS wrapper plain data-types""" + return (self.options_def_wrapper_clblas() + self.sizes_def() + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.scalar_buffers_first()])) + + self.scalar_def_plain("alpha", flavour) + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.buffers_first()])) + + self.scalar_def_plain("beta", flavour) + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) + + def arguments_def_wrapper_cblas(self, flavour): + """As above, but CBLAS wrapper plain data-types""" + return (self.options_def_wrapper_cblas() + self.sizes_def() + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_first()])) + + self.scalar_def_plain("alpha", flavour) + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.buffers_first()])) + + self.scalar_def_plain("beta", flavour) + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) + + def arguments_type(self, flavour): + """Retrieves a combination of all the argument types""" + return (self.options_type() + self.sizes_type() + + list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_first()])) + + self.scalar_type("alpha", flavour) + + list(chain(*[self.buffer_type(b) for b in self.buffers_first()])) + + self.scalar_type("beta", flavour) + + list(chain(*[self.buffer_type(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()]))) + + def arguments_doc(self): + """Retrieves a combination of all the argument types""" + return (self.options_doc() + self.sizes_doc() + + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_first()])) + + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_first()])) + + self.scalar_doc("alpha") + + list(chain(*[self.buffer_doc(b) for b in self.buffers_first()])) + + self.scalar_doc("beta") + + list(chain(*[self.buffer_doc(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_doc(s) for s in self.other_scalars()]))) + + def requirements_doc(self): + """Retrieves a list of routine requirements for documentation""" + return self.requirements + + def routine_header_cpp(self, spaces, default_event): + """Retrieves the C++ templated definition for a routine""" + indent = " " * (spaces + self.length()) + result = "template <" + self.template.name + ">\n" + result += "StatusCode " + self.name.capitalize() + "(" + result += (",\n" + indent).join([a for a in self.arguments_def(self.template)]) + result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")" + return result + + def routine_header_type_cpp(self, spaces): + """As above, but now without variable names""" + indent = " " * (spaces + self.length()) + result = "template <" + self.template.name + ">\n" + result += "StatusCode " + self.name.capitalize() + "(" + result += (",\n" + indent).join([a for a in self.arguments_type(self.template)]) + result += ",\n" + indent + "cl_command_queue*, cl_event*)" + return result + + def routine_header_c(self, flavour, spaces, extra_qualifier): + """As above, but now for C""" + indent = " " * (spaces + self.length()) + result = "StatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "(" + result += (",\n" + indent).join([a for a in self.arguments_def(flavour)]) + result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)" + return result + + def routine_header_wrapper_clblas(self, flavour, def_only, spaces): + """As above, but now for the clBLAS wrapper""" + template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else "" + indent = " " * (spaces + self.length() + len(template)) + result = "" + if self.no_scalars(): + result += "template <" + if def_only: + result += flavour.name + result += ">\n" + result += "clblasStatus clblasX" + self.name + template + "(" + result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_clblas(flavour)]) + result += ",\n" + indent + "cl_uint num_queues, cl_command_queue *queues" + result += ",\n" + indent + "cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)" + return result + + def routine_header_wrapper_cblas(self, flavour, spaces): + """As above, but now for the CBLAS wrapper""" + indent = " " * (spaces + self.length()) + result = "void cblasX" + self.name + "(" + result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cblas(flavour)]) + ")" + return result diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py deleted file mode 100644 index 00883776..00000000 --- a/scripts/generator/routine.py +++ /dev/null @@ -1,603 +0,0 @@ -#!/usr/bin/env python - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. -# -# Author(s): -# Cedric Nugteren <www.cedricnugteren.nl> -# -# This file contains the 'Routine' class, used in the generator script to generate the CLBlast API -# interface and implementation. -# -# ================================================================================================== - -# System modules -from itertools import chain - -# Translates an option name to a CLBlast data-type -def OptionToCLBlast(x): - return { - 'layout': "Layout", - 'a_transpose': "Transpose", - 'b_transpose': "Transpose", - 'ab_transpose': "Transpose", - 'side': "Side", - 'triangle': "Triangle", - 'diagonal': "Diagonal", - }[x] - -# As above, but for clBLAS data-types -def OptionToWrapperCL(x): - return { - 'layout': "clblasOrder", - 'a_transpose': "clblasTranspose", - 'b_transpose': "clblasTranspose", - 'ab_transpose': "clblasTranspose", - 'side': "clblasSide", - 'triangle': "clblasUplo", - 'diagonal': "clblasDiag", - }[x] - -# As above, but for CBLAS data-types -def OptionToWrapperC(x): - return { - 'layout': "CBLAS_ORDER", - 'a_transpose': "CBLAS_TRANSPOSE", - 'b_transpose': "CBLAS_TRANSPOSE", - 'ab_transpose': "CBLAS_TRANSPOSE", - 'side': "CBLAS_SIDE", - 'triangle': "CBLAS_UPLO", - 'diagonal': "CBLAS_DIAG", - }[x] - -# Translates an option name to a documentation string -def OptionToDoc(x): - return { - 'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.", - 'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", - 'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", - 'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", - 'side': "The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).", - 'triangle': "The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).", - 'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.", - }[x] - -# ================================================================================================== - -# Class holding routine-specific information (e.g. name, which arguments, which precisions) -class Routine(): - def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, - inputs, outputs, scalars, scratch, description, details, requirements): - self.implemented = implemented - self.has_tests = has_tests - self.level = level - self.name = name - self.template = template - self.flavours = flavours - self.sizes = sizes - self.options = options - self.inputs = inputs - self.outputs = outputs - self.scalars = scalars - self.scratch = scratch # Scratch buffer (e.g. for xDOT) - self.description = description - self.details = details - self.requirements = requirements - - # List of scalar buffers - def ScalarBuffersFirst(self): - return ["dot","nrm2","asum","sum","imax","imin"] - def ScalarBuffersSecond(self): - return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] - - # List of scalars other than alpha and beta - def OtherScalars(self): - return ["cos","sin"] - - # List of buffers with unsigned int type - def IndexBuffers(self): - return ["imax","imin"] - - # Lists of input/output buffers not index (integer) - def NonIndexInputs(self): - buffers = self.inputs[:] # make a copy - for i in self.IndexBuffers(): - if i in buffers: buffers.remove(i) - return buffers - def NonIndexOutputs(self): - buffers = self.outputs[:] # make a copy - for i in self.IndexBuffers(): - if i in buffers: buffers.remove(i) - return buffers - - # List of buffers without 'inc' or 'ld' - def BuffersWithoutLdInc(self): - return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"] - - # Retrieves the number of characters in the routine's name - def Length(self): - return len(self.name) - - # Retrieves the postfix for a buffer - def Postfix(self, name): - return "inc" if (name in ["x","y"]) else "ld" - - # Determines whether or not this routine has scalar arguments (alpha/beta) - def NoScalars(self): - return self.scalars == [] - - # Returns the upper-case names of these routines (all flavours) - def ShortNames(self): - return "/".join([f.name+self.name.upper() for f in self.flavours]) - - # As above, but excludes some - def ShortNamesTested(self): - names = [f.name+self.name.upper() for f in self.flavours] - if "H"+self.name.upper() in names: names.remove("H"+self.name.upper()) - return "/".join(names) - - # Determines which buffers go first (between alpha and beta) and which ones go after - def BuffersFirst(self): - if self.level == "2b": - return ["x","y"] - return ["ap","a","b","x"] - def BuffersSecond(self): - if self.level == "2b": - return ["ap","a","b","c"] - return ["y","c"] - - # Distinguish between vectors and matrices - def BuffersVector(self): - return ["x","y"] - def BuffersMatrix(self): - return ["a","b","c","ap"] - - # ============================================================================================== - - # Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x') - def Buffer(self, name): - if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer"] - b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with a '_bis' suffix for the buffer name - def BufferBis(self, name): - #if (name in self.IndexBuffers()): - # return self.Buffer(name) - if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer_bis"] - b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with data-types - def BufferDef(self, name): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - a = [prefix+"cl_mem "+name+"_buffer"] - b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with data-types - def BufferDefWrapperCL(self, name, flavour): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - a = [prefix+"Buffer<"+flavour.buffertype+">& "+name+"_buffer"] - b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but as vectors - def BufferDefVector(self, name, flavour): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"] - b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with Claduc buffers - def BufferCladuc(self, name): - if (name in self.inputs) or (name in self.outputs): - buffertype = "unsigned int" if (name in self.IndexBuffers()) else self.template.buffertype - a = ["Buffer<"+buffertype+">("+name+"_buffer)"] - b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with a static cast for clBLAS wrapper - def BufferWrapperCL(self, name): - if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer()"] - b = [name+"_offset"] - c = [] - if (name in ["x","y"]): - c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"] - elif (name in ["a","b","c"]): - c = [name+"_"+self.Postfix(name)] - return [", ".join(a+b+c)] - return [] - - # As above but with a static cast for CBLAS wrapper - def BufferWrapperC(self, name, flavour): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - if name == "sy1": - a = [name+"_buffer["+name+"_offset]"] - elif flavour.precision_name in ["C","Z"]: - a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"] - else: - a = ["&"+name+"_buffer["+name+"_offset]"] - c = [] - if (name in ["x","y"]): - c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"] - elif (name in ["a","b","c"]): - c = [name+"_"+self.Postfix(name)] - return [", ".join(a+c)] - return [] - - # As above, but only data-types - def BufferType(self, name): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - a = [prefix+"cl_mem"] - b = ["const size_t"] - c = ["const size_t"] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # Retrieves the documentation of the buffers - def BufferDoc(self, name): - prefix = "const " if (name in self.inputs) else "" - inout = "input" if (name in self.inputs) else "output" - if (name in self.inputs) or (name in self.outputs): - math_name = name.upper()+" matrix" if (name in self.BuffersMatrix()) else name+" vector" - incld_description = "Leading dimension " if (name in self.BuffersMatrix()) else "Stride/increment " - a = ["`"+prefix+"cl_mem "+name+"_buffer`: OpenCL buffer to store the "+inout+" "+math_name+"."] - b = ["`const size_t "+name+"_offset`: The offset in elements from the start of the "+inout+" "+math_name+"."] - c = ["`const size_t "+name+"_"+self.Postfix(name)+"`: "+incld_description+"of the "+inout+" "+math_name+". This value must be greater than 0."] if (name not in self.BuffersWithoutLdInc()) else [] - return a+b+c - return [] - - # ============================================================================================== - - # Retrieves the name of a scalar (alpha/beta) - def Scalar(self, name): - if (name in self.scalars): - return [name] - return [] - - # As above, but converts from float to half - def ScalarHalfToFloat(self, name): - if name in self.scalars: - return ["HalfToFloat("+name+")"] - return [] - - # Retrieves the use of a scalar (alpha/beta) - def ScalarUse(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return [flavour.UseAlpha()] - elif name == "beta": - return [flavour.UseBeta()] - return [name] - return [] - - # As above, but for the clBLAS wrapper - def ScalarUseWrapper(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return [flavour.UseAlphaCL()] - elif name == "beta": - return [flavour.UseBetaCL()] - return [name] - return [] - - # As above, but for the CBLAS wrapper - def ScalarUseWrapperC(self, name, flavour): - if name in self.scalars: - if flavour.IsComplex(name): - return [name+"_array.data()"] - return [name] - return [] - - # Retrieves the definition of a scalar (alpha/beta) - def ScalarDef(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return ["const "+flavour.alpha_cl+" "+name] - return ["const "+flavour.beta_cl+" "+name] - return [] - - # As above, but without 'cl_' prefix - def ScalarDefPlain(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return ["const "+flavour.alpha_cpp+" "+name] - return ["const "+flavour.beta_cpp+" "+name] - return [] - - # Retrieves the type of a scalar (alpha/beta) - def ScalarType(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return ["const "+flavour.alpha_cpp] - return ["const "+flavour.beta_cpp] - return [] - - # Retrieves the documentation of a scalar - def ScalarDoc(self, name): - if name in self.scalars: - if name == "alpha": - return ["`const "+self.template.alpha_cpp+" "+name+"`: Input scalar constant."] - return ["`const "+self.template.beta_cpp+" "+name+"`: Input scalar constant."] - return [] - - # ============================================================================================== - - # Retrieves a list of comma-separated sizes (m, n, k) - def Sizes(self): - if self.sizes: - return [", ".join([s for s in self.sizes])] - return [] - - # Retrieves the definition of the sizes (m,n,k) - def SizesDef(self): - if self.sizes: - return [", ".join(["const size_t "+s for s in self.sizes])] - return [] - - # Retrieves the types of the sizes (m,n,k) - def SizesType(self): - if self.sizes: - return [", ".join(["const size_t" for s in self.sizes])] - return [] - - # Retrieves the documentation of the sizes - def SizesDoc(self): - if self.sizes: - definitions = ["`const size_t "+s+"`: Integer size argument. This value must be positive." for s in self.sizes] - return definitions - return [] - - # ============================================================================================== - - # Retrieves a list of options - def Options(self): - if self.options: - return [", ".join(self.options)] - return [] - - # As above, but now casted to CLBlast data-types - def OptionsCast(self, indent): - if self.options: - options = ["static_cast<clblast::"+OptionToCLBlast(o)+">("+o+")" for o in self.options] - return [(",\n"+indent).join(options)] - return [] - - # Retrieves the definitions of the options (layout, transpose, side, etc.) - def OptionsDef(self): - if self.options: - definitions = ["const "+OptionToCLBlast(o)+" "+o for o in self.options] - return [", ".join(definitions)] - return [] - - # As above, but now using clBLAS data-types - def OptionsDefWrapperCL(self): - if self.options: - definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options] - return [", ".join(definitions)] - return [] - - # As above, but now using CBLAS data-types - def OptionsDefWrapperC(self): - if self.options: - definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options] - return [", ".join(definitions)] - return [] - - # Retrieves the types of the options (layout, transpose, side, etc.) - def OptionsType(self): - if self.options: - definitions = ["const "+OptionToCLBlast(o) for o in self.options] - return [", ".join(definitions)] - return [] - - # Retrieves the documentation of the options - def OptionsDoc(self): - if self.options: - definitions = ["`const "+OptionToCLBlast(o)+" "+o+"`: "+OptionToDoc(o) for o in self.options] - return definitions - return [] - - # ============================================================================================== - - # Retrieves a combination of all the argument names (no types) - def Arguments(self): - return (self.Options() + self.Sizes() + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) + - self.Scalar("alpha") + - list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + - self.Scalar("beta") + - list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) - - # As above, but with conversions from half to float - def ArgumentsHalf(self): - return (self.Options() + self.Sizes() + - list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarHalfToFloat("alpha") + - list(chain(*[self.BufferBis(b) for b in self.BuffersFirst()])) + - self.ScalarHalfToFloat("beta") + - list(chain(*[self.BufferBis(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument names, with Claduc casts - def ArgumentsCladuc(self, flavour, indent): - return (self.Options() + self.Sizes() + - list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersFirst()])) + - self.Scalar("alpha") + - list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) + - self.Scalar("beta") + - list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) - - # As above, but with CLBlast casts - def ArgumentsCast(self, flavour, indent): - return (self.OptionsCast(indent) + self.Sizes() + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarUse("alpha", flavour) + - list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + - self.ScalarUse("beta", flavour) + - list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()]))) - - # As above, but for the clBLAS wrapper - def ArgumentsWrapperCL(self, flavour): - return (self.Options() + self.Sizes() + - list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarUseWrapper("alpha", flavour) + - list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) + - self.ScalarUseWrapper("beta", flavour) + - list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()]))) - - # As above, but for the CBLAS wrapper - def ArgumentsWrapperC(self, flavour): - return (self.Options() + self.Sizes() + - self.ScalarUseWrapperC("alpha", flavour) + - list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) + - self.ScalarUseWrapperC("beta", flavour) + - list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument definitions - def ArgumentsDef(self, flavour): - return (self.OptionsDef() + self.SizesDef() + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarDef("alpha", flavour) + - list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + - self.ScalarDef("beta", flavour) + - list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()]))) - - # As above, but clBLAS wrapper plain datatypes - def ArgumentsDefWrapperCL(self, flavour): - return (self.OptionsDefWrapperCL() + self.SizesDef() + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersFirst()])) + - self.ScalarDefPlain("alpha", flavour) + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersFirst()])) + - self.ScalarDefPlain("beta", flavour) + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) - - # As above, but CBLAS wrapper plain datatypes - def ArgumentsDefWrapperC(self, flavour): - return (self.OptionsDefWrapperC() + self.SizesDef() + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) + - self.ScalarDefPlain("alpha", flavour) + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) + - self.ScalarDefPlain("beta", flavour) + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument types - def ArgumentsType(self, flavour): - return (self.OptionsType() + self.SizesType() + - list(chain(*[self.BufferType(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarType("alpha", flavour) + - list(chain(*[self.BufferType(b) for b in self.BuffersFirst()])) + - self.ScalarType("beta", flavour) + - list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferType(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarType(s, flavour) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument types - def ArgumentsDoc(self): - return (self.OptionsDoc() + self.SizesDoc() + - list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) + - list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarDoc("alpha") + - list(chain(*[self.BufferDoc(b) for b in self.BuffersFirst()])) + - self.ScalarDoc("beta") + - list(chain(*[self.BufferDoc(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDoc(s) for s in self.OtherScalars()]))) - - # ============================================================================================== - - # Retrieves a list of routine requirements for documentation - def RequirementsDoc(self): - return self.requirements - - # ============================================================================================== - - # Retrieves the C++ templated definition for a routine - def RoutineHeaderCPP(self, spaces, default_event): - indent = " "*(spaces + self.Length()) - result = "template <"+self.template.name+">\n" - result += "StatusCode "+self.name.capitalize()+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDef(self.template)]) - result += ",\n"+indent+"cl_command_queue* queue, cl_event* event"+default_event+")" - return result - - # As above, but now without variable names - def RoutineHeaderTypeCPP(self, spaces): - indent = " "*(spaces + self.Length()) - result = "template <"+self.template.name+">\n" - result += "StatusCode "+self.name.capitalize()+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsType(self.template)]) - result += ",\n"+indent+"cl_command_queue*, cl_event*)" - return result - - # As above, but now for C - def RoutineHeaderC(self, flavour, spaces, extra_qualifier): - indent = " "*(spaces + self.Length()) - result = "StatusCode"+extra_qualifier+" CLBlast"+flavour.name+self.name+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDef(flavour)]) - result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)" - return result - - # As above, but now for the clBLAS wrapper - def RoutineHeaderWrapperCL(self, flavour, def_only, spaces): - template = "<"+flavour.template+">" if self.NoScalars() and not def_only else "" - indent = " "*(spaces + self.Length() + len(template)) - result = "" - if self.NoScalars(): - result += "template <" - if def_only: - result += flavour.name - result += ">\n" - result += "clblasStatus clblasX"+self.name+template+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)]) - result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues" - result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)" - return result - - # As above, but now for the CBLAS wrapper - def RoutineHeaderWrapperC(self, flavour, def_only, spaces): - indent = " "*(spaces + self.Length()) - result = "void cblasX"+self.name+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")" - return result - -# ================================================================================================== diff --git a/src/database/database.cpp b/src/database/database.cpp index 38974b95..34c44a29 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -35,9 +35,9 @@ const std::vector<Database::DatabaseEntry> Database::database = { XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble, - /* XgemvFastRotHalf, */ XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble, + XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble, XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, - /* XgemmHalf, */ XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, + XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, diff --git a/src/database/database.hpp b/src/database/database.hpp index 8d6d3863..a6ab49c5 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -72,9 +72,9 @@ class Database { static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble; static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble; static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble; - static const DatabaseEntry /* XgemvFastRotHalf, */ XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble; + static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble; static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; - static const DatabaseEntry /* XgemmHalf, */ XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; + static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble; diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index d592f110..a6b7dfe8 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::CopyHalf = { "Copy", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, } @@ -41,7 +42,7 @@ const Database::DatabaseEntry Database::CopySingle = { { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, } }, { // ARM GPUs @@ -61,11 +62,12 @@ const Database::DatabaseEntry Database::CopySingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -85,15 +87,15 @@ const Database::DatabaseEntry Database::CopySingle = { { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, - { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, } }, } @@ -110,7 +112,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // Intel CPUs @@ -118,17 +120,18 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, } }, { // Intel accelerators @@ -149,12 +152,12 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, } @@ -171,7 +174,7 @@ const Database::DatabaseEntry Database::CopyDouble = { { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, } }, { // ARM GPUs @@ -185,7 +188,7 @@ const Database::DatabaseEntry Database::CopyDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -208,12 +211,12 @@ const Database::DatabaseEntry Database::CopyDouble = { { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, } @@ -230,7 +233,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // ARM GPUs @@ -244,7 +247,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -272,7 +275,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, } diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index cd034f15..3cfabaf4 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::PadHalf = { "Pad", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } @@ -41,7 +42,7 @@ const Database::DatabaseEntry Database::PadSingle = { { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // ARM GPUs @@ -55,17 +56,18 @@ const Database::DatabaseEntry Database::PadSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -88,12 +90,12 @@ const Database::DatabaseEntry Database::PadSingle = { { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, } @@ -110,7 +112,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -124,17 +126,18 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, } }, { // Intel accelerators @@ -157,12 +160,12 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -179,7 +182,7 @@ const Database::DatabaseEntry Database::PadDouble = { { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // ARM GPUs @@ -193,7 +196,7 @@ const Database::DatabaseEntry Database::PadDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -216,12 +219,12 @@ const Database::DatabaseEntry Database::PadDouble = { { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -238,7 +241,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -252,7 +255,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -275,12 +278,12 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index c2034c3e..88bd4ea7 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::PadtransposeHalf = { "Padtranspose", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, } @@ -55,12 +56,13 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, @@ -88,12 +90,12 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, } @@ -110,7 +112,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // ARM GPUs @@ -130,11 +132,12 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, { // Intel accelerators @@ -157,12 +160,12 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, } @@ -179,7 +182,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, } }, { // ARM GPUs @@ -193,7 +196,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // Intel accelerators @@ -216,12 +219,12 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, } @@ -238,7 +241,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // ARM GPUs @@ -252,7 +255,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // Intel accelerators @@ -275,12 +278,12 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, } }, } diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index 8e852c4b..0e1b608e 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::TransposeHalf = { "Transpose", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } @@ -41,7 +42,7 @@ const Database::DatabaseEntry Database::TransposeSingle = { { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, } }, { // ARM GPUs @@ -61,11 +62,12 @@ const Database::DatabaseEntry Database::TransposeSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } }, { // Intel accelerators @@ -88,12 +90,12 @@ const Database::DatabaseEntry Database::TransposeSingle = { { "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, } }, } @@ -110,7 +112,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, { // ARM GPUs @@ -124,17 +126,18 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, } }, { // NVIDIA GPUs @@ -151,12 +154,12 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, } @@ -173,7 +176,7 @@ const Database::DatabaseEntry Database::TransposeDouble = { { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, } }, { // ARM GPUs @@ -187,7 +190,7 @@ const Database::DatabaseEntry Database::TransposeDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } }, { // Intel accelerators @@ -210,12 +213,12 @@ const Database::DatabaseEntry Database::TransposeDouble = { { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, } @@ -232,7 +235,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { { "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } }, { // ARM GPUs @@ -246,7 +249,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } }, { // NVIDIA GPUs @@ -263,12 +266,12 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } }, } diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 905ee084..9c1bcd99 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -18,13 +18,14 @@ const Database::DatabaseEntry Database::XaxpyHalf = { "Xaxpy", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, - { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, } }, } @@ -41,7 +42,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, } }, { // ARM GPUs @@ -55,13 +56,14 @@ const Database::DatabaseEntry Database::XaxpySingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",8}, {"WGS",256}, {"WPT",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } }, { "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, @@ -77,10 +79,10 @@ const Database::DatabaseEntry Database::XaxpySingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 1070", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, + { "GeForce GTX 480", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, @@ -88,12 +90,12 @@ const Database::DatabaseEntry Database::XaxpySingle = { { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, } }, } @@ -110,7 +112,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, { // ARM GPUs @@ -124,17 +126,18 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",1024}, {"WPT",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"VW",4}, {"WGS",64}, {"WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, } }, { // Intel accelerators @@ -157,12 +160,12 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, } @@ -193,7 +196,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, } }, { // Intel accelerators @@ -206,7 +209,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, - { "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, @@ -216,7 +219,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, { // Default @@ -238,7 +241,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { { "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, { // ARM GPUs @@ -252,7 +255,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, } }, { // Intel accelerators @@ -280,7 +283,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, } diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index e36dd8ca..987a990d 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XdotHalf = { "Xdot", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, { "default", { {"WGS1",32}, {"WGS2",32} } }, } @@ -37,7 +38,6 @@ const Database::DatabaseEntry Database::XdotSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, @@ -53,10 +53,11 @@ const Database::DatabaseEntry Database::XdotSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } }, - { "default", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, { // NVIDIA GPUs @@ -70,12 +71,12 @@ const Database::DatabaseEntry Database::XdotSingle = { { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",256} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, } @@ -88,11 +89,10 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",128}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -104,6 +104,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WGS2",32} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",32}, {"WGS2",32} } }, @@ -121,12 +122,12 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",512}, {"WGS2",64} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, } @@ -139,11 +140,10 @@ const Database::DatabaseEntry Database::XdotDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -163,12 +163,12 @@ const Database::DatabaseEntry Database::XdotDouble = { { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",64} } }, } }, } @@ -181,11 +181,10 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -205,12 +204,12 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",64} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, } diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index 61b7ff05..d19c55b5 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -14,6 +14,18 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry Database::XgemmHalf = { + "Xgemm", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::XgemmSingle = { "Xgemm", Precision::kSingle, { { // AMD GPUs @@ -43,6 +55,7 @@ const Database::DatabaseEntry Database::XgemmSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, { "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, @@ -75,7 +88,7 @@ const Database::DatabaseEntry Database::XgemmSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, } @@ -112,6 +125,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, { "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -156,7 +170,7 @@ const Database::DatabaseEntry Database::XgemmDouble = { "Xgemm", Precision::kDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 6d680b06..e5e8845e 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -18,13 +18,14 @@ const Database::DatabaseEntry Database::XgemvHalf = { "Xgemv", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1} } }, - { "default", { {"WGS1",128}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",128}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } @@ -48,13 +49,14 @@ const Database::DatabaseEntry Database::XgemvSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WPT1",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Iris", { {"WGS1",64}, {"WPT1",2} } }, { "Iris Pro", { {"WGS1",256}, {"WPT1",2} } }, @@ -81,7 +83,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } }, { "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } }, { "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",256}, {"WPT1",1} } }, } }, { // Default @@ -116,6 +118,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Iris", { {"WGS1",256}, {"WPT1",1} } }, @@ -161,14 +164,14 @@ const Database::DatabaseEntry Database::XgemvDouble = { { "Oland", { {"WGS1",256}, {"WPT1",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",256}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, - { "default", { {"WGS1",64}, {"WPT1",2} } }, + { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, { // Intel accelerators @@ -191,12 +194,12 @@ const Database::DatabaseEntry Database::XgemvDouble = { { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } }, { "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } }, { "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, } @@ -220,7 +223,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, { // Intel accelerators @@ -234,7 +237,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = { { "GRID K520", { {"WGS1",128}, {"WPT1",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Default diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index 65b15030..52af628c 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -18,13 +18,14 @@ const Database::DatabaseEntry Database::XgemvFastHalf = { "XgemvFast", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, - { "default", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "default", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "default", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, } }, } @@ -48,17 +49,18 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, - { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + { "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, { "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, - { "default", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } }, } }, { // Intel accelerators @@ -81,7 +83,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, - { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, } }, { // Default @@ -116,6 +118,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, @@ -189,7 +192,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = { { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, - { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, } }, { // Default diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index 9822fb20..328094e1 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -14,6 +14,18 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry Database::XgemvFastRotHalf = { + "XgemvFastRot", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::XgemvFastRotSingle = { "XgemvFastRot", Precision::kSingle, { { // AMD GPUs @@ -30,9 +42,11 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, { "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, - { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, { // NVIDIA GPUs @@ -43,7 +57,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, } @@ -67,14 +81,16 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, { "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, - { "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, } }, } @@ -104,7 +120,7 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } }, } @@ -128,7 +144,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } }, } }, } diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index 216925fc..3e9c25c1 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XgerHalf = { "Xger", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, } @@ -41,7 +42,7 @@ const Database::DatabaseEntry Database::XgerSingle = { { "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, } }, { // ARM GPUs @@ -54,16 +55,17 @@ const Database::DatabaseEntry Database::XgerSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, + { "default", { {"WGS1",128}, {"WGS2",8}, {"WPT",4} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } }, { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, } }, { // NVIDIA GPUs @@ -75,12 +77,12 @@ const Database::DatabaseEntry Database::XgerSingle = { { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, } }, } @@ -97,7 +99,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",4}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, } }, { // ARM GPUs @@ -110,16 +112,17 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",4}, {"WPT",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } }, - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, } }, { // NVIDIA GPUs @@ -131,12 +134,12 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, - { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",4}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, } }, } @@ -153,7 +156,7 @@ const Database::DatabaseEntry Database::XgerDouble = { { "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, } }, { // ARM GPUs @@ -166,7 +169,7 @@ const Database::DatabaseEntry Database::XgerDouble = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } }, + { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, } }, { // NVIDIA GPUs @@ -178,12 +181,12 @@ const Database::DatabaseEntry Database::XgerDouble = { { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, } }, } @@ -200,7 +203,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { { "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, { "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, } }, { // ARM GPUs @@ -225,12 +228,12 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, { "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, } }, } diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 2fca6b73..b0817242 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -148,6 +148,13 @@ R"( #define SetToOne(a) a = ONE #endif +// Determines whether a variable is zero +#if PRECISION == 3232 || PRECISION == 6464 + #define IsZero(a) ((a.x == ZERO) && (a.y == ZERO)) +#else + #define IsZero(a) (a == ZERO) +#endif + // The absolute value (component-wise) #if PRECISION == 3232 || PRECISION == 6464 #define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y) diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 48d0eb5c..48ad2e75 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -30,10 +30,10 @@ R"( // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xamax(const int n, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global singlereal* maxgm, __global unsigned int* imaxgm) { +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xamax(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global singlereal* maxgm, __global unsigned int* imaxgm) { __local singlereal maxlm[WGS1]; __local unsigned int imaxlm[WGS1]; const int lid = get_local_id(0); @@ -95,10 +95,10 @@ __kernel void Xamax(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. -__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm, - const __global unsigned int* restrict imaxgm, - __global unsigned int* imax, const int imax_offset) { +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XamaxEpilogue(const __global singlereal* restrict maxgm, + const __global unsigned int* restrict imaxgm, + __global unsigned int* imax, const int imax_offset) { __local singlereal maxlm[WGS2]; __local unsigned int imaxlm[WGS2]; const int lid = get_local_id(0); diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl index 58d0f11b..1fc91be8 100644 --- a/src/kernels/level1/xasum.opencl +++ b/src/kernels/level1/xasum.opencl @@ -30,10 +30,10 @@ R"( // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xasum(const int n, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* output) { +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xasum(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* output) { __local real lm[WGS1]; const int lid = get_local_id(0); const int wgid = get_group_id(0); @@ -74,9 +74,9 @@ __kernel void Xasum(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. -__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XasumEpilogue(const __global real* restrict input, - __global real* asum, const int asum_offset) { +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XasumEpilogue(const __global real* restrict input, + __global real* asum, const int asum_offset) { __local real lm[WGS2]; const int lid = get_local_id(0); diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index d533041b..ece8476e 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -22,10 +22,10 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xaxpy(const int n, const real_arg arg_alpha, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* ygm, const int y_offset, const int y_inc) { +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void Xaxpy(const int n, const real_arg arg_alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { const real alpha = GetRealArg(arg_alpha); // Loops over the work that needs to be done (allows for an arbitrary number of threads) @@ -40,10 +40,10 @@ __kernel void Xaxpy(const int n, const real_arg arg_alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XaxpyFast(const int n, const real_arg arg_alpha, - const __global realV* restrict xgm, - __global realV* ygm) { +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XaxpyFast(const int n, const real_arg arg_alpha, + const __global realV* restrict xgm, + __global realV* ygm) { const real alpha = GetRealArg(arg_alpha); #pragma unroll diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl index 97c27ccf..228e0735 100644 --- a/src/kernels/level1/xcopy.opencl +++ b/src/kernels/level1/xcopy.opencl @@ -22,10 +22,10 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xcopy(const int n, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* ygm, const int y_offset, const int y_inc) { +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void Xcopy(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { // Loops over the work that needs to be done (allows for an arbitrary number of threads) #pragma unroll @@ -38,10 +38,10 @@ __kernel void Xcopy(const int n, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XcopyFast(const int n, - const __global realV* restrict xgm, - __global realV* ygm) { +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XcopyFast(const int n, + const __global realV* restrict xgm, + __global realV* ygm) { #pragma unroll for (int w=0; w<WPT; ++w) { const int id = w*get_global_size(0) + get_global_id(0); diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl index e13eb3c1..02f04ea7 100644 --- a/src/kernels/level1/xdot.opencl +++ b/src/kernels/level1/xdot.opencl @@ -30,11 +30,11 @@ R"( // ================================================================================================= // The main reduction kernel, performing the multiplication and the majority of the sum operation -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xdot(const int n, - const __global real* restrict xgm, const int x_offset, const int x_inc, - const __global real* restrict ygm, const int y_offset, const int y_inc, - __global real* output, const int do_conjugate) { +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xdot(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + const __global real* restrict ygm, const int y_offset, const int y_inc, + __global real* output, const int do_conjugate) { __local real lm[WGS1]; const int lid = get_local_id(0); const int wgid = get_group_id(0); @@ -73,9 +73,9 @@ __kernel void Xdot(const int n, // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to // be launched with a single workgroup only. -__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XdotEpilogue(const __global real* restrict input, - __global real* dot, const int dot_offset) { +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XdotEpilogue(const __global real* restrict input, + __global real* dot, const int dot_offset) { __local real lm[WGS2]; const int lid = get_local_id(0); diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl index 9803687a..f6d869cb 100644 --- a/src/kernels/level1/xnrm2.opencl +++ b/src/kernels/level1/xnrm2.opencl @@ -30,10 +30,10 @@ R"( // ================================================================================================= // The main reduction kernel, performing the multiplication and the majority of the operation -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xnrm2(const int n, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* output) { +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xnrm2(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* output) { __local real lm[WGS1]; const int lid = get_local_id(0); const int wgid = get_group_id(0); @@ -72,9 +72,9 @@ __kernel void Xnrm2(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. -__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void Xnrm2Epilogue(const __global real* restrict input, - __global real* nrm2, const int nrm2_offset) { +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void Xnrm2Epilogue(const __global real* restrict input, + __global real* nrm2, const int nrm2_offset) { __local real lm[WGS2]; const int lid = get_local_id(0); diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl index 59936776..3da9c2fd 100644 --- a/src/kernels/level1/xscal.opencl +++ b/src/kernels/level1/xscal.opencl @@ -22,9 +22,10 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xscal(const int n, const real alpha, - __global real* xgm, const int x_offset, const int x_inc) { +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void Xscal(const int n, const real_arg arg_alpha, + __global real* xgm, const int x_offset, const int x_inc) { + const real alpha = GetRealArg(arg_alpha); // Loops over the work that needs to be done (allows for an arbitrary number of threads) #pragma unroll @@ -40,9 +41,11 @@ __kernel void Xscal(const int n, const real alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XscalFast(const int n, const real alpha, - __global realV* xgm) { +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XscalFast(const int n, const real_arg arg_alpha, + __global realV* xgm) { + const real alpha = GetRealArg(arg_alpha); + #pragma unroll for (int w=0; w<WPT; ++w) { const int id = w*get_global_size(0) + get_global_id(0); diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl index f6487b58..267271c0 100644 --- a/src/kernels/level1/xswap.opencl +++ b/src/kernels/level1/xswap.opencl @@ -22,10 +22,10 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xswap(const int n, - __global real* xgm, const int x_offset, const int x_inc, - __global real* ygm, const int y_offset, const int y_inc) { +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void Xswap(const int n, + __global real* xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { // Loops over the work that needs to be done (allows for an arbitrary number of threads) #pragma unroll @@ -40,10 +40,10 @@ __kernel void Xswap(const int n, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XswapFast(const int n, - __global realV* xgm, - __global realV* ygm) { +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XswapFast(const int n, + __global realV* xgm, + __global realV* ygm) { #pragma unroll for (int w=0; w<WPT; ++w) { const int id = w*get_global_size(0) + get_global_id(0); diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl index 83b6b15d..ff011acd 100644 --- a/src/kernels/level2/xgemv.opencl +++ b/src/kernels/level2/xgemv.opencl @@ -210,8 +210,8 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in // ================================================================================================= // Full version of the kernel -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xgemv(const int m, const int n, +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xgemv(const int m, const int n, const real_arg arg_alpha, const real_arg arg_beta, const int a_rotated, diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 210c42c1..02a1f956 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -88,16 +88,16 @@ inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, co // --> 'a_ld' is a multiple of VW2 // --> 'a_rotated' is 0 // --> 'do_conjugate' is 0 -__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XgemvFast(const int m, const int n, - const real_arg arg_alpha, - const real_arg arg_beta, - const int a_rotated, - const __global realVF* restrict agm, const int a_offset, const int a_ld, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate, const int parameter, - const int kl_unused, const int ku_unused) { +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XgemvFast(const int m, const int n, + const real_arg arg_alpha, + const real_arg arg_beta, + const int a_rotated, + const __global realVF* restrict agm, const int a_offset, const int a_ld, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc, + const int do_conjugate, const int parameter, + const int kl_unused, const int ku_unused) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); @@ -190,16 +190,16 @@ __kernel void XgemvFast(const int m, const int n, // --> 'a_ld' is a multiple of VW3 // --> 'a_rotated' is 1 // --> 'do_conjugate' is 0 -__attribute__((reqd_work_group_size(WGS3, 1, 1))) -__kernel void XgemvFastRot(const int m, const int n, - const real_arg arg_alpha, - const real_arg arg_beta, - const int a_rotated, - const __global realVFR* restrict agm, const int a_offset, const int a_ld, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate, const int parameter, - const int kl_unused, const int ku_unused) { +__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1))) +void XgemvFastRot(const int m, const int n, + const real_arg arg_alpha, + const real_arg arg_beta, + const int a_rotated, + const __global realVFR* restrict agm, const int a_offset, const int a_ld, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc, + const int do_conjugate, const int parameter, + const int kl_unused, const int ku_unused) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl index f218a346..1b9ded12 100644 --- a/src/kernels/level2/xger.opencl +++ b/src/kernels/level2/xger.opencl @@ -18,13 +18,13 @@ R"( // ================================================================================================= // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC) -__attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xger(const int max1, const int max2, - const real_arg arg_alpha, - const __global real* restrict xgm, const int x_offset, const int x_inc, - const __global real* ygm, const int y_offset, const int y_inc, - __global real* restrict agm, const int a_offset, const int a_ld, - const int is_rowmajor) { +__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +void Xger(const int max1, const int max2, + const real_arg arg_alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + const __global real* ygm, const int y_offset, const int y_inc, + __global real* restrict agm, const int a_offset, const int a_ld, + const int is_rowmajor) { const real alpha = GetRealArg(arg_alpha); // Register storage for X and Y diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl index 1200ee63..b0772218 100644 --- a/src/kernels/level2/xher.opencl +++ b/src/kernels/level2/xher.opencl @@ -18,12 +18,12 @@ R"( // ================================================================================================= // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR) -__attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xher(const int n, - const real_arg arg_alpha, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* restrict agm, const int a_offset, const int a_ld, - const int is_upper, const int is_rowmajor) { +__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +void Xher(const int n, + const real_arg arg_alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* restrict agm, const int a_offset, const int a_ld, + const int is_upper, const int is_rowmajor) { const real alpha = GetRealArg(arg_alpha); // Register storage for X and XT diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl index d0f41571..00a756c9 100644 --- a/src/kernels/level2/xher2.opencl +++ b/src/kernels/level2/xher2.opencl @@ -18,13 +18,13 @@ R"( // ================================================================================================= // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2) -__attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xher2(const int n, - const real_arg arg_alpha, - const __global real* restrict xgm, const int x_offset, const int x_inc, - const __global real* restrict ygm, const int y_offset, const int y_inc, - __global real* restrict agm, const int a_offset, const int a_ld, - const int is_upper, const int is_rowmajor) { +__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +void Xher2(const int n, + const real_arg arg_alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + const __global real* restrict ygm, const int y_offset, const int y_inc, + __global real* restrict agm, const int a_offset, const int a_ld, + const int is_upper, const int is_rowmajor) { const real alpha = GetRealArg(arg_alpha); // Register storage for X and Y diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl index 53cc161a..ed2ded98 100644 --- a/src/kernels/level3/convert_hermitian.opencl +++ b/src/kernels/level3/convert_hermitian.opencl @@ -20,13 +20,13 @@ R"( // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void HermLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void HermLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll @@ -59,13 +59,13 @@ __kernel void HermLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void HermUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void HermUpperToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl index c6ce93ca..8ae53b37 100644 --- a/src/kernels/level3/convert_symmetric.opencl +++ b/src/kernels/level3/convert_symmetric.opencl @@ -20,13 +20,13 @@ R"( // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void SymmLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll @@ -53,13 +53,13 @@ __kernel void SymmLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void SymmUpperToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl index fdd2461a..f848dcc1 100644 --- a/src/kernels/level3/convert_triangular.opencl +++ b/src/kernels/level3/convert_triangular.opencl @@ -20,14 +20,14 @@ R"( // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void TriaLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest, - const int unit_diagonal) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void TriaLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest, + const int unit_diagonal) { // Loops over the work per thread in both dimensions #pragma unroll @@ -55,14 +55,14 @@ __kernel void TriaLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void TriaUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest, - const int unit_diagonal) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void TriaUpperToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest, + const int unit_diagonal) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index dd975bf1..695b9003 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -35,11 +35,11 @@ R"( // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of // COPY_VW. Also requires both matrices to be of the same dimensions and without offset. -__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) -__kernel void CopyMatrixFast(const int ld, - __global const realC* restrict src, - __global realC* dest, - const real_arg arg_alpha) { +__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +void CopyMatrixFast(const int ld, + __global const realC* restrict src, + __global realC* dest, + const real_arg arg_alpha) { const real alpha = GetRealArg(arg_alpha); #pragma unroll for (int w_one=0; w_one<COPY_WPT; ++w_one) { diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index d0771c31..29480b25 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -24,15 +24,15 @@ R"( // Copies a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld // value and offset can be different. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void CopyPadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int do_conjugate) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int do_conjugate) { const real alpha = GetRealArg(arg_alpha); // Loops over the work per thread in both dimensions @@ -65,16 +65,16 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two, // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but // writes only the actual data back to the destination matrix. Again, the ld value and offset can // be different. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void CopyMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { const real alpha = GetRealArg(arg_alpha); // Loops over the work per thread in both dimensions diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index ea343533..70156d3a 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -36,11 +36,11 @@ R"( // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without // offset. A more general version is available in 'padtranspose.opencl'. -__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) -__kernel void TransposeMatrixFast(const int ld, - __global const realT* restrict src, - __global realT* dest, - const real_arg arg_alpha) { +__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +void TransposeMatrixFast(const int ld, + __global const realT* restrict src, + __global realT* dest, + const real_arg arg_alpha) { const real alpha = GetRealArg(arg_alpha); // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index 2e20d667..ba0b7062 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -24,15 +24,15 @@ R"( // Transposes a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the transposed source matrix dimensions. -__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void TransposePadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int do_conjugate) { +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposePadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int do_conjugate) { const real alpha = GetRealArg(arg_alpha); // Local memory to store a tile of the matrix (for coalescing) @@ -88,16 +88,16 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two, // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a // padded source matrix, but only the actual data is written back to the transposed destination // matrix. This kernel optionally checks for upper/lower triangular matrices. -__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void TransposeMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposeMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { const real alpha = GetRealArg(arg_alpha); // Local memory to store a tile of the matrix (for coalescing) diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 1ad0a558..d0ce06ad 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -31,7 +31,7 @@ // o-------o o-----o // // -// This kernel is seperated into two files. This is part 1 out of 2. +// This kernel is seperated into three files. This is part 1 out of 3. // // ================================================================================================= diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index 87e28cb5..e8234a29 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This is part 2 of 2 of the GEMM kernel. See part 1 for more information. +// This is part 2 of 3 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= @@ -133,260 +133,98 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int #endif int idm = mg + GetGroupID0() * (MWG/VWM); int idn = ng + GetGroupID1() * NWG; - - // The final multiplication with alpha and the addition with beta*C int index = idn*(kSizeM/VWM) + idm; + realM result; realM xval = cpm[ni][mi]; - realM yval = cgm[index]; - #if VWM == 1 - AXPBY(result, alpha, xval, beta, yval); - #elif VWM == 2 - AXPBY(result.x, alpha, xval.x, beta, yval.x); - AXPBY(result.y, alpha, xval.y, beta, yval.y); - #elif VWM == 4 - AXPBY(result.x, alpha, xval.x, beta, yval.x); - AXPBY(result.y, alpha, xval.y, beta, yval.y); - AXPBY(result.z, alpha, xval.z, beta, yval.z); - AXPBY(result.w, alpha, xval.w, beta, yval.w); - #elif VWM == 8 - AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); - AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); - AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); - AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); - AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); - AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); - AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); - AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); - #elif VWM == 16 - AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); - AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); - AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); - AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); - AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); - AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); - AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); - AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); - AXPBY(result.s8, alpha, xval.s8, beta, yval.s8); - AXPBY(result.s9, alpha, xval.s9, beta, yval.s9); - AXPBY(result.sA, alpha, xval.sA, beta, yval.sA); - AXPBY(result.sB, alpha, xval.sB, beta, yval.sB); - AXPBY(result.sC, alpha, xval.sC, beta, yval.sC); - AXPBY(result.sD, alpha, xval.sD, beta, yval.sD); - AXPBY(result.sE, alpha, xval.sE, beta, yval.sE); - AXPBY(result.sF, alpha, xval.sF, beta, yval.sF); - #endif - cgm[index] = result; - } - } -} - -// ================================================================================================= - -// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above. -inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, - const __global realM* restrict agm, const __global realN* restrict bgm, - __global realM* cgm, realM cpm[NWI][MWI/VWM] - #if SA == 1 && SB == 1 - , __local realM* alm, __local realN* blm - #elif SA == 1 - , __local realM* alm - #elif SB == 1 - , __local realN* blm - #endif - ) { - - // Allocates workitem-private memory (registers) - realM apm[MWI/VWM]; - realN bpm[NWI/VWN]; - - // Combined thread identifier (volatile to disable caching) - #if SA == 1 || SB == 1 - volatile int tid = get_local_id(0) + MDIMC*get_local_id(1); - #endif - - // Initializes the accumulation registers - InitAccRegisters(cpm); - - // Loops over all workgroup tiles - for (int kwg=0; kwg<kSizeK; kwg+=KWG) { - // Loads data: off-chip --> local (matrix A) - #if SA == 1 - GlobalToLocalA(agm, alm, kSizeM, tid, kwg); - #endif - // Loads data: off-chip --> local (matrix B) - #if SB == 1 - GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); - #endif - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - - // Loops over all workitem tiles, unrolled by a factor KWI - for (int pwi=0; pwi<KWG; pwi+=KWI) { - #pragma unroll - for (int pit=0; pit<KWI; ++pit) { - #if SA == 0 || SB == 0 - int idk = kwg + pwi + pit; - #endif - #if SA == 1 || SB == 1 - int kg = pwi+pit; - #endif - - // Loads data: local --> private (matrix A) - #if SA == 1 - LocalToPrivateA(alm, apm, kg); - // Loads data: off-chip --> private (matrix A) - #else - GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); + // The final multiplication with alpha (in case beta == 0) + if (IsZero(beta)) { + #if VWM == 1 + Multiply(result, alpha, xval); + #elif VWM == 2 + Multiply(result.x, alpha, xval.x); + Multiply(result.y, alpha, xval.y); + #elif VWM == 4 + Multiply(result.x, alpha, xval.x); + Multiply(result.y, alpha, xval.y); + Multiply(result.z, alpha, xval.z); + Multiply(result.w, alpha, xval.w); + #elif VWM == 8 + Multiply(result.s0, alpha, xval.s0); + Multiply(result.s1, alpha, xval.s1); + Multiply(result.s2, alpha, xval.s2); + Multiply(result.s3, alpha, xval.s3); + Multiply(result.s4, alpha, xval.s4); + Multiply(result.s5, alpha, xval.s5); + Multiply(result.s6, alpha, xval.s6); + Multiply(result.s7, alpha, xval.s7); + #elif VWM == 16 + Multiply(result.s0, alpha, xval.s0); + Multiply(result.s1, alpha, xval.s1); + Multiply(result.s2, alpha, xval.s2); + Multiply(result.s3, alpha, xval.s3); + Multiply(result.s4, alpha, xval.s4); + Multiply(result.s5, alpha, xval.s5); + Multiply(result.s6, alpha, xval.s6); + Multiply(result.s7, alpha, xval.s7); + Multiply(result.s8, alpha, xval.s8); + Multiply(result.s9, alpha, xval.s9); + Multiply(result.sA, alpha, xval.sA); + Multiply(result.sB, alpha, xval.sB); + Multiply(result.sC, alpha, xval.sC); + Multiply(result.sD, alpha, xval.sD); + Multiply(result.sE, alpha, xval.sE); + Multiply(result.sF, alpha, xval.sF); #endif + } - // Loads data: local --> private (matrix B) - #if SB == 1 - LocalToPrivateB(blm, bpm, kg); - // Loads data: off-chip --> private (matrix B) - #else - GlobalToPrivateB(bgm, bpm, kSizeN, idk); + // The final multiplication with alpha and the addition with beta*C + else { + realM yval = cgm[index]; + #if VWM == 1 + AXPBY(result, alpha, xval, beta, yval); + #elif VWM == 2 + AXPBY(result.x, alpha, xval.x, beta, yval.x); + AXPBY(result.y, alpha, xval.y, beta, yval.y); + #elif VWM == 4 + AXPBY(result.x, alpha, xval.x, beta, yval.x); + AXPBY(result.y, alpha, xval.y, beta, yval.y); + AXPBY(result.z, alpha, xval.z, beta, yval.z); + AXPBY(result.w, alpha, xval.w, beta, yval.w); + #elif VWM == 8 + AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); + AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); + AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); + AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); + AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); + AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); + AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); + AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); + #elif VWM == 16 + AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); + AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); + AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); + AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); + AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); + AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); + AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); + AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); + AXPBY(result.s8, alpha, xval.s8, beta, yval.s8); + AXPBY(result.s9, alpha, xval.s9, beta, yval.s9); + AXPBY(result.sA, alpha, xval.sA, beta, yval.sA); + AXPBY(result.sB, alpha, xval.sB, beta, yval.sB); + AXPBY(result.sC, alpha, xval.sC, beta, yval.sC); + AXPBY(result.sD, alpha, xval.sD, beta, yval.sD); + AXPBY(result.sE, alpha, xval.sE, beta, yval.sE); + AXPBY(result.sF, alpha, xval.sF, beta, yval.sF); #endif - - // Performs the accumulation (Cpm += Apm * Bpm) - MultiplyAccumulate(cpm, apm, bpm); } + cgm[index] = result; } - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - } - #if GLOBAL_MEM_FENCE == 1 - barrier(CLK_GLOBAL_MEM_FENCE); - #endif -} - -// ================================================================================================= -// The upper-triangular and lower-triangular kernels are only used in special cases -#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) - -// Main entry point of the kernel. This is the upper-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmUpper(const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); - - // Skip these threads if they do not contain threads contributing to the upper-triangle - if (GetGroupID1()*NWG < GetGroupID0()*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); -} - -// Main entry point of the kernel. This is the lower-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmLower(const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); - - // Skip these threads if they do not contain threads contributing to the lower-triangle - if (GetGroupID1()*NWG > GetGroupID0()*MWG) { - return; } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); -} - -// ================================================================================================= -// If not using a triangular version, include the regular kernel -#else - -// Main entry point of the kernel. This is the regular full version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeM, alpha, beta); } -#endif // ================================================================================================= // End of the C++11 raw string literal diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl new file mode 100644 index 00000000..a5faef5a --- /dev/null +++ b/src/kernels/level3/xgemm_part3.opencl @@ -0,0 +1,229 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This is part 3 of 3 of the GEMM kernel. See part 1 for more information. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above. +inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, + const __global realM* restrict agm, const __global realN* restrict bgm, + __global realM* cgm, realM cpm[NWI][MWI/VWM] + #if SA == 1 && SB == 1 + , __local realM* alm, __local realN* blm + #elif SA == 1 + , __local realM* alm + #elif SB == 1 + , __local realN* blm + #endif + ) { + + // Allocates workitem-private memory (registers) + realM apm[MWI/VWM]; + realN bpm[NWI/VWN]; + + // Combined thread identifier (volatile to disable caching) + #if SA == 1 || SB == 1 + volatile int tid = get_local_id(0) + MDIMC*get_local_id(1); + #endif + + // Initializes the accumulation registers + InitAccRegisters(cpm); + + // Loops over all workgroup tiles + for (int kwg=0; kwg<kSizeK; kwg+=KWG) { + + // Loads data: off-chip --> local (matrix A) + #if SA == 1 + GlobalToLocalA(agm, alm, kSizeM, tid, kwg); + #endif + // Loads data: off-chip --> local (matrix B) + #if SB == 1 + GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); + #endif + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + + // Loops over all workitem tiles, unrolled by a factor KWI + for (int pwi=0; pwi<KWG; pwi+=KWI) { + #pragma unroll + for (int pit=0; pit<KWI; ++pit) { + #if SA == 0 || SB == 0 + int idk = kwg + pwi + pit; + #endif + #if SA == 1 || SB == 1 + int kg = pwi+pit; + #endif + + // Loads data: local --> private (matrix A) + #if SA == 1 + LocalToPrivateA(alm, apm, kg); + // Loads data: off-chip --> private (matrix A) + #else + GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); + #endif + + // Loads data: local --> private (matrix B) + #if SB == 1 + LocalToPrivateB(blm, bpm, kg); + // Loads data: off-chip --> private (matrix B) + #else + GlobalToPrivateB(bgm, bpm, kSizeN, idk); + #endif + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulate(cpm, apm, bpm); + } + } + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + } + #if GLOBAL_MEM_FENCE == 1 + barrier(CLK_GLOBAL_MEM_FENCE); + #endif +} + +// ================================================================================================= +// The upper-triangular and lower-triangular kernels are only used in special cases +#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) + +// Main entry point of the kernel. This is the upper-triangular version. +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmUpper(const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Skip these threads if they do not contain threads contributing to the upper-triangle + if (GetGroupID1()*NWG < GetGroupID0()*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// Main entry point of the kernel. This is the lower-triangular version. +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmLower(const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Skip these threads if they do not contain threads contributing to the lower-triangle + if (GetGroupID1()*NWG > GetGroupID0()*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// ================================================================================================= +// If not using a triangular version, include the regular kernel +#else + +// Main entry point of the kernel. This is the regular full version. +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeM, alpha, beta); +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routine.cpp b/src/routine.cpp index 189ae190..d938d66f 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -14,6 +14,7 @@ #include <string> #include <vector> #include <chrono> +#include <cstdlib> #include "routine.hpp" @@ -42,13 +43,19 @@ StatusCode Routine::SetUp() { // Queries the cache to see whether or not the program (context-specific) is already there if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; } + // Sets the build options from an environmental variable (if set) + auto options = std::vector<std::string>(); + const auto environment_variable = std::getenv("CLBLAST_BUILD_OPTIONS"); + if (environment_variable != nullptr) { + options.push_back(std::string(environment_variable)); + } + // Queries the cache to see whether or not the binary (device-specific) is already there. If it // is, a program is created and stored in the cache if (BinaryIsInCache(device_name_, precision_, routine_name_)) { try { auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_); auto program = Program(device_, context_, binary); - auto options = std::vector<std::string>(); program.Build(device_, options); StoreProgramToCache(program, context_, precision_, routine_name_); } catch (...) { return StatusCode::kBuildProgramFailure; } @@ -115,7 +122,6 @@ StatusCode Routine::SetUp() { // Compiles the kernel try { auto program = Program(context_, source_string); - auto options = std::vector<std::string>(); const auto build_status = program.Build(device_, options); // Checks for compiler crashes/errors/warnings diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index a85f55b5..90e43fe4 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -34,6 +34,7 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/convert_hermitian.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_direct.opencl" ; } diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index 1ba6080f..ba770065 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -31,6 +31,7 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_part3.opencl" ; } diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index 0fa1b7b1..3063f3bc 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -31,6 +31,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_part3.opencl" ; } diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index 5a90a5a2..158cd9e5 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -31,6 +31,7 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_part3.opencl" ; } diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index 46b96b76..e1a72ef6 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -31,6 +31,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_part3.opencl" ; } diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 898b8435..4cb7fd00 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -7,7 +7,9 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. +// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations: +// - V==1: This tests some limited set of tuning parameters exhaustively. +// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset. // // ================================================================================================= @@ -21,18 +23,19 @@ namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class -template <typename T> +template <typename T, int V> class TuneXgemm { public: // The representative kernel and the source code - static std::string KernelFamily() { return "xgemm"; } + static std::string KernelFamily() { return (V==1) ? "xgemm_1" : "xgemm_2"; } static std::string KernelName() { return "Xgemm"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" #include "../src/kernels/level3/xgemm_part1.opencl" #include "../src/kernels/level3/xgemm_part2.opencl" + #include "../src/kernels/level3/xgemm_part3.opencl" ; } @@ -48,7 +51,7 @@ class TuneXgemm { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1024; } - static double DefaultFraction() { return 2048.0; } + static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel @@ -60,20 +63,38 @@ class TuneXgemm { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "KWG", {16, 32}); - tuner.AddParameter(id, "MDIMC", {8, 16, 32}); - tuner.AddParameter(id, "NDIMC", {8, 16, 32}); - tuner.AddParameter(id, "MDIMA", {8, 16, 32}); - tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2, 8}); - tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); - tuner.AddParameter(id, "STRM", {0, 1}); - tuner.AddParameter(id, "STRN", {0, 1}); - tuner.AddParameter(id, "SA", {0, 1}); - tuner.AddParameter(id, "SB", {0, 1}); + if (V==1) { // limited subset of tuning parameters - but explorable exhaustively + tuner.AddParameter(id, "MWG", {16, 32, 64}); + tuner.AddParameter(id, "NWG", {16, 32, 64}); + tuner.AddParameter(id, "KWG", {32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2}); + tuner.AddParameter(id, "VWM", {1, 2, 4}); + tuner.AddParameter(id, "VWN", {1, 2, 4}); + tuner.AddParameter(id, "STRM", {0}); + tuner.AddParameter(id, "STRN", {0}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + } // a lot more tuning parameters - has to be sampled randomly, too much to test all + else { + tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "KWG", {16, 32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2}); + tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); + tuner.AddParameter(id, "STRM", {0, 1}); + tuner.AddParameter(id, "STRN", {0, 1}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + } } // Sets the constraints @@ -92,6 +113,14 @@ class TuneXgemm { // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); + + // Extra constraints for variation 1 to limit the set of options significantly + if (V==1) { + auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; }; + tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"}); + tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"}); + tuner.AddConstraint(id, IsEqual, {"SA", "SB"}); + } } // Sets the local memory size @@ -145,15 +174,22 @@ class TuneXgemm { using float2 = clblast::float2; using double2 = clblast::double2; -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { +// Function to tune a specific variation V (not within the clblast namespace) +template <int V> +void StartVariation(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half>, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float>, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double>, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2>, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemm<double2>, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half,V>, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float,V>, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double,V>, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2,V>, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemm<double2,V>, double2>(argc, argv); break; } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); return 0; } diff --git a/src/utilities.cpp b/src/utilities.cpp index 11a6c439..77bc72d7 100644 --- a/src/utilities.cpp +++ b/src/utilities.cpp @@ -161,6 +161,8 @@ template <typename T> T ConvertArgument(const char* value) { return static_cast<T>(std::stoi(value)); } +template size_t ConvertArgument(const char* value); + template <> half ConvertArgument(const char* value) { return FloatToHalf(static_cast<float>(std::stod(value))); } @@ -179,6 +181,15 @@ template <> double2 ConvertArgument(const char* value) { return double2{val, val}; } +// Variant of "ConvertArgument" with default values +template <typename T> +T ConvertArgument(const char* value, T default_value) { + + if (value) { return ConvertArgument<T>(value); } + return default_value; +} +template size_t ConvertArgument(const char* value, size_t default_value); + // This function matches patterns in the form of "-option value" or "--option value". It returns a // default value in case the option is not found in the argument string. template <typename T> diff --git a/src/utilities.hpp b/src/utilities.hpp index 700d30d6..75bd5a69 100644 --- a/src/utilities.hpp +++ b/src/utilities.hpp @@ -187,6 +187,10 @@ std::string ToString(T value); template <typename T> T ConvertArgument(const char* value); +// Variant of "ConvertArgument" with default values +template <typename T> +T ConvertArgument(const char* value, T default_value); + // Basic argument parser, matching patterns in the form of "-option value" and "--option value" template <typename T> T GetArgument(const int argc, char **argv, std::string &help, diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp index 92e2c1b8..362c5c2c 100644 --- a/test/correctness/tester.cpp +++ b/test/correctness/tester.cpp @@ -15,6 +15,7 @@ #include <vector> #include <iostream> #include <cmath> +#include <cstdlib> #include "test/correctness/tester.hpp" @@ -27,8 +28,8 @@ template <typename T, typename U> Tester<T,U>::Tester(int argc, char *argv[], const bool silent, const std::string &name, const std::vector<std::string> &options): help_("Options given/available:\n"), - platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))), - device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))), + platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})))), + device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})))), context_(Context(device_)), queue_(Queue(context_, device_)), full_test_(CheckArgument(argc, argv, help_, kArgFullTest)), |