// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the interface to the CLTune auto-tuner. This is only used for the optional // and stand-alone tuner binaries and not part of the core of CLBlast. // // ================================================================================================= #ifndef CLBLAST_TUNING_H_ #define CLBLAST_TUNING_H_ #include #include #include namespace clblast { // ================================================================================================= // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect // the results. Used for all types of kernel families. Note that this is a header-only function so // that it is automatically compiled for the various kernels (given as the 'C' template argument). template void Tuner(int argc, char* argv[]) { // Sets the parameters and platform/device for which to tune (command-line options) auto help = std::string{"* Options given/available:\n"}; auto args = Arguments{}; args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); for (auto &o: C::GetOptions()) { if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, C::DefaultM()); } if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, C::DefaultN()); } if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, C::DefaultK()); } if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar()); } if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar()); } if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); } } fprintf(stdout, "%s\n", help.c_str()); // Tests validity of the given arguments C::TestValidArguments(args); // Tests for validity of the precision { auto platform = Platform(args.platform_id); auto device = Device(platform, args.device_id); if (!PrecisionSupported(device)) { printf("* Unsupported precision, skipping this tuning run\n\n"); return; } } // Creates input buffers with random data auto x_vec = std::vector(C::GetSizeX(args)); auto y_vec = std::vector(C::GetSizeY(args)); auto a_mat = std::vector(C::GetSizeA(args)); auto b_mat = std::vector(C::GetSizeB(args)); auto c_mat = std::vector(C::GetSizeC(args)); auto temp = std::vector(C::GetSizeTemp(args)); PopulateVector(x_vec); PopulateVector(y_vec); PopulateVector(a_mat); PopulateVector(b_mat); PopulateVector(c_mat); PopulateVector(temp); // Initializes the tuner for the chosen device cltune::Tuner tuner(args.platform_id, args.device_id); // Use full-search to explore all parameter combinations or random-search to search only a part of // the parameter values. The fraction is set as a command-line argument. if (args.fraction == 1.0 || args.fraction == 0.0) { tuner.UseFullSearch(); } else { tuner.UseRandomSearch(1.0/args.fraction); } // Loads the kernel sources and defines the kernel to tune auto sources = C::GetSources(); auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef()); // Sets the tunable parameters and their possible values C::SetParameters(tuner, id); C::SetConstraints(tuner, id); C::SetLocalMemorySize(tuner, id, args); // Tests for a specific precision tuner.AddParameter(id, "PRECISION", {static_cast(args.precision)}); tuner.AddParameterReference("PRECISION", static_cast(args.precision)); // Modifies the thread-sizes (both global and local) based on the parameters for (auto ¶meters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); } for (auto ¶meters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); } for (auto ¶meters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); } for (auto ¶meters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); } // Sets the function's arguments C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp); // Starts the tuning process tuner.Tune(); // Prints the results to screen auto time_ms = tuner.PrintToScreen(); tuner.PrintFormatted(); // Also prints the performance of the best-case in terms of GB/s or GFLOPS if (time_ms != 0.0) { printf("[ -------> ] %.1lf ms", time_ms); printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str()); } // Outputs the results as JSON to disk, including some meta-data auto precision_string = std::to_string(static_cast(args.precision)); auto metadata = std::vector>{ {"kernel_family", C::KernelFamily()}, {"precision", precision_string} }; for (auto &o: C::GetOptions()) { if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); } if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); } if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); } if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); } if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } } tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata); } // ================================================================================================= } // namespace clblast // CLBLAST_TUNING_H_ #endif