Skip to content
This repository was archived by the owner on Jan 10, 2023. It is now read-only.

optimize opencl kernel building time (still has problem in win10) #34

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 129 additions & 31 deletions src/gpu/kernels_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,11 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code&
return std::move(scode);
}

kernels_cache::kernels_cache(gpu_toolkit& context): _context(context) {}
kernels_cache::kernels_cache(gpu_toolkit& context) : _context(context)
{
set_exepath();
set_cachemode();
}

kernels_cache::kernel_id kernels_cache::set_kernel_source(const std::shared_ptr<kernel_selector::kernel_string>& kernel_string, bool dump_custom_program, bool one_time_kernel)
{
Expand Down Expand Up @@ -214,7 +218,7 @@ kernels_cache::kernel_id kernels_cache::set_kernel_source(const std::shared_ptr<
return id;
}

kernels_cache::kernels_map kernels_cache::build_program(const program_code& program_source) const
kernels_cache::kernels_map kernels_cache::build_program(const program_code& program_source,bool bfause_build) const
{
static uint32_t current_file_index = 0;

Expand All @@ -236,6 +240,15 @@ kernels_cache::kernels_map kernels_cache::build_program(const program_code& prog
std::string err_log; //accumulated build log from all program's parts (only contains messages from parts which failed to compile)

uint32_t part_idx = 0;
uint32_t totoal_source = 0;
uint32_t curid = 0;
if (CLCACHE_NOCACHE != _cur_cachemode) {
for (const auto& sources : program_source.source) {
for (const std::string& s : sources) {
totoal_source += uint32_t(s.length());
}
}
}
for (const auto& sources : program_source.source)
{
auto current_dump_file_name = dump_file_name + std::to_string(part_idx++) + ".cl";
Expand All @@ -250,27 +263,55 @@ kernels_cache::kernels_map kernels_cache::build_program(const program_code& prog

try
{
cl::Program program(_context.context(), sources);
program.build({ _context.device() }, program_source.options.c_str());

if (dump_sources)
{
dump_file.get() << "\n/* Build Log:\n";
for (auto& p : program.getBuildInfo<CL_PROGRAM_BUILD_LOG>())
dump_file.get() << p.second << "\n";

dump_file.get() << "*/\n";
}

cl::vector<cl::Kernel> kernels;
program.createKernels(&kernels);
cl::vector<cl::Kernel> kernels;
std::string binfilename = _fmodule_path + get_clcachename(totoal_source, program_source.kernels_counter, curid++, program_source.options, ".clbin");
std::ifstream finbin(binfilename, std::ios::binary);

if (!finbin || CLCACHE_NOCACHE == _cur_cachemode || bfause_build) {
cl::Program program(_context.context(), sources);
program.build({ _context.device() }, program_source.options.c_str());

if (dump_sources)
{
dump_file.get() << "\n/* Build Log:\n";
for (auto& p : program.getBuildInfo<CL_PROGRAM_BUILD_LOG>())
dump_file.get() << p.second << "\n";

dump_file.get() << "*/\n";
}

if (CLCACHE_CACHE == _cur_cachemode) {
//--in cache mode, save binary
std::vector<std::vector<unsigned char>> binariesVectors;
if (CL_SUCCESS == program.getInfo(CL_PROGRAM_BINARIES, &binariesVectors)) {
if (binariesVectors.size() != 1)
throw std::runtime_error("Program get fail in CL_PROGRAM_BINARIES\n");
std::ofstream fout(binfilename, std::ios::binary);
fout.write((const char*)binariesVectors[0].data(), binariesVectors[0].size());
fout.close();
}
}
program.createKernels(&kernels);
}
else {
cl::Program::Binaries binaries;
std::vector<unsigned char> buf;
finbin.seekg(0, std::ios::end);
buf.resize(finbin.tellg());
finbin.seekg(0, std::ios::beg);
finbin.read((char*)buf.data(), buf.size());
binaries.push_back(buf);
cl::Program program(_context.context(), { _context.device() }, binaries);
program.build({ _context.device() });
program.createKernels(&kernels);
}

for (auto& k : kernels)
{
auto kernel_name = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
kmap.emplace(kernel_name, k);
}
}
}
catch (const cl::BuildError& err)
{
if (dump_sources)
Expand All @@ -280,14 +321,14 @@ kernels_cache::kernels_map kernels_cache::build_program(const program_code& prog
{
if (dump_sources)
dump_file.get() << p.second << "\n";

err_log += p.second + '\n';
}

if (dump_sources)
dump_file.get() << "*/\n";
}

}

if (!err_log.empty())
Expand Down Expand Up @@ -326,25 +367,82 @@ void kernels_cache::build_all()
_one_time_kernels.clear();
for (auto& program : sorted_program_code)
{
auto kernels = build_program(program.second);
bool bfausebuild = false;
for (uint32_t i = 0; i < 2; i++)
{
kernels_map kernels = build_program(program.second, bfausebuild);

for (auto& k : kernels)
{
const auto& entry_point = k.first;
const auto& k_id = program.second.entry_point_to_id[entry_point];
if (program.second.one_time)
{
_one_time_kernels[k_id] = k.second;
}
else
for (auto& k : kernels)
{
_kernels[k_id] = k.second;
const auto& entry_point = k.first;
const auto& k_id = program.second.entry_point_to_id[entry_point]; //todo: need modify inference_engine to avoid k_id =0...
if (k_id.length()==0) { //if cache name not unique or different sequence, different batch, here will fail and rebuild.
bfausebuild = true;
break;
}
if (program.second.one_time)
{
_one_time_kernels[k_id] = k.second;
}
else
{
_kernels[k_id] = k.second;
}
}
}
if (!bfausebuild)
break;
}
}

_kernels_code.clear();
_pending_compilation = false;
}

std::string kernels_cache::get_clcachename(uint32_t total_source, uint32_t kernels_counter, uint32_t curid, const std::string& option, const std::string& extname) const
{
std::string binfilename=std::to_string(total_source)+'_';
binfilename += std::to_string(kernels_counter) + '_';
binfilename += std::to_string(curid) + '_';
binfilename += option;
binfilename += extname;

if (binfilename.length() > 255)
binfilename = binfilename.substr(0, 255);
return binfilename;
}

/*----------------------------------------------------------------------------------------
1. coder can add openvino_clcache\nocache.flag to disable cache, openvino_clcache will create automaticlly
2. if not, work in cahce mode, the cl kernel will build fristly in end-user pc and save the binary
----------------------------------------------------------------------------------------*/
void kernels_cache::set_cachemode()
{
std::ifstream fin(_fmodule_path + "nocache.flag");
if (fin) {
_cur_cachemode = CLCACHE_NOCACHE;
fin.close();
}
else
_cur_cachemode = CLCACHE_CACHE;
}

static int global_cache = 0;
#ifdef _WIN32
#include <Windows.h>
void kernels_cache::set_exepath()
{
char buf[255];
GetModuleFileName(GetModuleHandle(NULL), buf, 255);
std::string fpath(buf);
//std::string::size_type idx = fpath.rfind('\\', fpath.length());
std::string::size_type idx = fpath.rfind('.', fpath.length());
_fmodule_path = fpath.substr(0, idx) + "_openvino_clcache\\";
CreateDirectory(_fmodule_path.c_str(), NULL);
_fmodule_path = _fmodule_path + std::to_string(global_cache++) + "\\";
CreateDirectory(_fmodule_path.c_str(), NULL);
}
#else

#endif

}}
14 changes: 13 additions & 1 deletion src/gpu/kernels_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,19 @@ class kernels_cache
sorted_code get_program_source(const kernels_code& kernels_source_code) const;
friend class gpu_toolkit;
explicit kernels_cache(gpu_toolkit& context);
kernels_map build_program(const program_code& pcode) const;
kernels_map build_program(const program_code& pcode,bool bfause_build) const;
//---add for optimize opencl kernel building time-------------------------------------------------------------------------------beg
std::string _fmodule_path;
void set_exepath();
std::string get_clcachename(uint32_t total_source, uint32_t kernels_counter, uint32_t curid, const std::string& option, const std::string& extname) const;
typedef enum {
CLCACHE_NONE,
CLCACHE_NOCACHE,
CLCACHE_CACHE,
}CLCache_Mode;
void set_cachemode();
CLCache_Mode _cur_cachemode;
//---add for optimiza opencl kernel building time-------------------------------------------------------------------------------end

public:
kernel_id set_kernel_source(const std::shared_ptr<kernel_selector::kernel_string>& kernel_string, bool dump_custom_program, bool one_time_kernel);
Expand Down