Clover Git
OpenCL 1.1 software implementation
/*
 * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the copyright holder nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "kernel.h"
#include "device.h"
#include "buffer.h"
#include "program.h"
#include "builtins.h"

#include "../kernel.h"
#include "../memobject.h"
#include "../events.h"
#include "../program.h"

#include <llvm/Function.h>
#include <llvm/Constants.h>
#include <llvm/Instructions.h>
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sys/mman.h>

using namespace Coal;

CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function)
: DeviceKernel(), p_device(device), p_kernel(kernel), p_function(function),
  p_call_function(0)
{
    pthread_mutex_init(&p_call_function_mutex, 0);
}

CPUKernel::~CPUKernel()
{
    if (p_call_function)
        p_call_function->eraseFromParent();

    pthread_mutex_destroy(&p_call_function_mutex);
}

size_t CPUKernel::workGroupSize() const
{
    return 0; // TODO
}

cl_ulong CPUKernel::localMemSize() const
{
    return 0; // TODO
}

cl_ulong CPUKernel::privateMemSize() const
{
    return 0; // TODO
}

size_t CPUKernel::preferredWorkGroupSizeMultiple() const
{
    return 0; // TODO
}

template<typename T>
T k_exp(T base, unsigned int e)
{
    T rs = base;

    for (unsigned int i=1; i<e; ++i)
        rs *= base;

    return rs;
}
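/* A quick check of k_exp's behavior (a standalone sketch, not Clover code).
 * Note that e == 0 returns base rather than 1; this is harmless in
 * guessWorkGroupSize() below, which calls it with num_dims >= 1 to estimate
 * the total number of work-items, assuming every dimension has roughly the
 * same size:
 */
#include <cassert>

static void k_exp_demo()
{
    assert(k_exp(4u, 1) == 4u);  // 4^1
    assert(k_exp(4u, 3) == 64u); // 4^3
    assert(k_exp(4u, 0) == 4u);  // Quirk: the loop body never runs
}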
// Try to find the work-group size that executes the fastest on the CPU.
size_t CPUKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
                                     size_t global_work_size) const
{
    unsigned int cpus = p_device->numCPUs();

    // Don't break the range into parts that are too small
    if (k_exp(global_work_size, num_dims) > 64)
        return global_work_size;

    // Find the divisor of global_work_size closest to cpus, but >= it
    unsigned int divisor = cpus;

    while (true)
    {
        if ((global_work_size % divisor) == 0)
            break;

        // Don't let the loop go up to global_work_size, the overhead would be
        // too huge
        if (divisor > global_work_size || divisor > cpus * 32)
        {
            divisor = 1; // Not parallel, but has no CommandQueue overhead
            break;
        }

        divisor += 1; // Try the next candidate divisor
    }

    // Return the size
    return global_work_size / divisor;
}

llvm::Function *CPUKernel::function() const
{
    return p_function;
}

Kernel *CPUKernel::kernel() const
{
    return p_kernel;
}

CPUDevice *CPUKernel::device() const
{
    return p_device;
}

// From Wikipedia: http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two
template <class T>
T next_power_of_two(T k)
{
    if (k == 0)
        return 1;

    k--;

    for (unsigned int i=1; i<sizeof(T)*8; i<<=1)
        k = k | k >> i;

    return k+1;
}

size_t CPUKernel::typeOffset(size_t &offset, size_t type_len)
{
    size_t rs = offset;

    // Align offset to type_len
    type_len = next_power_of_two(type_len);
    size_t mask = ~(type_len - 1);

    while ((rs & mask) != rs) // Round rs up to the next multiple of type_len
        rs++;

    // Where to try to place the next value
    offset = rs + type_len;

    return rs;
}
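/* A standalone check of what typeOffset computes (not Clover code; assumes a
 * 64-bit target where pointers are 8 bytes wide and that the declarations
 * from kernel.h are visible). It lays out an int argument followed by a
 * pointer argument, the same packing used by the stub in callFunction()
 * below:
 */
#include <cstddef>
#include <cstdio>

static void typeoffset_demo()
{
    size_t offset = 0;

    size_t int_off = CPUKernel::typeOffset(offset, sizeof(int));    // 0
    size_t ptr_off = CPUKernel::typeOffset(offset, sizeof(void *)); // 8, rounded up from 4

    // Prints "int at 0, pointer at 8, next free offset 16"
    std::printf("int at %zu, pointer at %zu, next free offset %zu\n",
                int_off, ptr_off, offset);
}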
llvm::Function *CPUKernel::callFunction()
{
    pthread_mutex_lock(&p_call_function_mutex);

    // If we can reuse the same function between work groups, do it
    if (p_call_function)
    {
        llvm::Function *rs = p_call_function;
        pthread_mutex_unlock(&p_call_function_mutex);

        return rs;
    }

    /* Create a stub function of the form
     *
     * void stub(void *args) {
     *     kernel(*(int *)((char *)args + 0),
     *            *(float **)((char *)args + sizeof(int)),
     *            *(sampler_t *)((char *)args + sizeof(int) + sizeof(float *)));
     * }
     *
     * In LLVM, it is expressed as:
     *
     * @stub(i8* args) {
     *     kernel(
     *         load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)),
     *         load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)),
     *         ...
     *     );
     * }
     */
    llvm::FunctionType *kernel_function_type = p_function->getFunctionType();
    llvm::FunctionType *stub_function_type = llvm::FunctionType::get(
        p_function->getReturnType(),
        llvm::Type::getInt8PtrTy(
            p_function->getContext()),
        false);
    llvm::Function *stub_function = llvm::Function::Create(
        stub_function_type,
        llvm::Function::InternalLinkage,
        "",
        p_function->getParent());

    // Insert a basic block
    llvm::BasicBlock *basic_block = llvm::BasicBlock::Create(
        p_function->getContext(),
        "",
        stub_function);

    // Create the function arguments
    llvm::Argument &stub_arg = stub_function->getArgumentList().front();
    llvm::SmallVector<llvm::Value *, 8> args;
    size_t args_offset = 0;

    for (unsigned int i=0; i<kernel_function_type->getNumParams(); ++i)
    {
        llvm::Type *param_type = kernel_function_type->getParamType(i);
        llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value
        const Kernel::Arg &arg = p_kernel->arg(i);

        // Calculate the size of the arg
        size_t arg_size = arg.valueSize() * arg.vecDim();

        // Get where to place this argument
        size_t arg_offset = typeOffset(args_offset, arg_size);

        // %1 = getelementptr(args, $arg_offset);
        llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds(
            &stub_arg,
            llvm::ConstantInt::get(stub_function->getContext(),
                                   llvm::APInt(64, arg_offset)),
            "",
            basic_block);

        // %2 = bitcast(%1, $param_type_ptr)
        llvm::Value *bitcast = new llvm::BitCastInst(
            getelementptr,
            param_type_ptr,
            "",
            basic_block);

        // %3 = load(%2)
        llvm::Value *load = new llvm::LoadInst(
            bitcast,
            "",
            false,
            arg_size, // An argument is always aligned on its size; this enables things like fast movaps
            basic_block);

        // We have the value, send it to the function
        args.push_back(load);
    }

    // Create the call instruction
    llvm::CallInst *call_inst = llvm::CallInst::Create(
        p_function,
        args,
        "",
        basic_block);
    call_inst->setCallingConv(p_function->getCallingConv());
    call_inst->setTailCall();

    // Create a return instruction to end the stub
    llvm::ReturnInst::Create(
        p_function->getContext(),
        basic_block);

    // Retain the function so it can be reused
    p_call_function = stub_function;

    pthread_mutex_unlock(&p_call_function_mutex);

    return stub_function;
}
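/* The net effect of callFunction() is that every kernel, whatever its
 * signature, becomes callable through a single void (*)(void *) entry point.
 * A standalone C++ sketch of the same convention (hypothetical kernel and
 * names, not Clover code; offsets follow typeOffset above on a 64-bit
 * target):
 */
#include <cstring>
#include <cstdio>

static void my_kernel(int n, float *data) // A hypothetical kernel
{
    std::printf("n=%d, first=%f\n", n, data[0]);
}

static void my_stub(void *args) // What the generated LLVM stub boils down to
{
    my_kernel(*(int *)((char *)args + 0),
              *(float **)((char *)args + 8)); // The pointer is aligned on 8 bytes
}

static void stub_demo()
{
    float buf[1] = { 3.14f };
    float *p = buf;
    int n = 1;
    char args[16];

    std::memcpy(args + 0, &n, sizeof(n));
    std::memcpy(args + 8, &p, sizeof(p));

    my_stub(args); // One uniform entry point, as used by run() below
}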
/*
 * CPUKernelEvent
 */
CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event)
: p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0),
  p_kernel_args(0)
{
    // Mutex
    pthread_mutex_init(&p_mutex, 0);

    // Set the current work group to (0, 0, ..., 0)
    std::memset(p_current_work_group, 0, event->work_dim() * sizeof(size_t));

    // Populate p_max_work_groups
    p_num_wg = 1;

    for (cl_uint i=0; i<event->work_dim(); ++i)
    {
        p_max_work_groups[i] =
            (event->global_work_size(i) / event->local_work_size(i)) - 1; // 0..n-1, not 1..n

        p_num_wg *= p_max_work_groups[i] + 1;
    }
}

CPUKernelEvent::~CPUKernelEvent()
{
    pthread_mutex_destroy(&p_mutex);

    if (p_kernel_args)
        std::free(p_kernel_args);
}

bool CPUKernelEvent::reserve()
{
    // Lock; this will be unlocked in takeInstance()
    pthread_mutex_lock(&p_mutex);

    // True if this is the last work group (current == count - 1)
    return (p_current_wg == p_num_wg - 1);
}

bool CPUKernelEvent::finished()
{
    bool rs;

    pthread_mutex_lock(&p_mutex);

    rs = (p_finished_wg == p_num_wg);

    pthread_mutex_unlock(&p_mutex);

    return rs;
}

void CPUKernelEvent::workGroupFinished()
{
    pthread_mutex_lock(&p_mutex);

    p_finished_wg++;

    pthread_mutex_unlock(&p_mutex);
}

CPUKernelWorkGroup *CPUKernelEvent::takeInstance()
{
    CPUKernelWorkGroup *wg = new CPUKernelWorkGroup((CPUKernel *)p_event->deviceKernel(),
                                                    p_event,
                                                    this,
                                                    p_current_work_group);

    // Increment the current work group
    incVec(p_event->work_dim(), p_current_work_group, p_max_work_groups);
    p_current_wg += 1;

    // Release the event
    pthread_mutex_unlock(&p_mutex);

    return wg;
}

void *CPUKernelEvent::kernelArgs() const
{
    return p_kernel_args;
}

void CPUKernelEvent::cacheKernelArgs(void *args)
{
    p_kernel_args = args;
}

/*
 * CPUKernelWorkGroup
 */
CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
                                       CPUKernelEvent *cpu_event,
                                       const size_t *work_group_index)
: p_kernel(kernel), p_cpu_event(cpu_event), p_event(event),
  p_work_dim(event->work_dim()), p_contexts(0), p_stack_size(8192 /* TODO */),
  p_had_barrier(false)
{
    // Set the index
    std::memcpy(p_index, work_group_index, p_work_dim * sizeof(size_t));

    // Set the maximums and the global id start offsets
    p_num_work_items = 1;

    for (unsigned int i=0; i<p_work_dim; ++i)
    {
        p_max_local_id[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n
        p_num_work_items *= event->local_work_size(i);

        // Set the global id
        p_global_id_start_offset[i] = (p_index[i] * event->local_work_size(i))
                                      + event->global_work_offset(i);
    }
}

CPUKernelWorkGroup::~CPUKernelWorkGroup()
{
    p_cpu_event->workGroupFinished();
}
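/* A worked example of the indexing set up by this constructor (hypothetical
 * numbers, not Clover code): a 2D NDRange with global size 8x8, local size
 * 4x4 and no global offset yields work groups indexed (0..1, 0..1); the
 * group at index (1, 0) covers local ids (0..3, 0..3) and its global ids
 * start at (4, 0):
 */
#include <cstddef>
#include <cstdio>

static void ndrange_demo()
{
    const size_t local[2] = { 4, 4 }; // local_work_size
    const size_t index[2] = { 1, 0 }; // p_index, the work-group index

    for (unsigned int i = 0; i < 2; ++i)
    {
        size_t max_local_id = local[i] - 1;        // p_max_local_id[i]
        size_t global_start = index[i] * local[i]; // p_global_id_start_offset[i], offset 0
        std::printf("dim %u: local ids 0..%zu, global ids start at %zu\n",
                    i, max_local_id, global_start);
    }
}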
void *CPUKernelWorkGroup::callArgs(std::vector<void *> &locals_to_free)
{
    if (p_cpu_event->kernelArgs() && !p_kernel->kernel()->hasLocals())
    {
        // We have cached the args and can reuse them
        return p_cpu_event->kernelArgs();
    }

    // We need to create them from scratch
    void *rs;

    size_t args_size = 0;

    for (unsigned int i=0; i<p_kernel->kernel()->numArgs(); ++i)
    {
        const Kernel::Arg &arg = p_kernel->kernel()->arg(i);
        CPUKernel::typeOffset(args_size, arg.valueSize() * arg.vecDim());
    }

    rs = std::malloc(args_size);

    if (!rs)
        return NULL;

    size_t arg_offset = 0;

    for (unsigned int i=0; i<p_kernel->kernel()->numArgs(); ++i)
    {
        const Kernel::Arg &arg = p_kernel->kernel()->arg(i);
        size_t size = arg.valueSize() * arg.vecDim();
        size_t offset = CPUKernel::typeOffset(arg_offset, size);

        // Where to place the argument
        unsigned char *target = (unsigned char *)rs;
        target += offset;

        // We may have to transform the values (buffers, etc)
        switch (arg.kind())
        {
            case Kernel::Arg::Buffer:
            {
                MemObject *buffer = *(MemObject **)arg.data();

                if (arg.file() == Kernel::Arg::Local)
                {
                    // Allocate a buffer and pass it to the kernel
                    void *local_buffer = std::malloc(arg.allocAtKernelRuntime());
                    locals_to_free.push_back(local_buffer);
                    *(void **)target = local_buffer;
                }
                else
                {
                    if (!buffer)
                    {
                        // A NULL buffer is allowed: just pass NULL to the kernel
                        *(void **)target = NULL;
                    }
                    else
                    {
                        // Get the CPU buffer, allocate it and get its pointer
                        CPUBuffer *cpubuf =
                            (CPUBuffer *)buffer->deviceBuffer(p_kernel->device());
                        void *buf_ptr = 0;

                        buffer->allocate(p_kernel->device());
                        buf_ptr = cpubuf->data();

                        *(void **)target = buf_ptr;
                    }
                }

                break;
            }
            case Kernel::Arg::Image2D:
            case Kernel::Arg::Image3D:
            {
                // We need to ensure the image is allocated
                Image2D *image = *(Image2D **)arg.data();
                image->allocate(p_kernel->device());

                // Fall through to the memcpy
            }
            default:
                // Simply copy the arg's data into the buffer
                std::memcpy(target, arg.data(), size);
                break;
        }
    }

    // Cache the arguments if we can do so
    if (!p_kernel->kernel()->hasLocals())
        p_cpu_event->cacheKernelArgs(rs);

    return rs;
}
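/* incVec() is defined elsewhere in Clover. A minimal sketch of the
 * odometer-style increment its uses here imply (assumed semantics: advance
 * the first dimension, carry into the next on overflow, and return true once
 * every dimension has wrapped, which is how takeInstance() above and run()
 * below detect the end of the iteration):
 */
static bool inc_vec_sketch(unsigned int dims, size_t *vec, const size_t *maxs)
{
    for (unsigned int i = 0; i < dims; ++i)
    {
        if (vec[i] < maxs[i])
        {
            vec[i] += 1;  // This "digit" can still be incremented
            return false; // Not finished yet
        }

        vec[i] = 0;       // Overflow: reset and carry into the next dimension
    }

    return true;          // Every dimension wrapped: the iteration is over
}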
bool CPUKernelWorkGroup::run()
{
    // Get the kernel function to call
    std::vector<void *> locals_to_free;
    llvm::Function *kernel_func = p_kernel->callFunction();

    if (!kernel_func)
        return false;

    Program *p = (Program *)p_kernel->kernel()->parent();
    CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device()));

    p_kernel_func_addr =
        (void(*)(void *))prog->jit()->getPointerToFunction(kernel_func);

    // Get the arguments
    p_args = callArgs(locals_to_free);

    if (!p_args)
        return false; // callArgs() can fail when malloc fails

    // Tell the builtins this thread will run a kernel work group
    setThreadLocalWorkGroup(this);

    // Initialize the dummy context used by the builtins before a call to barrier()
    p_current_work_item = 0;
    p_current_context = &p_dummy_context;

    std::memset(p_dummy_context.local_id, 0, p_work_dim * sizeof(size_t));

    do
    {
        // Simply call the "call function"; it and the builtins will do the rest
        p_kernel_func_addr(p_args);
    } while (!p_had_barrier &&
             !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id));

    // If no barrier() call was made, all is fine. Otherwise, only the first
    // work-item has currently finished; we must let the others run.
    if (p_had_barrier)
    {
        Context *main_context = p_current_context; // After the first swapcontext,
                                                   // we will not be able to trust
                                                   // p_current_context anymore.

        // We'll call swapcontext for each remaining work-item. They will
        // finish, and when they do so, this main context will be resumed, so
        // it's easy (i starts from 1 because work-item 0 already finished)
        for (unsigned int i=1; i<p_num_work_items; ++i)
        {
            Context *ctx = getContextAddr(i);
            swapcontext(&main_context->context, &ctx->context);
        }
    }

    // Free the allocated locals
    if (p_kernel->kernel()->hasLocals())
    {
        for (size_t i=0; i<locals_to_free.size(); ++i)
        {
            std::free(locals_to_free[i]);
        }

        std::free(p_args);
    }

    return true;
}

CPUKernelWorkGroup::Context *CPUKernelWorkGroup::getContextAddr(unsigned int index)
{
    size_t size;
    char *data = (char *)p_contexts;

    // Each Context in data is an element of size p_stack_size + sizeof(Context)
    size = p_stack_size + sizeof(Context);
    size *= index; // To get an offset

    return (Context *)(data + size); // Pointer to the context
}
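/* The layout getContextAddr() implies for p_contexts: each work-item gets a
 * Context header immediately followed by its private stack, so slot i starts
 * at i * (p_stack_size + sizeof(Context)):
 *
 *   | Context 0 | stack 0 | Context 1 | stack 1 | ... | Context n-1 | stack n-1 |
 *
 * A standalone sketch of that address arithmetic (placeholder Context struct,
 * not Clover code):
 */
#include <cstdio>
#include <cstdlib>

struct DemoContext { char placeholder[64]; }; // Stands in for CPUKernelWorkGroup::Context

static void context_layout_demo()
{
    const size_t stack_size = 8192; // Matches the hardcoded p_stack_size above
    const unsigned int items = 4;
    char *pool = (char *)std::malloc(items * (stack_size + sizeof(DemoContext)));

    for (unsigned int i = 0; i < items; ++i)
    {
        DemoContext *ctx = (DemoContext *)(pool + i * (stack_size + sizeof(DemoContext)));
        std::printf("context %u at offset %zu\n", i, (size_t)((char *)ctx - pool));
    }

    std::free(pool);
}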