Clover Git
OpenCL 1.1 software implementation
/*
 * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the copyright holder nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "kernel.h"
#include "device.h"
#include "buffer.h"
#include "program.h"
#include "builtins.h"

#include "../kernel.h"
#include "../memobject.h"
#include "../events.h"
#include "../program.h"

#include <llvm/Function.h>
#include <llvm/Constants.h>
#include <llvm/Instructions.h>
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sys/mman.h>

using namespace Coal;

CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function)
: DeviceKernel(), p_device(device), p_kernel(kernel), p_function(function),
  p_call_function(0)
{
    pthread_mutex_init(&p_call_function_mutex, 0);
}

CPUKernel::~CPUKernel()
{
    if (p_call_function)
        p_call_function->eraseFromParent();

    pthread_mutex_destroy(&p_call_function_mutex);
}

size_t CPUKernel::workGroupSize() const
{
    return 0; // TODO
}

cl_ulong CPUKernel::localMemSize() const
{
    return 0; // TODO
}

cl_ulong CPUKernel::privateMemSize() const
{
    return 0; // TODO
}

size_t CPUKernel::preferredWorkGroupSizeMultiple() const
{
    return 0; // TODO
}

template<typename T>
T k_exp(T base, unsigned int e)
{
    T rs = base;

    for (unsigned int i=1; i<e; ++i)
        rs *= base;

    return rs;
}
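/* A quick check of k_exp's behavior (a standalone sketch, not Clover code).
 * Note that e == 0 returns base rather than 1; this is harmless in
 * guessWorkGroupSize() below, which calls it with num_dims >= 1 to estimate
 * the total number of work-items, assuming every dimension has roughly the
 * same size:
 */
#include <cassert>

static void k_exp_demo()
{
    assert(k_exp(4u, 1) == 4u);  // 4^1
    assert(k_exp(4u, 3) == 64u); // 4^3
    assert(k_exp(4u, 0) == 4u);  // Quirk: the loop body never runs
}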
// Try to find the work-group size that executes the fastest on the CPU.
size_t CPUKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
                                     size_t global_work_size) const
{
    unsigned int cpus = p_device->numCPUs();

    // Don't break the range into parts that are too small
    if (k_exp(global_work_size, num_dims) > 64)
        return global_work_size;

    // Find the divisor of global_work_size closest to cpus, but >= it
    unsigned int divisor = cpus;

    while (true)
    {
        if ((global_work_size % divisor) == 0)
            break;

        // Don't let the loop go up to global_work_size, the overhead would be
        // too huge
        if (divisor > global_work_size || divisor > cpus * 32)
        {
            divisor = 1; // Not parallel, but has no CommandQueue overhead
            break;
        }

        divisor += 1; // Try the next candidate divisor
    }

    // Return the size
    return global_work_size / divisor;
}

llvm::Function *CPUKernel::function() const
{
    return p_function;
}

Kernel *CPUKernel::kernel() const
{
    return p_kernel;
}

CPUDevice *CPUKernel::device() const
{
    return p_device;
}

// From Wikipedia: http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two
template <class T>
T next_power_of_two(T k)
{
    if (k == 0)
        return 1;

    k--;

    for (unsigned int i=1; i<sizeof(T)*8; i<<=1)
        k = k | k >> i;

    return k+1;
}

size_t CPUKernel::typeOffset(size_t &offset, size_t type_len)
{
    size_t rs = offset;

    // Align offset to type_len
    type_len = next_power_of_two(type_len);
    size_t mask = ~(type_len - 1);

    while ((rs & mask) != rs) // Round rs up to the next multiple of type_len
        rs++;

    // Where to try to place the next value
    offset = rs + type_len;

    return rs;
}
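/* A standalone check of what typeOffset computes (not Clover code; assumes a
 * 64-bit target where pointers are 8 bytes wide and that the declarations
 * from kernel.h are visible). It lays out an int argument followed by a
 * pointer argument, the same packing used by the stub in callFunction()
 * below:
 */
#include <cstddef>
#include <cstdio>

static void typeoffset_demo()
{
    size_t offset = 0;

    size_t int_off = CPUKernel::typeOffset(offset, sizeof(int));    // 0
    size_t ptr_off = CPUKernel::typeOffset(offset, sizeof(void *)); // 8, rounded up from 4

    // Prints "int at 0, pointer at 8, next free offset 16"
    std::printf("int at %zu, pointer at %zu, next free offset %zu\n",
                int_off, ptr_off, offset);
}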
llvm::Function *CPUKernel::callFunction()
{
    pthread_mutex_lock(&p_call_function_mutex);

    // If we can reuse the same function between work groups, do it
    if (p_call_function)
    {
        llvm::Function *rs = p_call_function;
        pthread_mutex_unlock(&p_call_function_mutex);

        return rs;
    }

    /* Create a stub function of the form
     *
     * void stub(void *args) {
     *     kernel(*(int *)((char *)args + 0),
     *            *(float **)((char *)args + sizeof(int)),
     *            *(sampler_t *)((char *)args + sizeof(int) + sizeof(float *)));
     * }
     *
     * In LLVM, it is expressed as:
     *
     * @stub(i8* args) {
     *     kernel(
     *         load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)),
     *         load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)),
     *         ...
     *     );
     * }
     */
    llvm::FunctionType *kernel_function_type = p_function->getFunctionType();
    llvm::FunctionType *stub_function_type = llvm::FunctionType::get(
        p_function->getReturnType(),
        llvm::Type::getInt8PtrTy(
            p_function->getContext()),
        false);
    llvm::Function *stub_function = llvm::Function::Create(
        stub_function_type,
        llvm::Function::InternalLinkage,
        "",
        p_function->getParent());

    // Insert a basic block
    llvm::BasicBlock *basic_block = llvm::BasicBlock::Create(
        p_function->getContext(),
        "",
        stub_function);

    // Create the function arguments
    llvm::Argument &stub_arg = stub_function->getArgumentList().front();
    llvm::SmallVector<llvm::Value *, 8> args;
    size_t args_offset = 0;

    for (unsigned int i=0; i<kernel_function_type->getNumParams(); ++i)
    {
        llvm::Type *param_type = kernel_function_type->getParamType(i);
        llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value
        const Kernel::Arg &arg = p_kernel->arg(i);

        // Calculate the size of the arg
        size_t arg_size = arg.valueSize() * arg.vecDim();

        // Get where to place this argument
        size_t arg_offset = typeOffset(args_offset, arg_size);

        // %1 = getelementptr(args, $arg_offset);
        llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds(
            &stub_arg,
            llvm::ConstantInt::get(stub_function->getContext(),
                                   llvm::APInt(64, arg_offset)),
            "",
            basic_block);

        // %2 = bitcast(%1, $param_type_ptr)
        llvm::Value *bitcast = new llvm::BitCastInst(
            getelementptr,
            param_type_ptr,
            "",
            basic_block);

        // %3 = load(%2)
        llvm::Value *load = new llvm::LoadInst(
            bitcast,
            "",
            false,
            arg_size, // An argument is always aligned on its size; this enables things like fast movaps
            basic_block);

        // We have the value, send it to the function
        args.push_back(load);
    }

    // Create the call instruction
    llvm::CallInst *call_inst = llvm::CallInst::Create(
        p_function,
        args,
        "",
        basic_block);
    call_inst->setCallingConv(p_function->getCallingConv());
    call_inst->setTailCall();

    // Create a return instruction to end the stub
    llvm::ReturnInst::Create(
        p_function->getContext(),
        basic_block);

    // Retain the function so it can be reused
    p_call_function = stub_function;

    pthread_mutex_unlock(&p_call_function_mutex);

    return stub_function;
}
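/* The net effect of callFunction() is that every kernel, whatever its
 * signature, becomes callable through a single void (*)(void *) entry point.
 * A standalone C++ sketch of the same convention (hypothetical kernel and
 * names, not Clover code; offsets follow typeOffset above on a 64-bit
 * target):
 */
#include <cstring>
#include <cstdio>

static void my_kernel(int n, float *data) // A hypothetical kernel
{
    std::printf("n=%d, first=%f\n", n, data[0]);
}

static void my_stub(void *args) // What the generated LLVM stub boils down to
{
    my_kernel(*(int *)((char *)args + 0),
              *(float **)((char *)args + 8)); // The pointer is aligned on 8 bytes
}

static void stub_demo()
{
    float buf[1] = { 3.14f };
    float *p = buf;
    int n = 1;
    char args[16];

    std::memcpy(args + 0, &n, sizeof(n));
    std::memcpy(args + 8, &p, sizeof(p));

    my_stub(args); // One uniform entry point, as used by run() below
}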
/*
 * CPUKernelEvent
 */
CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event)
: p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0),
  p_kernel_args(0)
{
    // Mutex
    pthread_mutex_init(&p_mutex, 0);

    // Set the current work group to (0, 0, ..., 0)
    std::memset(p_current_work_group, 0, event->work_dim() * sizeof(size_t));

    // Populate p_max_work_groups
    p_num_wg = 1;

    for (cl_uint i=0; i<event->work_dim(); ++i)
    {
        p_max_work_groups[i] =
            (event->global_work_size(i) / event->local_work_size(i)) - 1; // 0..n-1, not 1..n

        p_num_wg *= p_max_work_groups[i] + 1;
    }
}

CPUKernelEvent::~CPUKernelEvent()
{
    pthread_mutex_destroy(&p_mutex);

    if (p_kernel_args)
        std::free(p_kernel_args);
}

bool CPUKernelEvent::reserve()
{
    // Lock; this will be unlocked in takeInstance()
    pthread_mutex_lock(&p_mutex);

    // True if this is the last work group (current == count - 1)
    return (p_current_wg == p_num_wg - 1);
}

bool CPUKernelEvent::finished()
{
    bool rs;

    pthread_mutex_lock(&p_mutex);

    rs = (p_finished_wg == p_num_wg);

    pthread_mutex_unlock(&p_mutex);

    return rs;
}

void CPUKernelEvent::workGroupFinished()
{
    pthread_mutex_lock(&p_mutex);

    p_finished_wg++;

    pthread_mutex_unlock(&p_mutex);
}

CPUKernelWorkGroup *CPUKernelEvent::takeInstance()
{
    CPUKernelWorkGroup *wg = new CPUKernelWorkGroup((CPUKernel *)p_event->deviceKernel(),
                                                    p_event,
                                                    this,
                                                    p_current_work_group);

    // Increment the current work group
    incVec(p_event->work_dim(), p_current_work_group, p_max_work_groups);
    p_current_wg += 1;

    // Release the event
    pthread_mutex_unlock(&p_mutex);

    return wg;
}

void *CPUKernelEvent::kernelArgs() const
{
    return p_kernel_args;
}

void CPUKernelEvent::cacheKernelArgs(void *args)
{
    p_kernel_args = args;
}

/*
 * CPUKernelWorkGroup
 */
CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
                                       CPUKernelEvent *cpu_event,
                                       const size_t *work_group_index)
: p_kernel(kernel), p_cpu_event(cpu_event), p_event(event),
  p_work_dim(event->work_dim()), p_contexts(0), p_stack_size(8192 /* TODO */),
  p_had_barrier(false)
{
    // Set the index
    std::memcpy(p_index, work_group_index, p_work_dim * sizeof(size_t));

    // Set the maximums and the global id start offsets
    p_num_work_items = 1;

    for (unsigned int i=0; i<p_work_dim; ++i)
    {
        p_max_local_id[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n
        p_num_work_items *= event->local_work_size(i);

        // Set the global id
        p_global_id_start_offset[i] = (p_index[i] * event->local_work_size(i))
                                      + event->global_work_offset(i);
    }
}

CPUKernelWorkGroup::~CPUKernelWorkGroup()
{
    p_cpu_event->workGroupFinished();
}
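/* A worked example of the indexing set up by this constructor (hypothetical
 * numbers, not Clover code): a 2D NDRange with global size 8x8, local size
 * 4x4 and no global offset yields work groups indexed (0..1, 0..1); the
 * group at index (1, 0) covers local ids (0..3, 0..3) and its global ids
 * start at (4, 0):
 */
#include <cstddef>
#include <cstdio>

static void ndrange_demo()
{
    const size_t local[2] = { 4, 4 }; // local_work_size
    const size_t index[2] = { 1, 0 }; // p_index, the work-group index

    for (unsigned int i = 0; i < 2; ++i)
    {
        size_t max_local_id = local[i] - 1;        // p_max_local_id[i]
        size_t global_start = index[i] * local[i]; // p_global_id_start_offset[i], offset 0
        std::printf("dim %u: local ids 0..%zu, global ids start at %zu\n",
                    i, max_local_id, global_start);
    }
}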
void *CPUKernelWorkGroup::callArgs(std::vector<void *> &locals_to_free)
{
    if (p_cpu_event->kernelArgs() && !p_kernel->kernel()->hasLocals())
    {
        // We have cached the args and can reuse them
        return p_cpu_event->kernelArgs();
    }

    // We need to create them from scratch
    void *rs;

    size_t args_size = 0;

    for (unsigned int i=0; i<p_kernel->kernel()->numArgs(); ++i)
    {
        const Kernel::Arg &arg = p_kernel->kernel()->arg(i);
        CPUKernel::typeOffset(args_size, arg.valueSize() * arg.vecDim());
    }

    rs = std::malloc(args_size);

    if (!rs)
        return NULL;

    size_t arg_offset = 0;

    for (unsigned int i=0; i<p_kernel->kernel()->numArgs(); ++i)
    {
        const Kernel::Arg &arg = p_kernel->kernel()->arg(i);
        size_t size = arg.valueSize() * arg.vecDim();
        size_t offset = CPUKernel::typeOffset(arg_offset, size);

        // Where to place the argument
        unsigned char *target = (unsigned char *)rs;
        target += offset;

        // We may have to transform the values (buffers, etc)
        switch (arg.kind())
        {
            case Kernel::Arg::Buffer:
            {
                MemObject *buffer = *(MemObject **)arg.data();

                if (arg.file() == Kernel::Arg::Local)
                {
                    // Allocate a buffer and pass it to the kernel
                    void *local_buffer = std::malloc(arg.allocAtKernelRuntime());
                    locals_to_free.push_back(local_buffer);
                    *(void **)target = local_buffer;
                }
                else
                {
                    if (!buffer)
                    {
                        // A NULL buffer is allowed: just pass NULL to the kernel
                        *(void **)target = NULL;
                    }
                    else
                    {
                        // Get the CPU buffer, allocate it and get its pointer
                        CPUBuffer *cpubuf =
                            (CPUBuffer *)buffer->deviceBuffer(p_kernel->device());
                        void *buf_ptr = 0;

                        buffer->allocate(p_kernel->device());
                        buf_ptr = cpubuf->data();

                        *(void **)target = buf_ptr;
                    }
                }

                break;
            }
            case Kernel::Arg::Image2D:
            case Kernel::Arg::Image3D:
            {
                // We need to ensure the image is allocated
                Image2D *image = *(Image2D **)arg.data();
                image->allocate(p_kernel->device());

                // Fall through to the memcpy
            }
            default:
                // Simply copy the arg's data into the buffer
                std::memcpy(target, arg.data(), size);
                break;
        }
    }

    // Cache the arguments if we can do so
    if (!p_kernel->kernel()->hasLocals())
        p_cpu_event->cacheKernelArgs(rs);

    return rs;
}
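/* incVec() is defined elsewhere in Clover. A minimal sketch of the
 * odometer-style increment its uses here imply (assumed semantics: advance
 * the first dimension, carry into the next on overflow, and return true once
 * every dimension has wrapped, which is how takeInstance() above and run()
 * below detect the end of the iteration):
 */
static bool inc_vec_sketch(unsigned int dims, size_t *vec, const size_t *maxs)
{
    for (unsigned int i = 0; i < dims; ++i)
    {
        if (vec[i] < maxs[i])
        {
            vec[i] += 1;  // This "digit" can still be incremented
            return false; // Not finished yet
        }

        vec[i] = 0;       // Overflow: reset and carry into the next dimension
    }

    return true;          // Every dimension wrapped: the iteration is over
}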
bool CPUKernelWorkGroup::run()
{
    // Get the kernel function to call
    std::vector<void *> locals_to_free;
    llvm::Function *kernel_func = p_kernel->callFunction();

    if (!kernel_func)
        return false;

    Program *p = (Program *)p_kernel->kernel()->parent();
    CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device()));

    p_kernel_func_addr =
        (void(*)(void *))prog->jit()->getPointerToFunction(kernel_func);

    // Get the arguments
    p_args = callArgs(locals_to_free);

    if (!p_args)
        return false; // callArgs() can fail when malloc fails

    // Tell the builtins this thread will run a kernel work group
    setThreadLocalWorkGroup(this);

    // Initialize the dummy context used by the builtins before a call to barrier()
    p_current_work_item = 0;
    p_current_context = &p_dummy_context;

    std::memset(p_dummy_context.local_id, 0, p_work_dim * sizeof(size_t));

    do
    {
        // Simply call the "call function"; it and the builtins will do the rest
        p_kernel_func_addr(p_args);
    } while (!p_had_barrier &&
             !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id));

    // If no barrier() call was made, all is fine. Otherwise, only the first
    // work-item has currently finished; we must let the others run.
    if (p_had_barrier)
    {
        Context *main_context = p_current_context; // After the first swapcontext,
                                                   // we will not be able to trust
                                                   // p_current_context anymore.

        // We'll call swapcontext for each remaining work-item. They will
        // finish, and when they do so, this main context will be resumed, so
        // it's easy (i starts from 1 because work-item 0 already finished)
        for (unsigned int i=1; i<p_num_work_items; ++i)
        {
            Context *ctx = getContextAddr(i);
            swapcontext(&main_context->context, &ctx->context);
        }
    }

    // Free the allocated locals
    if (p_kernel->kernel()->hasLocals())
    {
        for (size_t i=0; i<locals_to_free.size(); ++i)
        {
            std::free(locals_to_free[i]);
        }

        std::free(p_args);
    }

    return true;
}

CPUKernelWorkGroup::Context *CPUKernelWorkGroup::getContextAddr(unsigned int index)
{
    size_t size;
    char *data = (char *)p_contexts;

    // Each Context in data is an element of size p_stack_size + sizeof(Context)
    size = p_stack_size + sizeof(Context);
    size *= index; // To get an offset

    return (Context *)(data + size); // Pointer to the context
}
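/* The layout getContextAddr() implies for p_contexts: each work-item gets a
 * Context header immediately followed by its private stack, so slot i starts
 * at i * (p_stack_size + sizeof(Context)):
 *
 *   | Context 0 | stack 0 | Context 1 | stack 1 | ... | Context n-1 | stack n-1 |
 *
 * A standalone sketch of that address arithmetic (placeholder Context struct,
 * not Clover code):
 */
#include <cstdio>
#include <cstdlib>

struct DemoContext { char placeholder[64]; }; // Stands in for CPUKernelWorkGroup::Context

static void context_layout_demo()
{
    const size_t stack_size = 8192; // Matches the hardcoded p_stack_size above
    const unsigned int items = 4;
    char *pool = (char *)std::malloc(items * (stack_size + sizeof(DemoContext)));

    for (unsigned int i = 0; i < items; ++i)
    {
        DemoContext *ctx = (DemoContext *)(pool + i * (stack_size + sizeof(DemoContext)));
        std::printf("context %u at offset %zu\n", i, (size_t)((char *)ctx - pool));
    }

    std::free(pool);
}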