// Clover Git
// OpenCL 1.1 software implementation
00001 /* 00002 * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> 00003 * All rights reserved. 00004 * 00005 * Redistribution and use in source and binary forms, with or without 00006 * modification, are permitted provided that the following conditions are met: 00007 * * Redistributions of source code must retain the above copyright 00008 * notice, this list of conditions and the following disclaimer. 00009 * * Redistributions in binary form must reproduce the above copyright 00010 * notice, this list of conditions and the following disclaimer in the 00011 * documentation and/or other materials provided with the distribution. 00012 * * Neither the name of the copyright holder nor the 00013 * names of its contributors may be used to endorse or promote products 00014 * derived from this software without specific prior written permission. 00015 * 00016 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 00017 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 00018 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 00019 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY 00020 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 00021 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00022 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 00023 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00024 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00025 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
00026 */ 00027 00033 #include "device.h" 00034 #include "buffer.h" 00035 #include "kernel.h" 00036 #include "program.h" 00037 #include "worker.h" 00038 #include "builtins.h" 00039 00040 #include <core/config.h> 00041 #include "../propertylist.h" 00042 #include "../commandqueue.h" 00043 #include "../events.h" 00044 #include "../memobject.h" 00045 #include "../kernel.h" 00046 #include "../program.h" 00047 00048 #include <cstring> 00049 #include <cstdlib> 00050 #include <unistd.h> 00051 00052 #include <iostream> 00053 #include <fstream> 00054 #include <sstream> 00055 00056 using namespace Coal; 00057 00058 CPUDevice::CPUDevice() 00059 : DeviceInterface(), p_cores(0), p_num_events(0), p_workers(0), p_stop(false), 00060 p_initialized(false) 00061 { 00062 00063 } 00064 00065 void CPUDevice::init() 00066 { 00067 if (p_initialized) 00068 return; 00069 00070 // Initialize the locking machinery 00071 pthread_cond_init(&p_events_cond, 0); 00072 pthread_mutex_init(&p_events_mutex, 0); 00073 00074 // Get info about the system 00075 p_cores = sysconf(_SC_NPROCESSORS_ONLN); 00076 p_cpu_mhz = 0.0f; 00077 00078 std::filebuf fb; 00079 fb.open("/proc/cpuinfo", std::ios::in); 00080 std::istream is(&fb); 00081 00082 while (!is.eof()) 00083 { 00084 std::string key, value; 00085 00086 std::getline(is, key, ':'); 00087 is.ignore(1); 00088 std::getline(is, value); 00089 00090 if (key.compare(0, 7, "cpu MHz") == 0) 00091 { 00092 std::istringstream ss(value); 00093 ss >> p_cpu_mhz; 00094 break; 00095 } 00096 } 00097 00098 // Create worker threads 00099 p_workers = (pthread_t *)std::malloc(numCPUs() * sizeof(pthread_t)); 00100 00101 for (unsigned int i=0; i<numCPUs(); ++i) 00102 { 00103 pthread_create(&p_workers[i], 0, &worker, this); 00104 } 00105 00106 p_initialized = true; 00107 } 00108 00109 CPUDevice::~CPUDevice() 00110 { 00111 if (!p_initialized) 00112 return; 00113 00114 // Terminate the workers and wait for them 00115 pthread_mutex_lock(&p_events_mutex); 00116 00117 p_stop = true; 
00118 00119 pthread_cond_broadcast(&p_events_cond); 00120 pthread_mutex_unlock(&p_events_mutex); 00121 00122 for (unsigned int i=0; i<numCPUs(); ++i) 00123 { 00124 pthread_join(p_workers[i], 0); 00125 } 00126 00127 // Free allocated memory 00128 std::free((void *)p_workers); 00129 pthread_mutex_destroy(&p_events_mutex); 00130 pthread_cond_destroy(&p_events_cond); 00131 } 00132 00133 DeviceBuffer *CPUDevice::createDeviceBuffer(MemObject *buffer, cl_int *rs) 00134 { 00135 return (DeviceBuffer *)new CPUBuffer(this, buffer, rs); 00136 } 00137 00138 DeviceProgram *CPUDevice::createDeviceProgram(Program *program) 00139 { 00140 return (DeviceProgram *)new CPUProgram(this, program); 00141 } 00142 00143 DeviceKernel *CPUDevice::createDeviceKernel(Kernel *kernel, 00144 llvm::Function *function) 00145 { 00146 return (DeviceKernel *)new CPUKernel(this, kernel, function); 00147 } 00148 00149 cl_int CPUDevice::initEventDeviceData(Event *event) 00150 { 00151 switch (event->type()) 00152 { 00153 case Event::MapBuffer: 00154 { 00155 MapBufferEvent *e = (MapBufferEvent *)event; 00156 CPUBuffer *buf = (CPUBuffer *)e->buffer()->deviceBuffer(this); 00157 unsigned char *data = (unsigned char *)buf->data(); 00158 00159 data += e->offset(); 00160 00161 e->setPtr((void *)data); 00162 break; 00163 } 00164 case Event::MapImage: 00165 { 00166 MapImageEvent *e = (MapImageEvent *)event; 00167 Image2D *image = (Image2D *)e->buffer(); 00168 CPUBuffer *buf = (CPUBuffer *)image->deviceBuffer(this); 00169 unsigned char *data = (unsigned char *)buf->data(); 00170 00171 data = imageData(data, 00172 e->origin(0), 00173 e->origin(1), 00174 e->origin(2), 00175 image->row_pitch(), 00176 image->slice_pitch(), 00177 image->pixel_size()); 00178 00179 e->setPtr((void *)data); 00180 e->setRowPitch(image->row_pitch()); 00181 e->setSlicePitch(image->slice_pitch()); 00182 break; 00183 } 00184 case Event::UnmapMemObject: 00185 // Nothing do to 00186 break; 00187 00188 case Event::NDRangeKernel: 00189 case 
Event::TaskKernel: 00190 { 00191 // Instantiate the JIT for the CPU program 00192 KernelEvent *e = (KernelEvent *)event; 00193 Program *p = (Program *)e->kernel()->parent(); 00194 CPUProgram *prog = (CPUProgram *)p->deviceDependentProgram(this); 00195 00196 if (!prog->initJIT()) 00197 return CL_INVALID_PROGRAM_EXECUTABLE; 00198 00199 // Set device-specific data 00200 CPUKernelEvent *cpu_e = new CPUKernelEvent(this, e); 00201 e->setDeviceData((void *)cpu_e); 00202 00203 break; 00204 } 00205 default: 00206 break; 00207 } 00208 00209 return CL_SUCCESS; 00210 } 00211 00212 void CPUDevice::freeEventDeviceData(Event *event) 00213 { 00214 switch (event->type()) 00215 { 00216 case Event::NDRangeKernel: 00217 case Event::TaskKernel: 00218 { 00219 CPUKernelEvent *cpu_e = (CPUKernelEvent *)event->deviceData(); 00220 00221 if (cpu_e) 00222 delete cpu_e; 00223 } 00224 default: 00225 break; 00226 } 00227 } 00228 00229 void CPUDevice::pushEvent(Event *event) 00230 { 00231 // Add an event in the list 00232 pthread_mutex_lock(&p_events_mutex); 00233 00234 p_events.push_back(event); 00235 p_num_events++; // Way faster than STL list::size() ! 00236 00237 pthread_cond_broadcast(&p_events_cond); 00238 pthread_mutex_unlock(&p_events_mutex); 00239 } 00240 00241 Event *CPUDevice::getEvent(bool &stop) 00242 { 00243 // Return the first event in the list, if any. Remove it if it is a 00244 // single-shot event. 
00245 pthread_mutex_lock(&p_events_mutex); 00246 00247 while (p_num_events == 0 && !p_stop) 00248 pthread_cond_wait(&p_events_cond, &p_events_mutex); 00249 00250 if (p_stop) 00251 { 00252 pthread_mutex_unlock(&p_events_mutex); 00253 stop = true; 00254 return 0; 00255 } 00256 00257 Event *event = p_events.front(); 00258 00259 // If the run of this event will finish it, remove it from the list 00260 bool last_slot = true; 00261 00262 if (event->type() == Event::NDRangeKernel || 00263 event->type() == Event::TaskKernel) 00264 { 00265 CPUKernelEvent *ke = (CPUKernelEvent *)event->deviceData(); 00266 last_slot = ke->reserve(); 00267 } 00268 00269 if (last_slot) 00270 { 00271 p_num_events--; 00272 p_events.pop_front(); 00273 } 00274 00275 pthread_mutex_unlock(&p_events_mutex); 00276 00277 return event; 00278 } 00279 00280 unsigned int CPUDevice::numCPUs() const 00281 { 00282 return p_cores; 00283 } 00284 00285 float CPUDevice::cpuMhz() const 00286 { 00287 return p_cpu_mhz; 00288 } 00289 00290 // From inner parentheses to outher ones : 00291 // 00292 // sizeof * 8 => 8 00293 // -1 => 7 00294 // 1 << $ => 10000000 00295 // -1 => 01111111 00296 // *2 => 11111110 00297 // +1 => 11111111 00298 // 00299 // A simple way to do this is (1 << (sizeof(type) * 8)) - 1, but it overflows 00300 // the type (for int8, 1 << $ = 100000000 = 256 > 255) 00301 #define TYPE_MAX(type) ((((type)1 << ((sizeof(type) * 8) - 1)) - 1) * 2 + 1) 00302 00303 cl_int CPUDevice::info(cl_device_info param_name, 00304 size_t param_value_size, 00305 void *param_value, 00306 size_t *param_value_size_ret) const 00307 { 00308 void *value = 0; 00309 size_t value_length = 0; 00310 00311 union { 00312 cl_device_type cl_device_type_var; 00313 cl_uint cl_uint_var; 00314 size_t size_t_var; 00315 cl_ulong cl_ulong_var; 00316 cl_bool cl_bool_var; 00317 cl_device_fp_config cl_device_fp_config_var; 00318 cl_device_mem_cache_type cl_device_mem_cache_type_var; 00319 cl_device_local_mem_type cl_device_local_mem_type_var; 
00320 cl_device_exec_capabilities cl_device_exec_capabilities_var; 00321 cl_command_queue_properties cl_command_queue_properties_var; 00322 cl_platform_id cl_platform_id_var; 00323 size_t work_dims[MAX_WORK_DIMS]; 00324 }; 00325 00326 switch (param_name) 00327 { 00328 case CL_DEVICE_TYPE: 00329 SIMPLE_ASSIGN(cl_device_type, CL_DEVICE_TYPE_CPU); 00330 break; 00331 00332 case CL_DEVICE_VENDOR_ID: 00333 SIMPLE_ASSIGN(cl_uint, 0); 00334 break; 00335 00336 case CL_DEVICE_MAX_COMPUTE_UNITS: 00337 SIMPLE_ASSIGN(cl_uint, numCPUs()); 00338 break; 00339 00340 case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 00341 SIMPLE_ASSIGN(cl_uint, MAX_WORK_DIMS); 00342 break; 00343 00344 case CL_DEVICE_MAX_WORK_GROUP_SIZE: 00345 SIMPLE_ASSIGN(size_t, TYPE_MAX(size_t)); 00346 break; 00347 00348 case CL_DEVICE_MAX_WORK_ITEM_SIZES: 00349 for (int i=0; i<MAX_WORK_DIMS; ++i) 00350 { 00351 work_dims[i] = TYPE_MAX(size_t); 00352 } 00353 value_length = MAX_WORK_DIMS * sizeof(size_t); 00354 value = &work_dims; 00355 break; 00356 00357 case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: 00358 SIMPLE_ASSIGN(cl_uint, 16); 00359 break; 00360 00361 case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: 00362 SIMPLE_ASSIGN(cl_uint, 8); 00363 break; 00364 00365 case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: 00366 SIMPLE_ASSIGN(cl_uint, 4); 00367 break; 00368 00369 case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: 00370 SIMPLE_ASSIGN(cl_uint, 2); 00371 break; 00372 00373 case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: 00374 SIMPLE_ASSIGN(cl_uint, 4); 00375 break; 00376 00377 case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: 00378 SIMPLE_ASSIGN(cl_uint, 2); 00379 break; 00380 00381 case CL_DEVICE_MAX_CLOCK_FREQUENCY: 00382 SIMPLE_ASSIGN(cl_uint, cpuMhz() * 1000000); 00383 break; 00384 00385 case CL_DEVICE_ADDRESS_BITS: 00386 SIMPLE_ASSIGN(cl_uint, 32); 00387 break; 00388 00389 case CL_DEVICE_MAX_READ_IMAGE_ARGS: 00390 SIMPLE_ASSIGN(cl_uint, 65536); 00391 break; 00392 00393 case CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 00394 SIMPLE_ASSIGN(cl_uint, 65536); 
00395 break; 00396 00397 case CL_DEVICE_MAX_MEM_ALLOC_SIZE: 00398 SIMPLE_ASSIGN(cl_ulong, 128*1024*1024); 00399 break; 00400 00401 case CL_DEVICE_IMAGE2D_MAX_WIDTH: 00402 SIMPLE_ASSIGN(size_t, 65536); 00403 break; 00404 00405 case CL_DEVICE_IMAGE2D_MAX_HEIGHT: 00406 SIMPLE_ASSIGN(size_t, 65536); 00407 break; 00408 00409 case CL_DEVICE_IMAGE3D_MAX_WIDTH: 00410 SIMPLE_ASSIGN(size_t, 65536); 00411 break; 00412 00413 case CL_DEVICE_IMAGE3D_MAX_HEIGHT: 00414 SIMPLE_ASSIGN(size_t, 65536); 00415 break; 00416 00417 case CL_DEVICE_IMAGE3D_MAX_DEPTH: 00418 SIMPLE_ASSIGN(size_t, 65536); 00419 break; 00420 00421 case CL_DEVICE_IMAGE_SUPPORT: 00422 SIMPLE_ASSIGN(cl_bool, CL_TRUE); 00423 break; 00424 00425 case CL_DEVICE_MAX_PARAMETER_SIZE: 00426 SIMPLE_ASSIGN(size_t, 65536); 00427 break; 00428 00429 case CL_DEVICE_MAX_SAMPLERS: 00430 SIMPLE_ASSIGN(cl_uint, 16); 00431 break; 00432 00433 case CL_DEVICE_MEM_BASE_ADDR_ALIGN: 00434 SIMPLE_ASSIGN(cl_uint, 0); 00435 break; 00436 00437 case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE: 00438 SIMPLE_ASSIGN(cl_uint, 16); 00439 break; 00440 00441 case CL_DEVICE_SINGLE_FP_CONFIG: 00442 // TODO: Check what an x86 SSE engine can support. 
00443 SIMPLE_ASSIGN(cl_device_fp_config, 00444 CL_FP_DENORM | 00445 CL_FP_INF_NAN | 00446 CL_FP_ROUND_TO_NEAREST); 00447 break; 00448 00449 case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: 00450 SIMPLE_ASSIGN(cl_device_mem_cache_type, 00451 CL_READ_WRITE_CACHE); 00452 break; 00453 00454 case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: 00455 // TODO: Get this information from the processor 00456 SIMPLE_ASSIGN(cl_uint, 16); 00457 break; 00458 00459 case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: 00460 // TODO: Get this information from the processor 00461 SIMPLE_ASSIGN(cl_ulong, 512*1024*1024); 00462 break; 00463 00464 case CL_DEVICE_GLOBAL_MEM_SIZE: 00465 // TODO: 1 Gio seems to be enough for software acceleration 00466 SIMPLE_ASSIGN(cl_ulong, 1*1024*1024*1024); 00467 break; 00468 00469 case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: 00470 SIMPLE_ASSIGN(cl_ulong, 1*1024*1024*1024); 00471 break; 00472 00473 case CL_DEVICE_MAX_CONSTANT_ARGS: 00474 SIMPLE_ASSIGN(cl_uint, 65536); 00475 break; 00476 00477 case CL_DEVICE_LOCAL_MEM_TYPE: 00478 SIMPLE_ASSIGN(cl_device_local_mem_type, CL_GLOBAL); 00479 break; 00480 00481 case CL_DEVICE_LOCAL_MEM_SIZE: 00482 SIMPLE_ASSIGN(cl_ulong, 1*1024*1024*1024); 00483 break; 00484 00485 case CL_DEVICE_ERROR_CORRECTION_SUPPORT: 00486 SIMPLE_ASSIGN(cl_bool, CL_FALSE); 00487 break; 00488 00489 case CL_DEVICE_PROFILING_TIMER_RESOLUTION: 00490 // TODO 00491 SIMPLE_ASSIGN(size_t, 1000); // 1000 nanoseconds = 1 ms 00492 break; 00493 00494 case CL_DEVICE_ENDIAN_LITTLE: 00495 SIMPLE_ASSIGN(cl_bool, CL_TRUE); 00496 break; 00497 00498 case CL_DEVICE_AVAILABLE: 00499 SIMPLE_ASSIGN(cl_bool, CL_TRUE); 00500 break; 00501 00502 case CL_DEVICE_COMPILER_AVAILABLE: 00503 SIMPLE_ASSIGN(cl_bool, CL_TRUE); 00504 break; 00505 00506 case CL_DEVICE_EXECUTION_CAPABILITIES: 00507 SIMPLE_ASSIGN(cl_device_exec_capabilities, CL_EXEC_KERNEL | 00508 CL_EXEC_NATIVE_KERNEL); 00509 break; 00510 00511 case CL_DEVICE_QUEUE_PROPERTIES: 00512 SIMPLE_ASSIGN(cl_command_queue_properties, 00513 
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | 00514 CL_QUEUE_PROFILING_ENABLE); 00515 break; 00516 00517 case CL_DEVICE_NAME: 00518 STRING_ASSIGN("CPU"); 00519 break; 00520 00521 case CL_DEVICE_VENDOR: 00522 STRING_ASSIGN("Mesa"); 00523 break; 00524 00525 case CL_DRIVER_VERSION: 00526 STRING_ASSIGN("" COAL_VERSION); 00527 break; 00528 00529 case CL_DEVICE_PROFILE: 00530 STRING_ASSIGN("FULL_PROFILE"); 00531 break; 00532 00533 case CL_DEVICE_VERSION: 00534 STRING_ASSIGN("OpenCL 1.1 Mesa " COAL_VERSION); 00535 break; 00536 00537 case CL_DEVICE_EXTENSIONS: 00538 STRING_ASSIGN("cl_khr_global_int32_base_atomics" 00539 " cl_khr_global_int32_extended_atomics" 00540 " cl_khr_local_int32_base_atomics" 00541 " cl_khr_local_int32_extended_atomics" 00542 " cl_khr_byte_addressable_store" 00543 00544 " cl_khr_fp64" 00545 " cl_khr_int64_base_atomics" 00546 " cl_khr_int64_extended_atomics") 00547 00548 break; 00549 00550 case CL_DEVICE_PLATFORM: 00551 SIMPLE_ASSIGN(cl_platform_id, 0); 00552 break; 00553 00554 case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF: 00555 SIMPLE_ASSIGN(cl_uint, 0); 00556 break; 00557 00558 case CL_DEVICE_HOST_UNIFIED_MEMORY: 00559 SIMPLE_ASSIGN(cl_bool, CL_TRUE); 00560 break; 00561 00562 case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR: 00563 SIMPLE_ASSIGN(cl_uint, 16); 00564 break; 00565 00566 case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT: 00567 SIMPLE_ASSIGN(cl_uint, 8); 00568 break; 00569 00570 case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT: 00571 SIMPLE_ASSIGN(cl_uint, 4); 00572 break; 00573 00574 case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG: 00575 SIMPLE_ASSIGN(cl_uint, 2); 00576 break; 00577 00578 case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT: 00579 SIMPLE_ASSIGN(cl_uint, 4); 00580 break; 00581 00582 case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE: 00583 SIMPLE_ASSIGN(cl_uint, 2); 00584 break; 00585 00586 case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF: 00587 SIMPLE_ASSIGN(cl_uint, 0); 00588 break; 00589 00590 case CL_DEVICE_OPENCL_C_VERSION: 00591 STRING_ASSIGN("OpenCL C 1.1 LLVM " LLVM_VERSION); 00592 
break; 00593 00594 default: 00595 return CL_INVALID_VALUE; 00596 } 00597 00598 if (param_value && param_value_size < value_length) 00599 return CL_INVALID_VALUE; 00600 00601 if (param_value_size_ret) 00602 *param_value_size_ret = value_length; 00603 00604 if (param_value) 00605 std::memcpy(param_value, value, value_length); 00606 00607 return CL_SUCCESS; 00608 }