26 #include <boost/scoped_array.hpp>
27 #include <boost/shared_array.hpp>
34 #if CUDA_VERSION < 3020
35 typedef unsigned int Cuda_size_t;
37 typedef size_t Cuda_size_t;
57 throw ::kjb::gpu::Cuda_error(err, __FILE__, __LINE__); \
67 std::cerr << "Cuda error: " << get_cuda_error_string(err) << std::endl; \
75 const char* get_cuda_error_string(
const CUresult& err);
82 Cuda_error(CUresult error_code,
const char* file,
int line) :
87 Cuda_error(CUresult error_code,
const std::string& msg,
char* file,
int line) :
92 Cuda_error(
const std::string& msg,
const char* file,
int line) :
94 code_(CUDA_ERROR_UNKNOWN)
98 CUresult get_cuda_error()
116 struct Cuda_compute_capability
118 Cuda_compute_capability() :
123 Cuda_compute_capability(
int major_,
int minor_) :
128 Cuda_compute_capability(
const Cuda_compute_capability& other) :
133 Cuda_compute_capability& operator=(
const Cuda_compute_capability& other)
141 bool operator==(
const Cuda_compute_capability& other)
143 return major == other.major && minor == other.minor;
146 bool operator<(
const Cuda_compute_capability& other)
148 if(major < other.major)
return true;
149 if(major == other.major && minor < other.minor)
return true;
172 Cuda_device(
const Cuda_device& other) :
173 handle_(other.handle_),
174 capability_(other.capability_),
176 properties_(other.properties_)
179 Cuda_device& operator=(
const Cuda_device& other)
181 handle_ = other.handle_;
182 capability_ = other.capability_;
184 properties_ = other.properties_;
191 return handle_ == other.handle_;
194 operator CUdevice()
const
206 int get_attribute(CUdevice_attribute attrib)
const
209 CU_ETX(cuDeviceGetAttribute(&result, attrib, handle_));
216 const Cuda_compute_capability& compute_capability()
const
221 const std::string& name()
const
234 int max_threads()
const
236 return properties_.maxThreadsPerBlock;
246 int max_threads(
int d)
const
248 assert(d >= 0 && d < 3);
249 return properties_.maxThreadsDim[d];
259 int max_blocks(
int d)
const
261 assert(d >= 0 && d < 3);
262 return properties_.maxGridSize[d];
268 int shared_memory_size()
const
270 return properties_.sharedMemPerBlock;
276 int const_memory_size()
const
278 return properties_.totalConstantMemory;
284 int warp_size()
const
286 return properties_.SIMDWidth;
293 int memory_pitch()
const
295 return properties_.memPitch;
301 int register_count()
const
303 return properties_.regsPerBlock;
309 int clock_rate()
const
311 return properties_.clockRate;
320 int texture_alignment()
const
322 return properties_.textureAlign;
328 int available_memory()
const
330 #if CUDA_VERSION < 3020
336 CU_ETX(cuDeviceTotalMem(&bytes, handle_));
344 Cuda_device(CUdevice handle) :
350 CU_ETX(cuDeviceComputeCapability(&capability_.major, &capability_.minor, handle_));
352 const size_t BUF_SIZE = 256;
353 char name_buff[BUF_SIZE];
354 CU_ETX(cuDeviceGetName(name_buff, BUF_SIZE, handle_));
355 name_ = std::string(name_buff);
357 CU_ETX(cuDeviceGetProperties(&properties_, handle_));
362 Cuda_compute_capability capability_;
364 CUdevprop properties_;
378 static int get_num_devices();
385 static Cuda_device get_device(
int i);
387 static bool is_initialized()
392 static void ensure_initialized()
394 if(!is_initialized())
401 static CUmodule load_module(
const char* mod_fname,
int max_registers = 32);
402 static CUmodule load_module_file(
const std::string& mod_fname,
int max_registers = 32);
408 static void set_jit_log_buffer_size(
size_t size)
410 jit_log_buffer_size_ = size;
415 static void init_(
unsigned int flags)
426 static int jit_log_buffer_size_;
427 static bool initialized_;
436 Cuda_context(
unsigned int flags,
const Cuda_device& device) :
439 Cuda::ensure_initialized();
441 cuCtxCreate(&context_, flags, device);
444 Cuda_context(
const Cuda_device& device) :
447 Cuda::ensure_initialized();
449 cuCtxCreate(&context_, 0, device);
460 size_t get_free_memory()
462 #if CUDA_VERSION < 3020
463 unsigned int free, total;
467 CU_EPETE(cuMemGetInfo(&free, &total));
475 cuCtxDestroy(context_);
478 operator CUcontext()
const
485 Cuda_context(
const Cuda_context& ) {}
506 class Cuda_base_module
512 Cuda_base_module(
const std::string& mod_fname,
int max_registers = -1) :
513 handle_(Cuda::load_module_file(mod_fname, max_registers))
517 Cuda_base_module(
const char* mod_code,
int max_registers = -1) :
518 handle_(Cuda::load_module(mod_code, max_registers))
522 virtual ~Cuda_base_module()
524 cuModuleUnload(handle_);
529 CUmodule& get_handle_() {
return handle_; }
542 class Cuda_reduce_module :
public Cuda_base_module
545 typedef Cuda_base_module Base;
565 static const int NUM_TYPES = 2;
568 static const int NUM_SIZES = 10;
570 Cuda_reduce_module();
595 double reduce(CUdeviceptr d_in,
int N);
616 double reduce(T* h_in,
int N);
634 float reduce(
const std::vector<float>&,
int )
640 double tex_reduce(CUarray da_in,
int width,
int height);
641 double tex_count(CUarray da_in,
int width,
int height);
642 double chamfer_reduce(CUdeviceptr d_in_1, CUdeviceptr d_in_2,
int N,
bool square);
643 double chamfer_reduce(CUarray tex_1, CUarray tex_2,
int width,
int height);
645 double tex_function(CUarray da_in,
int width,
int height, CUfunction func);
648 void get_num_blocks_and_threads_(
int n,
int maxBlocks,
int maxThreads,
int& blocks,
int& threads)
/* Local minimum-of-two helper.  Each argument is fully parenthesized so
 * that low-precedence argument expressions (e.g. MIN(a|b, c)) expand
 * correctly; the original expansion ((x < y) ? x : y) mis-parses such
 * operands because '<' binds tighter than '|'.  NOTE: like any such
 * macro, an argument may be evaluated twice — pass simple expressions. */
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
654 threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
655 blocks = (n + (threads * 2 - 1)) / (threads * 2);
657 blocks =
MIN(maxBlocks, blocks);
660 int get_function_index_(
int type_index,
int threads,
bool is_pow_2);
666 virtual void load_functions();
/** @brief Returns nonzero iff x has at most one bit set, i.e. x is a
 *         power of two.  Note this also reports true for x == 0. */
static int is_pow_2(
    unsigned int x)
{
    // Clearing the lowest set bit leaves zero exactly when at most one
    // bit was set.
    return (x & (x - 1)) == 0 ? 1 : 0;
}
673 static unsigned int nextPow2(
unsigned int x )
684 static const int NUM_FUNCTIONS = NUM_SIZES * NUM_TYPES * 2;
686 CUfunction functions_[NUM_FUNCTIONS];
687 CUfunction tex_reduce_function;
688 CUfunction tex_count_function;
689 CUfunction chamfer_and_reduce_function;
690 CUfunction squared_chamfer_and_reduce_function;
691 CUfunction chamfer_reduce_function;
698 static const size_t MAX_THREADS = 256;
699 static const size_t MAX_BLOCKS = 64;
710 using boost::scoped_array;
711 using kjb_c::kjb_debug_level;
712 using kjb_c::add_error;
715 cuCtxAttach(&ctx, 0);
719 get_num_blocks_and_threads_(N, MAX_BLOCKS, MAX_THREADS, blocks, threads);
721 scoped_array<float> h_out(
new float[blocks]);
723 CUdeviceptr d_out = NULL;
725 CUresult err = CUDA_SUCCESS;
729 EGC(err = cuMemAlloc(&d_out,
sizeof(
float) * blocks));
736 const int type_index_ = type_index<T>();
737 const int i = get_function_index_(type_index_, threads, is_pow_2(N));
738 function = functions_[
i];
/* Round `offset' up in place to the next multiple of `alignment' (which
 * must be a power of two).  This is the standard driver-API idiom for
 * packing kernel parameters before cuParamSetv / cuParamSeti, which
 * require each argument at its natural alignment. */
#define ALIGN_UP(offset, alignment) \
    (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
750 const int WARP_SIZE = 32;
751 const int smemSize = (threads <= WARP_SIZE) ? 2 * threads *
sizeof(
float) : threads *
sizeof(float);
752 cuFuncSetSharedSize(
function, smemSize);
757 ptr = (
void*)(
size_t) d_in;
758 ALIGN_UP(offset, __alignof(ptr));
759 EGC(err = cuParamSetv(
function, offset, &ptr,
sizeof(ptr)));
760 offset +=
sizeof(ptr);
762 ptr = (
void*)(
size_t) d_out;
763 ALIGN_UP(offset, __alignof(ptr));
764 EGC(err = cuParamSetv(
function, offset, &ptr,
sizeof(ptr)));
765 offset +=
sizeof(ptr);
767 ALIGN_UP(offset, __alignof(N));
768 EGC(err = cuParamSeti(
function, offset, N));
771 EGC(err = cuParamSetSize(
function, offset));
773 EGC(err = cuFuncSetBlockShape(
function, threads, 1, 1));
774 EGC(err = cuLaunchGrid(
function, blocks, 1));
781 EGC(err = cuMemcpyDtoH(h_out.get(), d_out, blocks *
sizeof(float)));
787 double gpu_total = 0;
788 for(
int i = 0;
i < blocks;
i++)
791 gpu_total += h_out[
i];
811 err = cuCtxDetach(ctx);
813 if(err != CUDA_SUCCESS)
815 throw Cuda_error(err, __FILE__, __LINE__);
827 using kjb_c::kjb_debug_level;
828 using kjb_c::add_error;
831 CUresult err = CUDA_SUCCESS;
834 CUdeviceptr d_in = NULL;
835 const size_t size = N *
sizeof(T);
837 CUcontext ctx = NULL;
838 EGC(err = cuCtxAttach(&ctx, 0));
841 EGC( err = cuMemAlloc(&d_in, size));
844 EGC(err = cuMemcpyHtoD(d_in, h_in, size));
849 result = reduce<T>(d_in, N);
851 catch(Cuda_error& ex)
854 err = ex.get_cuda_error();
865 if(err != CUDA_SUCCESS)
867 throw Cuda_error(err, __FILE__, __LINE__);
882 int Cuda_reduce_module::type_index<float>();
891 int Cuda_reduce_module::type_index<unsigned char>();
909 CUarray create_cuda_array(
const Matrix&
m,
bool flip_y);
919 template <
class Matrix_type,
class Dest_type>
920 Dest_type* to_pitch(
const Matrix_type& m,
size_t pitch,
bool flip_y)
927 assert(pitch >= (
size_t) m.get_num_cols());
929 T* out =
new T[pitch * m.get_num_rows()];
935 for(
int row = m.get_num_rows() - 1; row >= 0; row--)
936 for(
size_t col = 0; col < pitch; col++)
938 if(col >= (
size_t) m.get_num_cols())
941 *cur++ = (T)
m(row,col);
946 for(
int row = 0; row < m.get_num_rows(); row++)
947 for(
size_t col = 0; col < pitch; col++)
949 if(col >= (
size_t) m.get_num_cols())
952 *cur++ = (T)
m(row,col);
968 template <
class Matrix_type,
class Dest_type>
969 #if CUDA_VERSION < 3020
970 CUdeviceptr create_cuda_pitch(
const Matrix_type& m,
unsigned int& pitch,
bool flip_y)
972 CUdeviceptr create_cuda_pitch(
const Matrix_type& m,
size_t& pitch,
bool flip_y)
977 int width = m.get_num_cols();
978 int height = m.get_num_rows();
987 CU_ETX(cuMemAllocPitch(
994 boost::scoped_array<T> data(to_pitch<Matrix_type, T>(m, pitch, flip_y));
996 CUDA_MEMCPY2D cpy_meta;
997 cpy_meta.srcMemoryType = CU_MEMORYTYPE_HOST;
998 cpy_meta.srcXInBytes = 0;
1000 cpy_meta.srcHost = data.get();
1001 cpy_meta.srcDevice = 0;
1002 cpy_meta.srcArray = 0;
1003 cpy_meta.srcPitch = pitch;
1004 cpy_meta.dstXInBytes = 0;
1006 cpy_meta.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1007 cpy_meta.dstHost = 0;
1008 cpy_meta.dstDevice = result;
1009 cpy_meta.dstArray = 0;
1010 cpy_meta.dstPitch = pitch;
1011 cpy_meta.WidthInBytes = pitch;
1012 cpy_meta.Height =
height;
1014 CU_ETX(cuMemcpy2D(&cpy_meta));
1017 boost::scoped_array<T> o_data(
new T[width * height]);
1019 cpy_meta.srcXInBytes = 0;
1021 cpy_meta.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1022 cpy_meta.srcHost = 0;
1023 cpy_meta.srcDevice = result;
1024 cpy_meta.srcArray = 0;
1025 cpy_meta.srcPitch = pitch;
1026 cpy_meta.dstMemoryType = CU_MEMORYTYPE_HOST;
1027 cpy_meta.dstXInBytes = 0;
1029 cpy_meta.dstHost = o_data.get();
1030 cpy_meta.dstDevice = 0;
1031 cpy_meta.dstArray = 0;
1032 cpy_meta.dstPitch = 0;
1033 cpy_meta.WidthInBytes = width *
sizeof(T);
1034 cpy_meta.Height =
height;
1036 CU_ETX(cuMemcpy2D(&cpy_meta));
1039 for(
int row = 0; row <
height; row++)
1040 for(
int col = 0; col < width; col++)
1042 int rev_row = height - row - 1;
1043 int in_i = row * pitch /
sizeof(T) + col;
1044 int out_i = row * width + col;
1045 assert(data[in_i] == o_data[out_i]);
1047 assert(fabs(
m(rev_row, col) - o_data[out_i]) <= fabs(
m(rev_row, col)) * FLT_EPSILON);
1049 assert(fabs(
m(row, col) - o_data[out_i]) <= fabs(
m(row, col)) * FLT_EPSILON);
bool operator<(const Face_detection &f1, const Face_detection &f2)
Compares two boxes using the middle of the box. Needed because we have associated containers of these...
Definition: d_facecom.cpp:147
height
Definition: APPgetLargeConnectedEdges.m:33
#define KJB_THROW(ex)
Definition: l_exception.h:46
This class implements vectors, in the linear-algebra sense, with real-valued elements.
Definition: m_vector.h:87
#define MIN(a, b)
Definition: utils.h:126
double reduce(const Assignable_array_type &array, size_t length)
Definition: m_arith.h:42
x
Definition: APPgetLargeConnectedEdges.m:100
#define KJB_THROW_2(ex, msg)
Definition: l_exception.h:48
bool operator==(const Int_matrix &op1, const Int_matrix::Impl_type &op2)
Test for exact equality between two matrices.
Definition: l_int_matrix.cpp:218
Object thrown when attempting to use unimplemented functionality.
Definition: l_exception.h:281
get the indices of edges in each direction for i
Definition: APPgetLargeConnectedEdges.m:48
This class implements matrices, in the linear-algebra sense, with real-valued elements.
Definition: m_matrix.h:94
for m
Definition: APPgetLargeConnectedEdges.m:64
Support for error handling exception classes in libKJB.
Definition for the Vector class, a thin wrapper on the KJB Vector struct and its related functionality...
Object thrown when computation fails somehow during execution.
Definition: l_exception.h:321