gpu_cuda.h
/* $Id: gpu_cuda.h 10604 2011-09-29 19:50:28Z predoehl $ */
/* {{{=========================================================================== *
 |
 |  Copyright (c) 1994-2010 by Kobus Barnard (author)
 |
 |  Personal and educational use of this code is granted, provided that this
 |  header is kept intact, and that the authorship is not misrepresented, that
 |  its use is acknowledged in publications, and relevant papers are cited.
 |
 |  For other use contact the author (kobus AT cs DOT arizona DOT edu).
 |
 |  Please note that the code in this file has not necessarily been adequately
 |  tested. Naturally, there is no guarantee of performance, support, or fitness
 |  for any particular task. Nonetheless, I am interested in hearing about
 |  problems that you encounter.
 |
 |  Author: Kyle Simek
 * =========================================================================== }}}*/

#ifndef KJB_CUDA_H
#define KJB_CUDA_H

#include <iostream>   // std::cerr, used by CU_EPETE
#include <cstdlib>    // abort(), used by CU_EPETE
#include <cassert>
#include <string>
#include <vector>

#include <boost/scoped_array.hpp>
#include <boost/shared_array.hpp>

#include <m_cpp/m_vector.h>
#include <l_cpp/l_exception.h>

#ifdef KJB_HAVE_CUDA
#include <cuda.h>
#if CUDA_VERSION < 3020
typedef unsigned int Cuda_size_t;
#else
typedef size_t Cuda_size_t;
#endif
#endif

/* Some system headers define major() and minor() as macros; undefine them so
 * they don't clash with the members of Cuda_compute_capability below. */
#undef major
#undef minor


namespace kjb
{
namespace gpu
{

#ifdef KJB_HAVE_CUDA
// "On error, throw exception" -- in this case, "error" is ANY nonzero value.
#define CU_ETX(a) \
{ \
    CUresult err = a; \
    if(err) \
    { \
        throw ::kjb::gpu::Cuda_error(err, __FILE__, __LINE__); \
    } \
}

// "On error, print error then exit" -- prints a message to stderr and aborts
// on ANY nonzero value. Useful where throwing is not an option.
#define CU_EPETE(a) \
{ \
    CUresult err = a; \
    if(err) \
    { \
        std::cerr << "Cuda error: " << get_cuda_error_string(err) << std::endl; \
        abort(); \
    } \
}
#endif
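
// Example usage (an illustrative sketch, not part of the interface): CU_ETX
// converts a nonzero CUresult into a kjb::gpu::Cuda_error exception, while
// CU_EPETE prints the message and aborts, for contexts where throwing is not
// an option.
#if 0
CUdeviceptr d_buf = 0;
CU_ETX(cuMemAlloc(&d_buf, 1024 * sizeof(float))); // throws Cuda_error on failure
// ... use d_buf ...
CU_EPETE(cuMemFree(d_buf));                       // prints and aborts on failure
#endif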


#ifdef KJB_HAVE_CUDA
const char* get_cuda_error_string(const CUresult& err);
#endif

#ifdef KJB_HAVE_CUDA
class Cuda_error : public kjb::Runtime_error
{
public:
    Cuda_error(CUresult error_code, const char* file, int line) :
        Runtime_error(get_cuda_error_string(error_code), file, line),
        code_(error_code)
    {}

    Cuda_error(CUresult error_code, const std::string& msg, const char* file, int line) :
        Runtime_error(msg, file, line),
        code_(error_code)
    {}

    Cuda_error(const std::string& msg, const char* file, int line) :
        Runtime_error(msg, file, line),
        code_(CUDA_ERROR_UNKNOWN)
    {}

    CUresult get_cuda_error() const
    {
        return code_;
    }

private:
    CUresult code_;
};
#endif

#ifdef KJB_HAVE_CUDA
struct Cuda_compute_capability
{
    Cuda_compute_capability() :
        major(0),
        minor(0)
    {}

    Cuda_compute_capability(int major_, int minor_) :
        major(major_),
        minor(minor_)
    {}

    Cuda_compute_capability(const Cuda_compute_capability& other) :
        major(other.major),
        minor(other.minor)
    {}

    Cuda_compute_capability& operator=(const Cuda_compute_capability& other)
    {
        major = other.major;
        minor = other.minor;

        return *this;
    }

    bool operator==(const Cuda_compute_capability& other) const
    {
        return major == other.major && minor == other.minor;
    }

    // Lexicographic ordering on (major, minor).
    bool operator<(const Cuda_compute_capability& other) const
    {
        if(major < other.major) return true;
        if(major == other.major && minor < other.minor) return true;
        return false;
    }

    int major;
    int minor;
};
#endif
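
// Example (a minimal sketch): capabilities compare lexicographically by
// (major, minor), so version checks read naturally.
#if 0
Cuda_compute_capability cc(1, 3);
if(cc < Cuda_compute_capability(2, 0))
{
    // fall back to a pre-Fermi code path
}
#endif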

#ifdef KJB_HAVE_CUDA
class Cuda_device
{
public:
    friend class Cuda;

    Cuda_device(const Cuda_device& other) :
        handle_(other.handle_),
        capability_(other.capability_),
        name_(other.name_),
        properties_(other.properties_)
    {}

    Cuda_device& operator=(const Cuda_device& other)
    {
        handle_ = other.handle_;
        capability_ = other.capability_;
        name_ = other.name_;
        properties_ = other.properties_;

        return *this;
    }

    bool operator==(const Cuda_device& other) const
    {
        return handle_ == other.handle_;
    }

    operator CUdevice() const
    {
        return handle_;
    }

    /** Query an arbitrary device attribute via cuDeviceGetAttribute. */
    int get_attribute(CUdevice_attribute attrib) const
    {
        int result;
        CU_ETX(cuDeviceGetAttribute(&result, attrib, handle_));
        return result;
    }

    /** Compute capability (major/minor) of this device. */
    const Cuda_compute_capability& compute_capability() const
    {
        return capability_;
    }

    /** Human-readable device name. */
    const std::string& name() const
    {
        return name_;
    }

    /** Maximum number of threads per block. */
    int max_threads() const
    {
        return properties_.maxThreadsPerBlock;
    }

    /** Maximum number of threads along block dimension d (0, 1, or 2). */
    int max_threads(int d) const
    {
        assert(d >= 0 && d < 3);
        return properties_.maxThreadsDim[d];
    }

    /** Maximum number of blocks along grid dimension d (0, 1, or 2). */
    int max_blocks(int d) const
    {
        assert(d >= 0 && d < 3);
        return properties_.maxGridSize[d];
    }

    /** Shared memory available per block, in bytes. */
    int shared_memory_size() const
    {
        return properties_.sharedMemPerBlock;
    }

    /** Total constant memory on the device, in bytes. */
    int const_memory_size() const
    {
        return properties_.totalConstantMemory;
    }

    /** Warp size (SIMD width), in threads. */
    int warp_size() const
    {
        return properties_.SIMDWidth;
    }

    /** Maximum memory pitch allowed by cuMemAllocPitch, in bytes. */
    int memory_pitch() const
    {
        return properties_.memPitch;
    }

    /** Number of 32-bit registers available per block. */
    int register_count() const
    {
        return properties_.regsPerBlock;
    }

    /** Clock frequency, in kilohertz. */
    int clock_rate() const
    {
        return properties_.clockRate;
    }

    /** Alignment requirement for textures, in bytes. */
    int texture_alignment() const
    {
        return properties_.textureAlign;
    }

    /** Total memory on the device, in bytes (via cuDeviceTotalMem). */
    int available_memory() const
    {
#if CUDA_VERSION < 3020
        unsigned int bytes;
#else
        size_t bytes;
#endif

        CU_ETX(cuDeviceTotalMem(&bytes, handle_));
        return bytes;
    }

private:
    /* Constructed only by Cuda::get_device(). */
    Cuda_device(CUdevice handle) :
        handle_(handle),
        capability_(),
        name_(),
        properties_()
    {
        CU_ETX(cuDeviceComputeCapability(&capability_.major, &capability_.minor, handle_));

        const size_t BUF_SIZE = 256;
        char name_buff[BUF_SIZE];
        CU_ETX(cuDeviceGetName(name_buff, BUF_SIZE, handle_));
        name_ = std::string(name_buff);

        CU_ETX(cuDeviceGetProperties(&properties_, handle_));
    }

private:
    CUdevice handle_;
    Cuda_compute_capability capability_;
    std::string name_;
    CUdevprop properties_;
};
#endif


#ifdef KJB_HAVE_CUDA
class Cuda
{
public:

    /** Number of CUDA-capable devices in the system. */
    static int get_num_devices();

    /** Get a handle to the i-th CUDA device. */
    static Cuda_device get_device(int i);

    static bool is_initialized()
    {
        return initialized_;
    }

    static void ensure_initialized()
    {
        if(!is_initialized())
            init_(0);
    }

    /* Read and JIT-compile a module file.
     * @param mod_fname     The filename of the module file. Currently ptx is
     *                      supported; adding bin support should be trivial if
     *                      you need it.
     * @param max_registers The maximum number of registers this module will
     *                      use. Default is 32; this seems to be what all the
     *                      Nvidia examples use. */
    static CUmodule load_module(const char* mod_fname, int max_registers = 32);
    static CUmodule load_module_file(const std::string& mod_fname, int max_registers = 32);

    /** Set the size, in bytes, of the log buffer used by the JIT compiler. */
    static void set_jit_log_buffer_size(size_t size)
    {
        jit_log_buffer_size_ = size;
    }

private:

    static void init_(unsigned int flags)
    {
        if(initialized_)
        {
            KJB_THROW_2(Runtime_error, "Cuda already initialized, cannot reinitialize.");
        }

        CU_ETX(cuInit(flags));
        initialized_ = true;
    }

    static int jit_log_buffer_size_;
    static bool initialized_;
};
#endif
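
// Example (a minimal sketch): enumerate the devices in the system and print
// their names and compute capabilities. Cuda::ensure_initialized() is called
// first in case get_num_devices() does not initialize the driver itself.
#if 0
Cuda::ensure_initialized();
for(int i = 0; i < Cuda::get_num_devices(); i++)
{
    Cuda_device dev = Cuda::get_device(i);
    std::cout << dev.name() << " (compute capability "
              << dev.compute_capability().major << "."
              << dev.compute_capability().minor << ")" << std::endl;
}
#endif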

// TODO: How to handle contexts? The C-style API handles them pretty well
// already; the added value here is automatic garbage collection, i.e. RAII
// semantics. So this is just a really simple context wrapper, which destroys
// itself on scope exit and can return the underlying CUcontext object. Not
// copyable, not default constructible.
#ifdef KJB_HAVE_CUDA
class Cuda_context
{
public:
    Cuda_context(unsigned int flags, const Cuda_device& device) :
        context_(NULL)
    {
        Cuda::ensure_initialized();

        CU_ETX(cuCtxCreate(&context_, flags, device));
    }

    Cuda_context(const Cuda_device& device) :
        context_(NULL)
    {
        Cuda::ensure_initialized();

        CU_ETX(cuCtxCreate(&context_, 0, device));
    }

    /** The underlying context handle. */
    CUcontext get()
    {
        return context_;
    }

    /** Free device memory remaining in this context, in bytes. */
    size_t get_free_memory()
    {
#if CUDA_VERSION < 3020
        unsigned int free, total;
#else
        size_t free, total;
#endif
        CU_EPETE(cuMemGetInfo(&free, &total));

        return free;
    }

    ~Cuda_context()
    {
        cuCtxDestroy(context_);
    }

    operator CUcontext() const
    {
        return context_;
    }

private:
    // No copying contexts.
    Cuda_context(const Cuda_context& /*other*/);

    CUcontext context_;
};
#endif
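
// Example (a minimal sketch): create a context on device 0 and let RAII clean
// it up; cuCtxCreate runs in the constructor and cuCtxDestroy in the
// destructor.
#if 0
{
    Cuda_device dev = Cuda::get_device(0);
    Cuda_context ctx(dev);
    std::cout << ctx.get_free_memory() << " bytes free" << std::endl;
} // context destroyed here
#endif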


#ifdef KJB_HAVE_CUDA
/** Thin RAII wrapper around a CUmodule: the module is loaded on construction
 *  and unloaded on destruction. */
class Cuda_base_module
{
public:
    Cuda_base_module(const std::string& mod_fname, int max_registers = -1) :
        handle_(Cuda::load_module_file(mod_fname, max_registers))
    {
    }

    Cuda_base_module(const char* mod_code, int max_registers = -1) :
        handle_(Cuda::load_module(mod_code, max_registers))
    {
    }

    virtual ~Cuda_base_module()
    {
        cuModuleUnload(handle_);
    }

protected:
    CUmodule& get_handle_() { return handle_; }

private:
    CUmodule handle_;
};
#endif
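
// Example (an illustrative sketch): module wrappers typically derive from
// Cuda_base_module, load their ptx in the constructor, and cache kernel
// handles with cuModuleGetFunction; Cuda_reduce_module below follows this
// pattern. "my_kernels.ptx" and "my_kernel" are hypothetical names.
#if 0
class My_module : public Cuda_base_module
{
public:
    My_module() : Cuda_base_module(std::string("my_kernels.ptx"))
    {
        CU_ETX(cuModuleGetFunction(&kernel_, get_handle_(), "my_kernel"));
    }

private:
    CUfunction kernel_;
};
#endif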

#ifdef KJB_HAVE_CUDA
class Cuda_reduce_module : public Cuda_base_module
{
private:
    typedef Cuda_base_module Base;

    /* Maps a supported element type to its index in the type_strings array of
     * load_functions(); specialized below for float and unsigned char. */
    template<class T>
    int type_index() { KJB_THROW(Not_implemented); }

    /**************
     * ADDING SUPPORT FOR NEW TYPES
     *
     * Currently, float and unsigned char are supported. Adding support for
     * new types requires changes in multiple places:
     *
     * 1. reduce.cu: Create a set of wrapper functions for your new type.
     * 2. Add a new "Support for XXX" section like the ones for float and
     *    unsigned char later in this file.
     * 3. Add an entry to the type_strings array in load_functions.
     * 4. Increment the NUM_TYPES constant below.
     */

    //
    // The constants below aid in building the name strings of cuda kernel
    // functions in reduce.cu.
    //
    static const int NUM_TYPES = 2; // length of type_strings

    // 1, 2, 4, ..., 256, 512 -- 10 in total; one for each function in reduce.cu
    static const int NUM_SIZES = 10;

public:
    Cuda_reduce_module();

//  float packed_float_reduce(CUdeviceptr d_in, int N);

    /** Reduce (sum) N elements of type T already resident in device memory. */
    template <class T>
    double reduce(CUdeviceptr d_in, int N);

    template <class T>
    double reduce(CUarray* da_in, int width, int height);

    /** Reduce (sum) N elements of host memory; copies to the device first. */
    template <class T>
    double reduce(T* h_in, int N);

    float reduce(const kjb::Vector& , int /*N*/)
    {
        // TODO: not yet implemented
        KJB_THROW(Not_implemented);
    }

    float reduce(const std::vector<float>&, int /*N*/)
    {
        // TODO: not yet implemented
        KJB_THROW(Not_implemented);
    }

    double tex_reduce(CUarray da_in, int width, int height);
    double tex_count(CUarray da_in, int width, int height);
    double chamfer_reduce(CUdeviceptr d_in_1, CUdeviceptr d_in_2, int N, bool square);
    double chamfer_reduce(CUarray tex_1, CUarray tex_2, int width, int height);

    double tex_function(CUarray da_in, int width, int height, CUfunction func);

    /* Choose launch dimensions: each thread reduces two elements, so use
     * nextPow2((n+1)/2) threads, capped at maxThreads; the block count is
     * ceil(n / (2 * threads)), capped at maxBlocks. */
    void get_num_blocks_and_threads_(int n, int maxBlocks, int maxThreads, int& blocks, int& threads)
    {
#ifndef MIN
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
#endif

        threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
        blocks = (n + (threads * 2 - 1)) / (threads * 2);

        blocks = MIN(maxBlocks, blocks);
    }

    int get_function_index_(int type_index, int threads, bool is_pow_2);

    /** Look up all kernel handles in the loaded module. */
    virtual void load_functions();

    static int is_pow_2(unsigned int x)
    {
        return ((x&(x-1))==0);
    }

    static unsigned int nextPow2( unsigned int x )
    {
        --x;
        x |= x >> 1;
        x |= x >> 2;
        x |= x >> 4;
        x |= x >> 8;
        x |= x >> 16;
        return ++x;
    }

private:
    static const int NUM_FUNCTIONS = NUM_SIZES * NUM_TYPES * 2;

    CUfunction functions_[NUM_FUNCTIONS];
    CUfunction tex_reduce_function;
    CUfunction tex_count_function;
    CUfunction chamfer_and_reduce_function; // testing
    CUfunction squared_chamfer_and_reduce_function; // testing
    CUfunction chamfer_reduce_function;

    CUtexref tex_ref_1;
    CUtexref tex_ref_2;

    // TODO: Make these adjustable, or at least check device capabilities for
    // them. They are reasonable lower bounds, though.
    static const size_t MAX_THREADS = 256;
    static const size_t MAX_BLOCKS = 64;
};
#endif
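
// Example (a minimal sketch): sum a host array on the GPU. reduce(T*, int)
// copies the data to the device, launches the kernel reduction, and sums the
// per-block partial results on the host; float and unsigned char are the
// supported element types.
#if 0
Cuda_device dev = Cuda::get_device(0);
Cuda_context ctx(dev); // a current context is required before loading the module

Cuda_reduce_module reducer;
std::vector<float> data(100000, 1.0f);
double sum = reducer.reduce<float>(&data[0], (int) data.size()); // 100000.0
#endif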


#ifdef KJB_HAVE_CUDA
template <class T>
double Cuda_reduce_module::reduce(CUdeviceptr d_in, int N)
{
    using boost::scoped_array;
    using kjb_c::kjb_debug_level;
    using kjb_c::add_error;

    CUcontext ctx;
    cuCtxAttach(&ctx, 0);

    int blocks, threads;

    get_num_blocks_and_threads_(N, MAX_BLOCKS, MAX_THREADS, blocks, threads);

    scoped_array<float> h_out(new float[blocks]);
    double result;
    CUdeviceptr d_out = NULL;

    CUresult err = CUDA_SUCCESS;

    // TODO: make this allocation exception-safe
    // (EGC jumps to the cleanup label below on a nonzero result.)
    EGC(err = cuMemAlloc(&d_out, sizeof(float) * blocks));

    // function
    CUfunction function;

    // get the reduction function designed for 2^thread_i threads
    {
        const int type_index_ = type_index<T>();
        const int i = get_function_index_(type_index_, threads, is_pow_2(N));
        function = functions_[i];
    }

    /* Call kernel */
    #define ALIGN_UP(offset, alignment) \
        (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
    {
        /* When there is only one warp per block, we need to allocate two warps' */
        /* worth of shared memory so that we don't index shared memory out of bounds. */
        // TODO: make warp size adjustable
        const int WARP_SIZE = 32;
        const int smemSize = (threads <= WARP_SIZE) ? 2 * threads * sizeof(float) : threads * sizeof(float);
        cuFuncSetSharedSize(function, smemSize);

        int offset = 0;
        void* ptr;

        ptr = (void*)(size_t) d_in;
        ALIGN_UP(offset, __alignof(ptr));
        EGC(err = cuParamSetv(function, offset, &ptr, sizeof(ptr)));
        offset += sizeof(ptr);

        ptr = (void*)(size_t) d_out;
        ALIGN_UP(offset, __alignof(ptr));
        EGC(err = cuParamSetv(function, offset, &ptr, sizeof(ptr)));
        offset += sizeof(ptr);

        ALIGN_UP(offset, __alignof(N));
        EGC(err = cuParamSeti(function, offset, N));
        offset += sizeof(N);

        EGC(err = cuParamSetSize(function, offset));

        EGC(err = cuFuncSetBlockShape(function, threads, 1, 1));
        EGC(err = cuLaunchGrid(function, blocks, 1));
    }
    #undef ALIGN_UP

    // TODO: iteratively call kernel until 1 item left
    /* copy result from device to host */

    EGC(err = cuMemcpyDtoH(h_out.get(), d_out, blocks * sizeof(float)));

    {
        // manually sum up the block results.
//      double total = 0;
        double gpu_total = 0;
        for(int i = 0; i < blocks; i++)
        {
/*          printf("\t%f", h_out[i]); */
            gpu_total += h_out[i];
        }

        /* begin debug */
/*      total = reduceCPU(h_in, N); */
        /* for(i = 0; i < N; i++) */
        /* { */
        /*     total += h_in[i]; */
        /* } */
/*      printf("\nCPU total: %f\nGPU total: %f\n", total, gpu_total); */
        /* end debug */

        result = gpu_total;
    }

cleanup:
    // This C-style cleanup is necessary because allocated device memory is
    // not exception-safe.
    if(d_out)
        cuMemFree(d_out);

    err = cuCtxDetach(ctx);

    if(err != CUDA_SUCCESS)
    {
        throw Cuda_error(err, __FILE__, __LINE__);
    }

    return result;
}
#endif

#ifdef KJB_HAVE_CUDA
template <class T>
double Cuda_reduce_module::reduce(T* h_in, int N)
{
    using kjb_c::kjb_debug_level;
    using kjb_c::add_error;

    double result;
    CUresult err = CUDA_SUCCESS;

    // allocate memory on device
    CUdeviceptr d_in = NULL;
    const size_t size = N * sizeof(T);

    CUcontext ctx = NULL;
    EGC(err = cuCtxAttach(&ctx, 0));

    // TODO: Make this exception-safe
    EGC(err = cuMemAlloc(&d_in, size));

    // copy data to device
    EGC(err = cuMemcpyHtoD(d_in, h_in, size));

    // do reduction
    try
    {
        result = reduce<T>(d_in, N);
    }
    catch(Cuda_error& ex)
    {
        // Please, god, let's add exception safety so these kludges aren't necessary!
        err = ex.get_cuda_error();
        goto cleanup;
    }

cleanup:
    if(d_in)
        cuMemFree(d_in);

    cuCtxDetach(ctx);

    if(err != CUDA_SUCCESS)
    {
        throw Cuda_error(err, __FILE__, __LINE__);
    }

    return result;
}
#endif


#ifdef KJB_HAVE_CUDA
/* *************
 * Support for float
 * *************/

template<>
int Cuda_reduce_module::type_index<float>();
#endif

#ifdef KJB_HAVE_CUDA
/* *************
 * Support for unsigned char
 * *************/

template<>
int Cuda_reduce_module::type_index<unsigned char>();
#endif

/* ************
 * end unsigned char
 * ************/


/* ************
 * UTILITY FUNCTIONS
 * ************/

#ifdef KJB_HAVE_CUDA

/** Create a CUDA array from a matrix, optionally flipping it vertically. */
CUarray create_cuda_array(const Matrix& m, bool flip_y);

/** Copy a matrix into a newly allocated row-padded ("pitched") host buffer of
 *  Dest_type elements. The pitch is given in bytes; padding elements are set
 *  to zero, and the caller owns the returned buffer. */
template <class Matrix_type, class Dest_type>
Dest_type* to_pitch(const Matrix_type& m, size_t pitch, bool flip_y)
{
    typedef Dest_type T;

    // convert from bytes into elements
    pitch /= sizeof(T);

    assert(pitch >= (size_t) m.get_num_cols());

    T* out = new T[pitch * m.get_num_rows()];

    T* cur = out;

    if(flip_y)
    {
        for(int row = m.get_num_rows() - 1; row >= 0; row--)
        for(size_t col = 0; col < pitch; col++)
        {
            if(col >= (size_t) m.get_num_cols())
                *cur++ = 0.0;
            else
                *cur++ = (T) m(row,col);
        }
    }
    else
    {
        for(int row = 0; row < m.get_num_rows(); row++)
        for(size_t col = 0; col < pitch; col++)
        {
            if(col >= (size_t) m.get_num_cols())
                *cur++ = 0.0;
            else
                *cur++ = (T) m(row,col);
        }
    }

    return out;
}

/** Allocate pitched device memory with cuMemAllocPitch and upload the matrix
 *  to it as Dest_type elements, optionally flipping it vertically. The chosen
 *  pitch is returned in bytes through the pitch argument; the caller owns the
 *  returned allocation. */
template <class Matrix_type, class Dest_type>
#if CUDA_VERSION < 3020
CUdeviceptr create_cuda_pitch(const Matrix_type& m, unsigned int& pitch, bool flip_y)
#else
CUdeviceptr create_cuda_pitch(const Matrix_type& m, size_t& pitch, bool flip_y)
#endif
{
    typedef Dest_type T;

    int width = m.get_num_cols();
    int height = m.get_num_rows();

    CUdeviceptr result;

    // I put this in for debugging, and may need it again someday soon...
//  Cuda_size_t free, total;
//  CU_EPETE(cuMemGetInfo(&free, &total));
//  std::cout << "Available memory: " << free << "/" << total << std::endl;

    CU_ETX(cuMemAllocPitch(
        &result,
        &pitch, // pitch, in bytes
        width * sizeof(T),
        height,
        sizeof(T)));

    // stage the matrix in a host buffer with the same pitch as the device allocation
    boost::scoped_array<T> data(to_pitch<Matrix_type, T>(m, pitch, flip_y));

    // describe a host-to-device 2D copy
    CUDA_MEMCPY2D cpy_meta;
    cpy_meta.srcMemoryType = CU_MEMORYTYPE_HOST;
    cpy_meta.srcXInBytes = 0;
    cpy_meta.srcY = 0;
    cpy_meta.srcHost = data.get();
    cpy_meta.srcDevice = 0;
    cpy_meta.srcArray = 0;
    cpy_meta.srcPitch = pitch;
    cpy_meta.dstXInBytes = 0;
    cpy_meta.dstY = 0;
    cpy_meta.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy_meta.dstHost = 0;
    cpy_meta.dstDevice = result;
    cpy_meta.dstArray = 0;
    cpy_meta.dstPitch = pitch;
    cpy_meta.WidthInBytes = pitch; // width * sizeof(float);
    cpy_meta.Height = height;

    CU_ETX(cuMemcpy2D(&cpy_meta));

#ifdef TEST
    // copy the data back and check it against the original matrix
    boost::scoped_array<T> o_data(new T[width * height]);

    cpy_meta.srcXInBytes = 0;
    cpy_meta.srcY = 0;
    cpy_meta.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy_meta.srcHost = 0;
    cpy_meta.srcDevice = result;
    cpy_meta.srcArray = 0;
    cpy_meta.srcPitch = pitch;
    cpy_meta.dstMemoryType = CU_MEMORYTYPE_HOST;
    cpy_meta.dstXInBytes = 0;
    cpy_meta.dstY = 0;
    cpy_meta.dstHost = o_data.get();
    cpy_meta.dstDevice = 0;
    cpy_meta.dstArray = 0;
    cpy_meta.dstPitch = 0;
    cpy_meta.WidthInBytes = width * sizeof(T);
    cpy_meta.Height = height;

    CU_ETX(cuMemcpy2D(&cpy_meta));

    for(int row = 0; row < height; row++)
    for(int col = 0; col < width; col++)
    {
        int rev_row = height - row - 1;
        int in_i = row * pitch / sizeof(T) + col;
        int out_i = row * width + col;
        assert(data[in_i] == o_data[out_i]);
        if(flip_y)
            assert(fabs(m(rev_row, col) - o_data[out_i]) <= fabs(m(rev_row, col)) * FLT_EPSILON);
        else
            assert(fabs(m(row, col) - o_data[out_i]) <= fabs(m(row, col)) * FLT_EPSILON);
    }
#endif

    return result;
}
#endif
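
// Example (a minimal sketch, assuming kjb::Matrix is visible through the
// library headers): upload a matrix as single-precision pitched device
// memory. The returned pitch is in bytes and the caller frees the allocation.
#if 0
kjb::Matrix m(480, 640, 1.0);
#if CUDA_VERSION < 3020
unsigned int pitch;
#else
size_t pitch;
#endif
CUdeviceptr d_m = create_cuda_pitch<kjb::Matrix, float>(m, pitch, false);
// ... launch kernels that read d_m with row stride pitch / sizeof(float) ...
cuMemFree(d_m);
#endif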


//const size_t Cuda_reduce_module::MAX_THREADS = 256;
//const size_t Cuda_reduce_module::MAX_BLOCKS = 64;

} // namespace gpu
} // namespace kjb


#endif

// vim: tabstop=4 shiftwidth=4 foldmethod=marker