KJB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
gpu_cuda_util.h
Go to the documentation of this file.
1 /* $Id: gpu_cuda_util.h 10606 2011-09-29 19:50:30Z predoehl $ */
2 /* {{{=========================================================================== *
3  |
4  | Copyright (c) 1994-2010 by Kobus Barnard (author)
5  |
6  | Personal and educational use of this code is granted, provided that this
7  | header is kept intact, and that the authorship is not misrepresented, that
8  | its use is acknowledged in publications, and relevant papers are cited.
9  |
10  | For other use contact the author (kobus AT cs DOT arizona DOT edu).
11  |
12  | Please note that the code in this file has not necessarily been adequately
13  | tested. Naturally, there is no guarantee of performance, support, or fitness
14  | for any particular task. Nonetheless, I am interested in hearing about
15  | problems that you encounter.
16  |
17  | Author: Kyle Simek
18  * =========================================================================== }}}*/
19 
20 #ifndef KJB_CUDA_UTIL_H
21 #define KJB_CUDA_UTIL_H
22 
23 #include <string>
24 #include <vector>
25 
26 #include <boost/scoped_array.hpp>
27 #include <boost/shared_array.hpp>
28 
29 #include <m_cpp/m_vector.h>
30 #include <l_cpp/l_exception.h>
31 
32 #include <gpu_cpp/gpu_cuda.h>
33 
34 #undef major
35 #undef minor
36 
37 namespace kjb
38 {
39 namespace gpu
40 {
41 
42 
47 #ifdef KJB_HAVE_CUDA
48 class Cuda_utility_module : public Cuda_base_module
49 {
50 private:
51 typedef Cuda_base_module Base;
52 
53 public:
54  Cuda_utility_module();
55 
56  void load_functions_()
57  {
58 #define KJB_LOAD_FUNCTION__(a) cuModuleGetFunction(&a##_func, get_handle_(), #a);
59  CU_ETX(KJB_LOAD_FUNCTION__(ow_ew_multiply_float));
60  CU_ETX(KJB_LOAD_FUNCTION__(ow_ew_multiply_uint));
61  CU_ETX(KJB_LOAD_FUNCTION__(ow_ew_multiply_int));
62  CU_ETX(KJB_LOAD_FUNCTION__(ow_ew_multiply_uint_float));
63  CU_ETX(KJB_LOAD_FUNCTION__(detect_changes_float));
64  CU_ETX(KJB_LOAD_FUNCTION__(detect_changes_uint));
65  CU_ETX(KJB_LOAD_FUNCTION__(detect_changes_int));
66 #undef KJB_LOAD_FUNCTION__
67  }
68 
69 
70 
71  CUfunction get_ow_ew_multiply_func(const unsigned int&, const float&)
72  {
73  return ow_ew_multiply_uint_float_func;
74  }
75 
76  CUfunction get_ow_ew_multiply_func(const float&, const float&)
77  {
78  return ow_ew_multiply_float_func;
79  }
80 
81  CUfunction get_ow_ew_multiply_func(const unsigned int&, const unsigned int&)
82  {
83  return ow_ew_multiply_uint_func;
84  }
85 
86  CUfunction get_ow_ew_multiply_func(const int&, const int&)
87  {
88  return ow_ew_multiply_int_func;
89  }
90 
91 
92 
93  CUfunction get_detect_changes_func(const float&)
94  {
95  return detect_changes_float_func;
96  }
97 
98  CUfunction get_detect_changes_func(const unsigned int&)
99  {
100  return detect_changes_uint_func;
101  }
102 
103  CUfunction get_detect_changes_func(const int&)
104  {
105  return detect_changes_int_func;
106  }
107 
108 
109 
110  template <class T1, class T2>
111  void ow_ew_multiply(CUdeviceptr v1, CUdeviceptr v2, unsigned int N)
112  {
113  using boost::scoped_array;
114  using kjb_c::kjb_debug_level;
115  using kjb_c::add_error;
116 
117  CUcontext ctx;
118  cuCtxAttach(&ctx, 0);
119 
120  size_t blocks, threads;
121 
122  threads = 512;
123  blocks = N / threads + (N % threads ? 1 : 0);
124 
125  assert(blocks < MAX_BLOCKS);
126 
127  // function
128  CUfunction function = get_ow_ew_multiply_func(T1(), T2());
129 
130  /* Call Kernel */
131  #define ALIGN_UP(offset, alignment) \
132  (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
133  {
134 
135 
136  int offset = 0;
137  void* ptr;
138 
139  ptr = (void*)(size_t) v1;
140  ALIGN_UP(offset, __alignof(ptr));
141  ETX(cuParamSetv(function, offset, &ptr, sizeof(ptr)));
142  offset += sizeof(ptr);
143 
144  ptr = (void*)(size_t) v2;
145  ALIGN_UP(offset, __alignof(ptr));
146  ETX(cuParamSetv(function, offset, &ptr, sizeof(ptr)));
147  offset += sizeof(ptr);
148 
149  ALIGN_UP(offset, __alignof(N));
150  ETX(cuParamSeti(function, offset, N));
151  offset += sizeof(N);
152 
153  ETX(cuParamSetSize(function, offset));
154 
155  ETX(cuFuncSetBlockShape(function, threads, 1, 1));
156  ETX(cuLaunchGrid(function, blocks, 1));
157  }
158  #undef ALIGN_UP
159 
160  ETX(cuCtxDetach(ctx));
161  }
162 
163  template <class T>
164  void detect_changes(CUdeviceptr d_in, unsigned int N)
165  {
166  using boost::scoped_array;
167  using kjb_c::kjb_debug_level;
168  using kjb_c::add_error;
169 
170  CUcontext ctx;
171  cuCtxAttach(&ctx, 0);
172 
173  size_t blocks, threads;
174 
175  threads = 512;
176  blocks = N / threads + (N % threads ? 1 : 0);
177 
178  assert(blocks < MAX_BLOCKS);
179 
180  // function
181  CUfunction function = get_detect_changes_func(T());
182 
183  /* Call Kernel */
184  #define ALIGN_UP(offset, alignment) \
185  (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
186  {
187  int offset = 0;
188  void* ptr;
189 
190  ptr = (void*)(size_t) d_in;
191  ALIGN_UP(offset, __alignof(ptr));
192  ETX(cuParamSetv(function, offset, &ptr, sizeof(ptr)));
193  offset += sizeof(ptr);
194 
195  ALIGN_UP(offset, __alignof(N));
196  ETX(cuParamSeti(function, offset, N));
197  offset += sizeof(N);
198 
199  ETX(cuParamSetSize(function, offset));
200 
201  ETX(cuFuncSetBlockShape(function, threads, 1, 1));
202  ETX(cuLaunchGrid(function, blocks, 1));
203  }
204  #undef ALIGN_UP
205 
206  ETX(cuCtxDetach(ctx));
207  }
208 private:
209 
210 
211  CUfunction ow_ew_multiply_uint_float_func;
212 
213  CUfunction ow_ew_multiply_float_func;
214  CUfunction ow_ew_multiply_uint_func;
215  CUfunction ow_ew_multiply_int_func;
216 
217  CUfunction detect_changes_float_func;
218  CUfunction detect_changes_uint_func;
219  CUfunction detect_changes_int_func;
220 
221 
222  // TODO: Make these adjustible, or at least check device capabilities for these.
223  // They are taken from my Quadro NV 140M and provide reasonable lower bounds, though.
224  static const size_t MAX_THREADS = 512;
225  static const size_t MAX_BLOCKS = 65535;
226 };
227 #endif
228 
229 
230 } // namespace gpu
231 } // namespace kjb
232 
233 
234 
235 #endif
236 
237 
238 // vim: tabstop=4 shiftwidth=4 foldmethod=marker
239 
#define ETX(a)
Definition: l_exception.h:67
Support for error handling exception classes in libKJB.
Definition for the Vector class, a thin wrapper on the KJB Vector struct and its related functionalit...