Go to the documentation of this file. 1 static const char* reduce_ptx =
3 " .target sm_10, map_f64_to_f32\n"
4 " // compiled with /usr/local/cuda/open64/lib//be\n"
5 " // nvopencc 3.1 built on 2010-06-07\n"
7 " //-----------------------------------------------------------\n"
8 " // Compiling /tmp/tmpxft_00007884_00000000-7_reduce.cpp3.i (/tmp/ccBI#.LV1fMO)\n"
9 " //-----------------------------------------------------------\n"
11 " //-----------------------------------------------------------\n"
13 " //-----------------------------------------------------------\n"
14 " // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64\n"
15 " // -O3 (Optimization level)\n"
16 " // -g0 (Debug level)\n"
17 " // -m2 (Report advisories)\n"
18 " //-----------------------------------------------------------\n"
20 " .file 1 \"<command-line>\"\n"
21 " .file 2 \"/tmp/tmpxft_00007884_00000000-6_reduce.cudafe2.gpu\"\n"
22 " .file 3 \"reduce.cu\"\n"
23 " .file 4 \"/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h\"\n"
24 " .file 5 \"/usr/local/cuda/bin/../include/crt/device_runtime.h\"\n"
25 " .file 6 \"/usr/local/cuda/bin/../include/host_defines.h\"\n"
26 " .file 7 \"/usr/local/cuda/bin/../include/builtin_types.h\"\n"
27 " .file 8 \"/usr/local/cuda/bin/../include/device_types.h\"\n"
28 " .file 9 \"/usr/local/cuda/bin/../include/driver_types.h\"\n"
29 " .file 10 \"/usr/local/cuda/bin/../include/surface_types.h\"\n"
30 " .file 11 \"/usr/local/cuda/bin/../include/texture_types.h\"\n"
31 " .file 12 \"/usr/local/cuda/bin/../include/vector_types.h\"\n"
32 " .file 13 \"/usr/local/cuda/bin/../include/device_launch_parameters.h\"\n"
33 " .file 14 \"/usr/local/cuda/bin/../include/crt/storage_class.h\"\n"
34 " .file 15 \"/usr/include/bits/types.h\"\n"
35 " .file 16 \"/usr/include/time.h\"\n"
36 " .file 17 \"/usr/local/cuda/bin/../include/common_functions.h\"\n"
37 " .file 18 \"/usr/local/cuda/bin/../include/math_functions.h\"\n"
38 " .file 19 \"/usr/local/cuda/bin/../include/math_constants.h\"\n"
39 " .file 20 \"/usr/local/cuda/bin/../include/device_functions.h\"\n"
40 " .file 21 \"/usr/local/cuda/bin/../include/sm_11_atomic_functions.h\"\n"
41 " .file 22 \"/usr/local/cuda/bin/../include/sm_12_atomic_functions.h\"\n"
42 " .file 23 \"/usr/local/cuda/bin/../include/sm_13_double_functions.h\"\n"
43 " .file 24 \"/usr/local/cuda/bin/../include/sm_20_atomic_functions.h\"\n"
44 " .file 25 \"/usr/local/cuda/bin/../include/sm_20_intrinsics.h\"\n"
45 " .file 26 \"/usr/local/cuda/bin/../include/surface_functions.h\"\n"
46 " .file 27 \"/usr/local/cuda/bin/../include/texture_fetch_functions.h\"\n"
47 " .file 28 \"/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h\"\n"
49 " .extern .shared .align 4 .b8 __smem[];\n"
50 " .tex .u64 tex_ref_1;\n"
51 " .tex .u64 tex_ref_2;\n"
53 " .entry chamfer_and_reduce (\n"
54 " .param .u64 __cudaparm_chamfer_and_reduce_g_idata_1,\n"
55 " .param .u64 __cudaparm_chamfer_and_reduce_g_idata_2,\n"
56 " .param .u64 __cudaparm_chamfer_and_reduce_g_odata,\n"
57 " .param .u32 __cudaparm_chamfer_and_reduce_n)\n"
59 " .reg .u16 %rh<3>;\n"
60 " .reg .u32 %r<14>;\n"
61 " .reg .u64 %rd<18>;\n"
62 " .reg .f32 %f<23>;\n"
63 " .reg .pred %p<9>;\n"
65 "$LDWbegin_chamfer_and_reduce:\n"
67 " cvt.u32.u16 %r1, %ctaid.x;\n"
68 " mul.lo.u32 %r2, %r1, 512;\n"
69 " cvt.u32.u16 %r3, %tid.x;\n"
70 " add.u32 %r4, %r2, %r3;\n"
71 " ld.param.u32 %r5, [__cudaparm_chamfer_and_reduce_n];\n"
72 " setp.ge.u32 %p1, %r4, %r5;\n"
73 " @%p1 bra $Lt_0_18178;\n"
74 " add.u32 %r6, %r4, 256;\n"
75 " ld.param.u32 %r5, [__cudaparm_chamfer_and_reduce_n];\n"
76 " add.u32 %r7, %r5, 256;\n"
77 " mov.u16 %rh1, %nctaid.x;\n"
78 " mul.wide.u16 %r8, %rh1, 512;\n"
79 " cvt.u64.u32 %rd1, %r4;\n"
80 " mul.wide.u32 %rd2, %r4, 4;\n"
81 " cvt.s64.u32 %rd3, %r8;\n"
82 " ld.param.u64 %rd4, [__cudaparm_chamfer_and_reduce_g_idata_1];\n"
83 " add.u64 %rd5, %rd4, %rd2;\n"
84 " mul.wide.u32 %rd6, %r8, 4;\n"
85 " ld.param.u64 %rd7, [__cudaparm_chamfer_and_reduce_g_idata_2];\n"
86 " add.u64 %rd8, %rd7, %rd2;\n"
87 " mov.f32 %f1, 0f00000000; // 0\n"
89 " //<loop> Loop body line 105, nesting depth: 1, estimated iterations: unknown\n"
91 " ld.global.f32 %f2, [%rd5+0];\n"
92 " ld.global.f32 %f3, [%rd8+0];\n"
93 " mad.f32 %f1, %f2, %f3, %f1;\n"
95 " ld.param.u32 %r5, [__cudaparm_chamfer_and_reduce_n];\n"
97 " setp.ge.u32 %p2, %r6, %r5;\n"
98 " @%p2 bra $Lt_0_15362;\n"
99 " //<loop> Part of loop body line 105, head labeled $Lt_0_15106\n"
101 " ld.global.f32 %f4, [%rd5+1024];\n"
102 " ld.global.f32 %f5, [%rd8+1024];\n"
103 " mad.f32 %f1, %f4, %f5, %f1;\n"
105 " //<loop> Part of loop body line 105, head labeled $Lt_0_15106\n"
106 " add.u32 %r6, %r6, %r8;\n"
107 " add.u64 %rd8, %rd8, %rd6;\n"
108 " add.u64 %rd5, %rd5, %rd6;\n"
109 " setp.lt.u32 %p3, %r6, %r7;\n"
110 " @%p3 bra $Lt_0_15106;\n"
111 " bra.uni $Lt_0_14594;\n"
113 " mov.f32 %f1, 0f00000000; // 0\n"
116 " mov.f32 %f6, %f1;\n"
117 " mov.f32 %f7, %f6;\n"
119 " mov.u64 %rd9, __smem;\n"
120 " cvt.u64.u32 %rd10, %r3;\n"
121 " mul.wide.u32 %rd11, %r3, 4;\n"
122 " add.u64 %rd12, %rd9, %rd11;\n"
123 " st.volatile.shared.f32 [%rd12+0], %f6;\n"
126 " mov.u32 %r9, 127;\n"
127 " setp.gt.u32 %p4, %r3, %r9;\n"
128 " @%p4 bra $Lt_0_16130;\n"
130 " ld.volatile.shared.f32 %f8, [%rd12+512];\n"
131 " add.f32 %f7, %f8, %f6;\n"
132 " st.volatile.shared.f32 [%rd12+0], %f7;\n"
135 " mov.u32 %r10, 63;\n"
136 " setp.gt.u32 %p5, %r3, %r10;\n"
137 " @%p5 bra $Lt_0_16642;\n"
139 " ld.volatile.shared.f32 %f9, [%rd12+256];\n"
140 " add.f32 %f7, %f9, %f7;\n"
141 " st.volatile.shared.f32 [%rd12+0], %f7;\n"
144 " mov.u32 %r11, 31;\n"
145 " setp.gt.u32 %p6, %r3, %r11;\n"
146 " @%p6 bra $Lt_0_17154;\n"
148 " ld.volatile.shared.f32 %f10, [%rd12+128];\n"
149 " add.f32 %f11, %f10, %f7;\n"
150 " st.volatile.shared.f32 [%rd12+0], %f11;\n"
152 " ld.volatile.shared.f32 %f12, [%rd12+64];\n"
153 " add.f32 %f13, %f12, %f11;\n"
154 " st.volatile.shared.f32 [%rd12+0], %f13;\n"
156 " ld.volatile.shared.f32 %f14, [%rd12+32];\n"
157 " add.f32 %f15, %f14, %f13;\n"
158 " st.volatile.shared.f32 [%rd12+0], %f15;\n"
160 " ld.volatile.shared.f32 %f16, [%rd12+16];\n"
161 " add.f32 %f17, %f16, %f15;\n"
162 " st.volatile.shared.f32 [%rd12+0], %f17;\n"
164 " ld.volatile.shared.f32 %f18, [%rd12+8];\n"
165 " add.f32 %f19, %f18, %f17;\n"
166 " st.volatile.shared.f32 [%rd12+0], %f19;\n"
168 " ld.volatile.shared.f32 %f20, [%rd12+4];\n"
169 " add.f32 %f7, %f20, %f19;\n"
170 " st.volatile.shared.f32 [%rd12+0], %f7;\n"
173 " mov.u32 %r12, 0;\n"
174 " setp.ne.u32 %p7, %r3, %r12;\n"
175 " @%p7 bra $Lt_0_17666;\n"
177 " ld.shared.f32 %f21, [__smem+0];\n"
178 " ld.param.u64 %rd13, [__cudaparm_chamfer_and_reduce_g_odata];\n"
179 " cvt.u64.u32 %rd14, %r1;\n"
180 " mul.wide.u32 %rd15, %r1, 4;\n"
181 " add.u64 %rd16, %rd13, %rd15;\n"
182 " st.global.f32 [%rd16+0], %f21;\n"
186 "$LDWend_chamfer_and_reduce:\n"
187 " } // chamfer_and_reduce\n"
189 " .entry squared_chamfer_and_reduce (\n"
190 " .param .u64 __cudaparm_squared_chamfer_and_reduce_g_idata_1,\n"
191 " .param .u64 __cudaparm_squared_chamfer_and_reduce_g_idata_2,\n"
192 " .param .u64 __cudaparm_squared_chamfer_and_reduce_g_odata,\n"
193 " .param .u32 __cudaparm_squared_chamfer_and_reduce_n)\n"
195 " .reg .u16 %rh<3>;\n"
196 " .reg .u32 %r<14>;\n"
197 " .reg .u64 %rd<18>;\n"
198 " .reg .f32 %f<25>;\n"
199 " .reg .pred %p<9>;\n"
201 "$LDWbegin_squared_chamfer_and_reduce:\n"
203 " cvt.u32.u16 %r1, %ctaid.x;\n"
204 " mul.lo.u32 %r2, %r1, 512;\n"
205 " cvt.u32.u16 %r3, %tid.x;\n"
206 " add.u32 %r4, %r2, %r3;\n"
207 " ld.param.u32 %r5, [__cudaparm_squared_chamfer_and_reduce_n];\n"
208 " setp.ge.u32 %p1, %r4, %r5;\n"
209 " @%p1 bra $Lt_1_18178;\n"
210 " add.u32 %r6, %r4, 256;\n"
211 " ld.param.u32 %r5, [__cudaparm_squared_chamfer_and_reduce_n];\n"
212 " add.u32 %r7, %r5, 256;\n"
213 " mov.u16 %rh1, %nctaid.x;\n"
214 " mul.wide.u16 %r8, %rh1, 512;\n"
215 " cvt.u64.u32 %rd1, %r4;\n"
216 " mul.wide.u32 %rd2, %r4, 4;\n"
217 " cvt.s64.u32 %rd3, %r8;\n"
218 " ld.param.u64 %rd4, [__cudaparm_squared_chamfer_and_reduce_g_idata_1];\n"
219 " add.u64 %rd5, %rd4, %rd2;\n"
220 " mul.wide.u32 %rd6, %r8, 4;\n"
221 " ld.param.u64 %rd7, [__cudaparm_squared_chamfer_and_reduce_g_idata_2];\n"
222 " add.u64 %rd8, %rd7, %rd2;\n"
223 " mov.f32 %f1, 0f00000000; // 0\n"
225 " //<loop> Loop body line 105, nesting depth: 1, estimated iterations: unknown\n"
227 " ld.global.f32 %f2, [%rd5+0];\n"
228 " ld.global.f32 %f3, [%rd8+0];\n"
229 " mul.f32 %f4, %f2, %f3;\n"
231 " mad.f32 %f1, %f4, %f4, %f1;\n"
233 " ld.param.u32 %r5, [__cudaparm_squared_chamfer_and_reduce_n];\n"
235 " setp.ge.u32 %p2, %r6, %r5;\n"
236 " @%p2 bra $Lt_1_15362;\n"
237 " //<loop> Part of loop body line 105, head labeled $Lt_1_15106\n"
239 " ld.global.f32 %f5, [%rd5+1024];\n"
240 " ld.global.f32 %f6, [%rd8+1024];\n"
241 " mul.f32 %f7, %f5, %f6;\n"
243 " mad.f32 %f1, %f7, %f7, %f1;\n"
245 " //<loop> Part of loop body line 105, head labeled $Lt_1_15106\n"
246 " add.u32 %r6, %r6, %r8;\n"
247 " add.u64 %rd8, %rd8, %rd6;\n"
248 " add.u64 %rd5, %rd5, %rd6;\n"
249 " setp.lt.u32 %p3, %r6, %r7;\n"
250 " @%p3 bra $Lt_1_15106;\n"
251 " bra.uni $Lt_1_14594;\n"
253 " mov.f32 %f1, 0f00000000; // 0\n"
256 " mov.f32 %f8, %f1;\n"
257 " mov.f32 %f9, %f8;\n"
259 " mov.u64 %rd9, __smem;\n"
260 " cvt.u64.u32 %rd10, %r3;\n"
261 " mul.wide.u32 %rd11, %r3, 4;\n"
262 " add.u64 %rd12, %rd9, %rd11;\n"
263 " st.volatile.shared.f32 [%rd12+0], %f8;\n"
266 " mov.u32 %r9, 127;\n"
267 " setp.gt.u32 %p4, %r3, %r9;\n"
268 " @%p4 bra $Lt_1_16130;\n"
270 " ld.volatile.shared.f32 %f10, [%rd12+512];\n"
271 " add.f32 %f9, %f10, %f8;\n"
272 " st.volatile.shared.f32 [%rd12+0], %f9;\n"
275 " mov.u32 %r10, 63;\n"
276 " setp.gt.u32 %p5, %r3, %r10;\n"
277 " @%p5 bra $Lt_1_16642;\n"
279 " ld.volatile.shared.f32 %f11, [%rd12+256];\n"
280 " add.f32 %f9, %f11, %f9;\n"
281 " st.volatile.shared.f32 [%rd12+0], %f9;\n"
284 " mov.u32 %r11, 31;\n"
285 " setp.gt.u32 %p6, %r3, %r11;\n"
286 " @%p6 bra $Lt_1_17154;\n"
288 " ld.volatile.shared.f32 %f12, [%rd12+128];\n"
289 " add.f32 %f13, %f12, %f9;\n"
290 " st.volatile.shared.f32 [%rd12+0], %f13;\n"
292 " ld.volatile.shared.f32 %f14, [%rd12+64];\n"
293 " add.f32 %f15, %f14, %f13;\n"
294 " st.volatile.shared.f32 [%rd12+0], %f15;\n"
296 " ld.volatile.shared.f32 %f16, [%rd12+32];\n"
297 " add.f32 %f17, %f16, %f15;\n"
298 " st.volatile.shared.f32 [%rd12+0], %f17;\n"
300 " ld.volatile.shared.f32 %f18, [%rd12+16];\n"
301 " add.f32 %f19, %f18, %f17;\n"
302 " st.volatile.shared.f32 [%rd12+0], %f19;\n"
304 " ld.volatile.shared.f32 %f20, [%rd12+8];\n"
305 " add.f32 %f21, %f20, %f19;\n"
306 " st.volatile.shared.f32 [%rd12+0], %f21;\n"
308 " ld.volatile.shared.f32 %f22, [%rd12+4];\n"
309 " add.f32 %f9, %f22, %f21;\n"
310 " st.volatile.shared.f32 [%rd12+0], %f9;\n"
313 " mov.u32 %r12, 0;\n"
314 " setp.ne.u32 %p7, %r3, %r12;\n"
315 " @%p7 bra $Lt_1_17666;\n"
317 " ld.shared.f32 %f23, [__smem+0];\n"
318 " ld.param.u64 %rd13, [__cudaparm_squared_chamfer_and_reduce_g_odata];\n"
319 " cvt.u64.u32 %rd14, %r1;\n"
320 " mul.wide.u32 %rd15, %r1, 4;\n"
321 " add.u64 %rd16, %rd13, %rd15;\n"
322 " st.global.f32 [%rd16+0], %f23;\n"
326 "$LDWend_squared_chamfer_and_reduce:\n"
327 " } // squared_chamfer_and_reduce\n"
329 " .entry reduce_float_1_true (\n"
330 " .param .u64 __cudaparm_reduce_float_1_true_g_idata,\n"
331 " .param .u64 __cudaparm_reduce_float_1_true_g_odata,\n"
332 " .param .u32 __cudaparm_reduce_float_1_true_n)\n"
334 " .reg .u16 %rh<3>;\n"
335 " .reg .u32 %r<10>;\n"
336 " .reg .u64 %rd<16>;\n"
337 " .reg .f32 %f<7>;\n"
338 " .reg .pred %p<5>;\n"
340 "$LDWbegin_reduce_float_1_true:\n"
342 " cvt.u32.u16 %r1, %ctaid.x;\n"
343 " mul24.lo.u32 %r2, %r1, 2;\n"
344 " cvt.u32.u16 %r3, %tid.x;\n"
345 " add.u32 %r4, %r2, %r3;\n"
346 " mov.s32 %r5, %r4;\n"
347 " ld.param.u32 %r6, [__cudaparm_reduce_float_1_true_n];\n"
348 " setp.ge.u32 %p1, %r4, %r6;\n"
349 " @%p1 bra $Lt_2_16642;\n"
350 " mov.u16 %rh1, %nctaid.x;\n"
351 " mul.wide.u16 %r7, %rh1, 2;\n"
352 " cvt.s64.u32 %rd1, %r7;\n"
353 " ld.param.u64 %rd2, [__cudaparm_reduce_float_1_true_g_idata];\n"
354 " cvt.u64.u32 %rd3, %r4;\n"
355 " mul.wide.u32 %rd4, %r4, 4;\n"
356 " add.u64 %rd5, %rd2, %rd4;\n"
357 " mul.wide.u32 %rd6, %r7, 4;\n"
358 " mov.f32 %f1, 0f00000000; // 0\n"
360 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
362 " ld.global.f32 %f2, [%rd5+0];\n"
363 " add.f32 %f3, %f2, %f1;\n"
365 " ld.global.f32 %f4, [%rd5+4];\n"
366 " add.f32 %f1, %f4, %f3;\n"
367 " add.u32 %r5, %r7, %r5;\n"
368 " add.u64 %rd5, %rd5, %rd6;\n"
370 " ld.param.u32 %r6, [__cudaparm_reduce_float_1_true_n];\n"
372 " setp.lt.u32 %p2, %r5, %r6;\n"
373 " @%p2 bra $Lt_2_15618;\n"
374 " bra.uni $Lt_2_15106;\n"
376 " mov.f32 %f1, 0f00000000; // 0\n"
379 " mov.u64 %rd7, __smem;\n"
380 " cvt.u64.u32 %rd8, %r3;\n"
381 " mul.wide.u32 %rd9, %r3, 4;\n"
382 " add.u64 %rd10, %rd7, %rd9;\n"
383 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
388 " setp.ne.u32 %p3, %r3, %r8;\n"
389 " @%p3 bra $Lt_2_16130;\n"
391 " ld.shared.f32 %f5, [__smem+0];\n"
392 " ld.param.u64 %rd11, [__cudaparm_reduce_float_1_true_g_odata];\n"
393 " cvt.u64.u32 %rd12, %r1;\n"
394 " mul.wide.u32 %rd13, %r1, 4;\n"
395 " add.u64 %rd14, %rd11, %rd13;\n"
396 " st.global.f32 [%rd14+0], %f5;\n"
400 "$LDWend_reduce_float_1_true:\n"
401 " } // reduce_float_1_true\n"
403 " .entry reduce_float_2_true (\n"
404 " .param .u64 __cudaparm_reduce_float_2_true_g_idata,\n"
405 " .param .u64 __cudaparm_reduce_float_2_true_g_odata,\n"
406 " .param .u32 __cudaparm_reduce_float_2_true_n)\n"
408 " .reg .u16 %rh<3>;\n"
409 " .reg .u32 %r<11>;\n"
410 " .reg .u64 %rd<16>;\n"
411 " .reg .f32 %f<9>;\n"
412 " .reg .pred %p<6>;\n"
414 "$LDWbegin_reduce_float_2_true:\n"
416 " cvt.u32.u16 %r1, %ctaid.x;\n"
417 " mul24.lo.u32 %r2, %r1, 4;\n"
418 " cvt.u32.u16 %r3, %tid.x;\n"
419 " add.u32 %r4, %r2, %r3;\n"
420 " mov.s32 %r5, %r4;\n"
421 " ld.param.u32 %r6, [__cudaparm_reduce_float_2_true_n];\n"
422 " setp.ge.u32 %p1, %r4, %r6;\n"
423 " @%p1 bra $Lt_3_16898;\n"
424 " mov.u16 %rh1, %nctaid.x;\n"
425 " mul.wide.u16 %r7, %rh1, 4;\n"
426 " cvt.s64.u32 %rd1, %r7;\n"
427 " ld.param.u64 %rd2, [__cudaparm_reduce_float_2_true_g_idata];\n"
428 " cvt.u64.u32 %rd3, %r4;\n"
429 " mul.wide.u32 %rd4, %r4, 4;\n"
430 " add.u64 %rd5, %rd2, %rd4;\n"
431 " mul.wide.u32 %rd6, %r7, 4;\n"
432 " mov.f32 %f1, 0f00000000; // 0\n"
434 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
436 " ld.global.f32 %f2, [%rd5+0];\n"
437 " add.f32 %f3, %f2, %f1;\n"
439 " ld.global.f32 %f4, [%rd5+8];\n"
440 " add.f32 %f1, %f4, %f3;\n"
441 " add.u32 %r5, %r7, %r5;\n"
442 " add.u64 %rd5, %rd5, %rd6;\n"
444 " ld.param.u32 %r6, [__cudaparm_reduce_float_2_true_n];\n"
446 " setp.lt.u32 %p2, %r5, %r6;\n"
447 " @%p2 bra $Lt_3_15362;\n"
448 " bra.uni $Lt_3_14850;\n"
450 " mov.f32 %f1, 0f00000000; // 0\n"
453 " mov.u64 %rd7, __smem;\n"
454 " cvt.u64.u32 %rd8, %r3;\n"
455 " mul.wide.u32 %rd9, %r3, 4;\n"
456 " add.u64 %rd10, %rd7, %rd9;\n"
457 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
460 " mov.u32 %r8, 31;\n"
461 " setp.gt.u32 %p3, %r3, %r8;\n"
462 " @%p3 bra $Lt_3_15874;\n"
464 " ld.volatile.shared.f32 %f5, [%rd10+4];\n"
465 " add.f32 %f6, %f5, %f1;\n"
466 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
470 " setp.ne.u32 %p4, %r3, %r9;\n"
471 " @%p4 bra $Lt_3_16386;\n"
473 " ld.shared.f32 %f7, [__smem+0];\n"
474 " ld.param.u64 %rd11, [__cudaparm_reduce_float_2_true_g_odata];\n"
475 " cvt.u64.u32 %rd12, %r1;\n"
476 " mul.wide.u32 %rd13, %r1, 4;\n"
477 " add.u64 %rd14, %rd11, %rd13;\n"
478 " st.global.f32 [%rd14+0], %f7;\n"
482 "$LDWend_reduce_float_2_true:\n"
483 " } // reduce_float_2_true\n"
485 " .entry reduce_float_4_true (\n"
486 " .param .u64 __cudaparm_reduce_float_4_true_g_idata,\n"
487 " .param .u64 __cudaparm_reduce_float_4_true_g_odata,\n"
488 " .param .u32 __cudaparm_reduce_float_4_true_n)\n"
490 " .reg .u16 %rh<3>;\n"
491 " .reg .u32 %r<11>;\n"
492 " .reg .u64 %rd<16>;\n"
493 " .reg .f32 %f<11>;\n"
494 " .reg .pred %p<6>;\n"
496 "$LDWbegin_reduce_float_4_true:\n"
498 " cvt.u32.u16 %r1, %ctaid.x;\n"
499 " mul24.lo.u32 %r2, %r1, 8;\n"
500 " cvt.u32.u16 %r3, %tid.x;\n"
501 " add.u32 %r4, %r2, %r3;\n"
502 " mov.s32 %r5, %r4;\n"
503 " ld.param.u32 %r6, [__cudaparm_reduce_float_4_true_n];\n"
504 " setp.ge.u32 %p1, %r4, %r6;\n"
505 " @%p1 bra $Lt_4_16642;\n"
506 " mov.u16 %rh1, %nctaid.x;\n"
507 " mul.wide.u16 %r7, %rh1, 8;\n"
508 " cvt.s64.u32 %rd1, %r7;\n"
509 " ld.param.u64 %rd2, [__cudaparm_reduce_float_4_true_g_idata];\n"
510 " cvt.u64.u32 %rd3, %r4;\n"
511 " mul.wide.u32 %rd4, %r4, 4;\n"
512 " add.u64 %rd5, %rd2, %rd4;\n"
513 " mul.wide.u32 %rd6, %r7, 4;\n"
514 " mov.f32 %f1, 0f00000000; // 0\n"
516 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
518 " ld.global.f32 %f2, [%rd5+0];\n"
519 " add.f32 %f3, %f2, %f1;\n"
521 " ld.global.f32 %f4, [%rd5+16];\n"
522 " add.f32 %f1, %f4, %f3;\n"
523 " add.u32 %r5, %r7, %r5;\n"
524 " add.u64 %rd5, %rd5, %rd6;\n"
526 " ld.param.u32 %r6, [__cudaparm_reduce_float_4_true_n];\n"
528 " setp.lt.u32 %p2, %r5, %r6;\n"
529 " @%p2 bra $Lt_4_15106;\n"
530 " bra.uni $Lt_4_14594;\n"
532 " mov.f32 %f1, 0f00000000; // 0\n"
535 " mov.u64 %rd7, __smem;\n"
536 " cvt.u64.u32 %rd8, %r3;\n"
537 " mul.wide.u32 %rd9, %r3, 4;\n"
538 " add.u64 %rd10, %rd7, %rd9;\n"
539 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
542 " mov.u32 %r8, 31;\n"
543 " setp.gt.u32 %p3, %r3, %r8;\n"
544 " @%p3 bra $Lt_4_15618;\n"
546 " ld.volatile.shared.f32 %f5, [%rd10+8];\n"
547 " add.f32 %f6, %f5, %f1;\n"
548 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
550 " ld.volatile.shared.f32 %f7, [%rd10+4];\n"
551 " add.f32 %f8, %f7, %f6;\n"
552 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
556 " setp.ne.u32 %p4, %r3, %r9;\n"
557 " @%p4 bra $Lt_4_16130;\n"
559 " ld.shared.f32 %f9, [__smem+0];\n"
560 " ld.param.u64 %rd11, [__cudaparm_reduce_float_4_true_g_odata];\n"
561 " cvt.u64.u32 %rd12, %r1;\n"
562 " mul.wide.u32 %rd13, %r1, 4;\n"
563 " add.u64 %rd14, %rd11, %rd13;\n"
564 " st.global.f32 [%rd14+0], %f9;\n"
568 "$LDWend_reduce_float_4_true:\n"
569 " } // reduce_float_4_true\n"
571 " .entry reduce_float_8_true (\n"
572 " .param .u64 __cudaparm_reduce_float_8_true_g_idata,\n"
573 " .param .u64 __cudaparm_reduce_float_8_true_g_odata,\n"
574 " .param .u32 __cudaparm_reduce_float_8_true_n)\n"
576 " .reg .u16 %rh<3>;\n"
577 " .reg .u32 %r<11>;\n"
578 " .reg .u64 %rd<16>;\n"
579 " .reg .f32 %f<13>;\n"
580 " .reg .pred %p<6>;\n"
582 "$LDWbegin_reduce_float_8_true:\n"
584 " cvt.u32.u16 %r1, %ctaid.x;\n"
585 " mul24.lo.u32 %r2, %r1, 16;\n"
586 " cvt.u32.u16 %r3, %tid.x;\n"
587 " add.u32 %r4, %r2, %r3;\n"
588 " mov.s32 %r5, %r4;\n"
589 " ld.param.u32 %r6, [__cudaparm_reduce_float_8_true_n];\n"
590 " setp.ge.u32 %p1, %r4, %r6;\n"
591 " @%p1 bra $Lt_5_16386;\n"
592 " mov.u16 %rh1, %nctaid.x;\n"
593 " mul.wide.u16 %r7, %rh1, 16;\n"
594 " cvt.s64.u32 %rd1, %r7;\n"
595 " ld.param.u64 %rd2, [__cudaparm_reduce_float_8_true_g_idata];\n"
596 " cvt.u64.u32 %rd3, %r4;\n"
597 " mul.wide.u32 %rd4, %r4, 4;\n"
598 " add.u64 %rd5, %rd2, %rd4;\n"
599 " mul.wide.u32 %rd6, %r7, 4;\n"
600 " mov.f32 %f1, 0f00000000; // 0\n"
602 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
604 " ld.global.f32 %f2, [%rd5+0];\n"
605 " add.f32 %f3, %f2, %f1;\n"
607 " ld.global.f32 %f4, [%rd5+32];\n"
608 " add.f32 %f1, %f4, %f3;\n"
609 " add.u32 %r5, %r7, %r5;\n"
610 " add.u64 %rd5, %rd5, %rd6;\n"
612 " ld.param.u32 %r6, [__cudaparm_reduce_float_8_true_n];\n"
614 " setp.lt.u32 %p2, %r5, %r6;\n"
615 " @%p2 bra $Lt_5_14850;\n"
616 " bra.uni $Lt_5_14338;\n"
618 " mov.f32 %f1, 0f00000000; // 0\n"
621 " mov.u64 %rd7, __smem;\n"
622 " cvt.u64.u32 %rd8, %r3;\n"
623 " mul.wide.u32 %rd9, %r3, 4;\n"
624 " add.u64 %rd10, %rd7, %rd9;\n"
625 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
628 " mov.u32 %r8, 31;\n"
629 " setp.gt.u32 %p3, %r3, %r8;\n"
630 " @%p3 bra $Lt_5_15362;\n"
632 " ld.volatile.shared.f32 %f5, [%rd10+16];\n"
633 " add.f32 %f6, %f5, %f1;\n"
634 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
636 " ld.volatile.shared.f32 %f7, [%rd10+8];\n"
637 " add.f32 %f8, %f7, %f6;\n"
638 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
640 " ld.volatile.shared.f32 %f9, [%rd10+4];\n"
641 " add.f32 %f10, %f9, %f8;\n"
642 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
646 " setp.ne.u32 %p4, %r3, %r9;\n"
647 " @%p4 bra $Lt_5_15874;\n"
649 " ld.shared.f32 %f11, [__smem+0];\n"
650 " ld.param.u64 %rd11, [__cudaparm_reduce_float_8_true_g_odata];\n"
651 " cvt.u64.u32 %rd12, %r1;\n"
652 " mul.wide.u32 %rd13, %r1, 4;\n"
653 " add.u64 %rd14, %rd11, %rd13;\n"
654 " st.global.f32 [%rd14+0], %f11;\n"
658 "$LDWend_reduce_float_8_true:\n"
659 " } // reduce_float_8_true\n"
661 " .entry reduce_float_16_true (\n"
662 " .param .u64 __cudaparm_reduce_float_16_true_g_idata,\n"
663 " .param .u64 __cudaparm_reduce_float_16_true_g_odata,\n"
664 " .param .u32 __cudaparm_reduce_float_16_true_n)\n"
666 " .reg .u16 %rh<3>;\n"
667 " .reg .u32 %r<11>;\n"
668 " .reg .u64 %rd<16>;\n"
669 " .reg .f32 %f<15>;\n"
670 " .reg .pred %p<6>;\n"
672 "$LDWbegin_reduce_float_16_true:\n"
674 " cvt.u32.u16 %r1, %ctaid.x;\n"
675 " mul24.lo.u32 %r2, %r1, 32;\n"
676 " cvt.u32.u16 %r3, %tid.x;\n"
677 " add.u32 %r4, %r2, %r3;\n"
678 " mov.s32 %r5, %r4;\n"
679 " ld.param.u32 %r6, [__cudaparm_reduce_float_16_true_n];\n"
680 " setp.ge.u32 %p1, %r4, %r6;\n"
681 " @%p1 bra $Lt_6_16130;\n"
682 " mov.u16 %rh1, %nctaid.x;\n"
683 " mul.wide.u16 %r7, %rh1, 32;\n"
684 " cvt.s64.u32 %rd1, %r7;\n"
685 " ld.param.u64 %rd2, [__cudaparm_reduce_float_16_true_g_idata];\n"
686 " cvt.u64.u32 %rd3, %r4;\n"
687 " mul.wide.u32 %rd4, %r4, 4;\n"
688 " add.u64 %rd5, %rd2, %rd4;\n"
689 " mul.wide.u32 %rd6, %r7, 4;\n"
690 " mov.f32 %f1, 0f00000000; // 0\n"
692 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
694 " ld.global.f32 %f2, [%rd5+0];\n"
695 " add.f32 %f3, %f2, %f1;\n"
697 " ld.global.f32 %f4, [%rd5+64];\n"
698 " add.f32 %f1, %f4, %f3;\n"
699 " add.u32 %r5, %r7, %r5;\n"
700 " add.u64 %rd5, %rd5, %rd6;\n"
702 " ld.param.u32 %r6, [__cudaparm_reduce_float_16_true_n];\n"
704 " setp.lt.u32 %p2, %r5, %r6;\n"
705 " @%p2 bra $Lt_6_14594;\n"
706 " bra.uni $Lt_6_14082;\n"
708 " mov.f32 %f1, 0f00000000; // 0\n"
711 " mov.u64 %rd7, __smem;\n"
712 " cvt.u64.u32 %rd8, %r3;\n"
713 " mul.wide.u32 %rd9, %r3, 4;\n"
714 " add.u64 %rd10, %rd7, %rd9;\n"
715 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
718 " mov.u32 %r8, 31;\n"
719 " setp.gt.u32 %p3, %r3, %r8;\n"
720 " @%p3 bra $Lt_6_15106;\n"
722 " ld.volatile.shared.f32 %f5, [%rd10+32];\n"
723 " add.f32 %f6, %f5, %f1;\n"
724 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
726 " ld.volatile.shared.f32 %f7, [%rd10+16];\n"
727 " add.f32 %f8, %f7, %f6;\n"
728 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
730 " ld.volatile.shared.f32 %f9, [%rd10+8];\n"
731 " add.f32 %f10, %f9, %f8;\n"
732 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
734 " ld.volatile.shared.f32 %f11, [%rd10+4];\n"
735 " add.f32 %f12, %f11, %f10;\n"
736 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
740 " setp.ne.u32 %p4, %r3, %r9;\n"
741 " @%p4 bra $Lt_6_15618;\n"
743 " ld.shared.f32 %f13, [__smem+0];\n"
744 " ld.param.u64 %rd11, [__cudaparm_reduce_float_16_true_g_odata];\n"
745 " cvt.u64.u32 %rd12, %r1;\n"
746 " mul.wide.u32 %rd13, %r1, 4;\n"
747 " add.u64 %rd14, %rd11, %rd13;\n"
748 " st.global.f32 [%rd14+0], %f13;\n"
752 "$LDWend_reduce_float_16_true:\n"
753 " } // reduce_float_16_true\n"
755 " .entry reduce_float_32_true (\n"
756 " .param .u64 __cudaparm_reduce_float_32_true_g_idata,\n"
757 " .param .u64 __cudaparm_reduce_float_32_true_g_odata,\n"
758 " .param .u32 __cudaparm_reduce_float_32_true_n)\n"
760 " .reg .u16 %rh<3>;\n"
761 " .reg .u32 %r<11>;\n"
762 " .reg .u64 %rd<16>;\n"
763 " .reg .f32 %f<17>;\n"
764 " .reg .pred %p<6>;\n"
766 "$LDWbegin_reduce_float_32_true:\n"
768 " cvt.u32.u16 %r1, %ctaid.x;\n"
769 " mul24.lo.u32 %r2, %r1, 64;\n"
770 " cvt.u32.u16 %r3, %tid.x;\n"
771 " add.u32 %r4, %r2, %r3;\n"
772 " mov.s32 %r5, %r4;\n"
773 " ld.param.u32 %r6, [__cudaparm_reduce_float_32_true_n];\n"
774 " setp.ge.u32 %p1, %r4, %r6;\n"
775 " @%p1 bra $Lt_7_15874;\n"
776 " mov.u16 %rh1, %nctaid.x;\n"
777 " mul.wide.u16 %r7, %rh1, 64;\n"
778 " cvt.s64.u32 %rd1, %r7;\n"
779 " ld.param.u64 %rd2, [__cudaparm_reduce_float_32_true_g_idata];\n"
780 " cvt.u64.u32 %rd3, %r4;\n"
781 " mul.wide.u32 %rd4, %r4, 4;\n"
782 " add.u64 %rd5, %rd2, %rd4;\n"
783 " mul.wide.u32 %rd6, %r7, 4;\n"
784 " mov.f32 %f1, 0f00000000; // 0\n"
786 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
788 " ld.global.f32 %f2, [%rd5+0];\n"
789 " add.f32 %f3, %f2, %f1;\n"
791 " ld.global.f32 %f4, [%rd5+128];\n"
792 " add.f32 %f1, %f4, %f3;\n"
793 " add.u32 %r5, %r7, %r5;\n"
794 " add.u64 %rd5, %rd5, %rd6;\n"
796 " ld.param.u32 %r6, [__cudaparm_reduce_float_32_true_n];\n"
798 " setp.lt.u32 %p2, %r5, %r6;\n"
799 " @%p2 bra $Lt_7_14338;\n"
800 " bra.uni $Lt_7_13826;\n"
802 " mov.f32 %f1, 0f00000000; // 0\n"
805 " mov.u64 %rd7, __smem;\n"
806 " cvt.u64.u32 %rd8, %r3;\n"
807 " mul.wide.u32 %rd9, %r3, 4;\n"
808 " add.u64 %rd10, %rd7, %rd9;\n"
809 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
812 " mov.u32 %r8, 31;\n"
813 " setp.gt.u32 %p3, %r3, %r8;\n"
814 " @%p3 bra $Lt_7_14850;\n"
816 " ld.volatile.shared.f32 %f5, [%rd10+64];\n"
817 " add.f32 %f6, %f5, %f1;\n"
818 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
820 " ld.volatile.shared.f32 %f7, [%rd10+32];\n"
821 " add.f32 %f8, %f7, %f6;\n"
822 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
824 " ld.volatile.shared.f32 %f9, [%rd10+16];\n"
825 " add.f32 %f10, %f9, %f8;\n"
826 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
828 " ld.volatile.shared.f32 %f11, [%rd10+8];\n"
829 " add.f32 %f12, %f11, %f10;\n"
830 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
832 " ld.volatile.shared.f32 %f13, [%rd10+4];\n"
833 " add.f32 %f14, %f13, %f12;\n"
834 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
838 " setp.ne.u32 %p4, %r3, %r9;\n"
839 " @%p4 bra $Lt_7_15362;\n"
841 " ld.shared.f32 %f15, [__smem+0];\n"
842 " ld.param.u64 %rd11, [__cudaparm_reduce_float_32_true_g_odata];\n"
843 " cvt.u64.u32 %rd12, %r1;\n"
844 " mul.wide.u32 %rd13, %r1, 4;\n"
845 " add.u64 %rd14, %rd11, %rd13;\n"
846 " st.global.f32 [%rd14+0], %f15;\n"
850 "$LDWend_reduce_float_32_true:\n"
851 " } // reduce_float_32_true\n"
853 " .entry reduce_float_64_true (\n"
854 " .param .u64 __cudaparm_reduce_float_64_true_g_idata,\n"
855 " .param .u64 __cudaparm_reduce_float_64_true_g_odata,\n"
856 " .param .u32 __cudaparm_reduce_float_64_true_n)\n"
858 " .reg .u16 %rh<3>;\n"
859 " .reg .u32 %r<11>;\n"
860 " .reg .u64 %rd<16>;\n"
861 " .reg .f32 %f<19>;\n"
862 " .reg .pred %p<6>;\n"
864 "$LDWbegin_reduce_float_64_true:\n"
866 " cvt.u32.u16 %r1, %ctaid.x;\n"
867 " mul24.lo.u32 %r2, %r1, 128;\n"
868 " cvt.u32.u16 %r3, %tid.x;\n"
869 " add.u32 %r4, %r2, %r3;\n"
870 " mov.s32 %r5, %r4;\n"
871 " ld.param.u32 %r6, [__cudaparm_reduce_float_64_true_n];\n"
872 " setp.ge.u32 %p1, %r4, %r6;\n"
873 " @%p1 bra $Lt_8_15618;\n"
874 " mov.u16 %rh1, %nctaid.x;\n"
875 " mul.wide.u16 %r7, %rh1, 128;\n"
876 " cvt.s64.u32 %rd1, %r7;\n"
877 " ld.param.u64 %rd2, [__cudaparm_reduce_float_64_true_g_idata];\n"
878 " cvt.u64.u32 %rd3, %r4;\n"
879 " mul.wide.u32 %rd4, %r4, 4;\n"
880 " add.u64 %rd5, %rd2, %rd4;\n"
881 " mul.wide.u32 %rd6, %r7, 4;\n"
882 " mov.f32 %f1, 0f00000000; // 0\n"
884 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
886 " ld.global.f32 %f2, [%rd5+0];\n"
887 " add.f32 %f3, %f2, %f1;\n"
889 " ld.global.f32 %f4, [%rd5+256];\n"
890 " add.f32 %f1, %f4, %f3;\n"
891 " add.u32 %r5, %r7, %r5;\n"
892 " add.u64 %rd5, %rd5, %rd6;\n"
894 " ld.param.u32 %r6, [__cudaparm_reduce_float_64_true_n];\n"
896 " setp.lt.u32 %p2, %r5, %r6;\n"
897 " @%p2 bra $Lt_8_14082;\n"
898 " bra.uni $Lt_8_13570;\n"
900 " mov.f32 %f1, 0f00000000; // 0\n"
903 " mov.u64 %rd7, __smem;\n"
904 " cvt.u64.u32 %rd8, %r3;\n"
905 " mul.wide.u32 %rd9, %r3, 4;\n"
906 " add.u64 %rd10, %rd7, %rd9;\n"
907 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
910 " mov.u32 %r8, 31;\n"
911 " setp.gt.u32 %p3, %r3, %r8;\n"
912 " @%p3 bra $Lt_8_14594;\n"
914 " ld.volatile.shared.f32 %f5, [%rd10+128];\n"
915 " add.f32 %f6, %f5, %f1;\n"
916 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
918 " ld.volatile.shared.f32 %f7, [%rd10+64];\n"
919 " add.f32 %f8, %f7, %f6;\n"
920 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
922 " ld.volatile.shared.f32 %f9, [%rd10+32];\n"
923 " add.f32 %f10, %f9, %f8;\n"
924 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
926 " ld.volatile.shared.f32 %f11, [%rd10+16];\n"
927 " add.f32 %f12, %f11, %f10;\n"
928 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
930 " ld.volatile.shared.f32 %f13, [%rd10+8];\n"
931 " add.f32 %f14, %f13, %f12;\n"
932 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
934 " ld.volatile.shared.f32 %f15, [%rd10+4];\n"
935 " add.f32 %f16, %f15, %f14;\n"
936 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
940 " setp.ne.u32 %p4, %r3, %r9;\n"
941 " @%p4 bra $Lt_8_15106;\n"
943 " ld.shared.f32 %f17, [__smem+0];\n"
944 " ld.param.u64 %rd11, [__cudaparm_reduce_float_64_true_g_odata];\n"
945 " cvt.u64.u32 %rd12, %r1;\n"
946 " mul.wide.u32 %rd13, %r1, 4;\n"
947 " add.u64 %rd14, %rd11, %rd13;\n"
948 " st.global.f32 [%rd14+0], %f17;\n"
952 "$LDWend_reduce_float_64_true:\n"
953 " } // reduce_float_64_true\n"
955 " .entry reduce_float_128_true (\n"
956 " .param .u64 __cudaparm_reduce_float_128_true_g_idata,\n"
957 " .param .u64 __cudaparm_reduce_float_128_true_g_odata,\n"
958 " .param .u32 __cudaparm_reduce_float_128_true_n)\n"
960 " .reg .u16 %rh<3>;\n"
961 " .reg .u32 %r<12>;\n"
962 " .reg .u64 %rd<16>;\n"
963 " .reg .f32 %f<21>;\n"
964 " .reg .pred %p<7>;\n"
966 "$LDWbegin_reduce_float_128_true:\n"
968 " cvt.u32.u16 %r1, %ctaid.x;\n"
969 " mul.lo.u32 %r2, %r1, 256;\n"
970 " cvt.u32.u16 %r3, %tid.x;\n"
971 " add.u32 %r4, %r2, %r3;\n"
972 " mov.s32 %r5, %r4;\n"
973 " ld.param.u32 %r6, [__cudaparm_reduce_float_128_true_n];\n"
974 " setp.ge.u32 %p1, %r4, %r6;\n"
975 " @%p1 bra $Lt_9_15874;\n"
976 " mov.u16 %rh1, %nctaid.x;\n"
977 " mul.wide.u16 %r7, %rh1, 256;\n"
978 " cvt.s64.u32 %rd1, %r7;\n"
979 " ld.param.u64 %rd2, [__cudaparm_reduce_float_128_true_g_idata];\n"
980 " cvt.u64.u32 %rd3, %r4;\n"
981 " mul.wide.u32 %rd4, %r4, 4;\n"
982 " add.u64 %rd5, %rd2, %rd4;\n"
983 " mul.wide.u32 %rd6, %r7, 4;\n"
984 " mov.f32 %f1, 0f00000000; // 0\n"
986 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
988 " ld.global.f32 %f2, [%rd5+0];\n"
989 " add.f32 %f3, %f2, %f1;\n"
991 " ld.global.f32 %f4, [%rd5+512];\n"
992 " add.f32 %f1, %f4, %f3;\n"
993 " add.u32 %r5, %r7, %r5;\n"
994 " add.u64 %rd5, %rd5, %rd6;\n"
996 " ld.param.u32 %r6, [__cudaparm_reduce_float_128_true_n];\n"
998 " setp.lt.u32 %p2, %r5, %r6;\n"
999 " @%p2 bra $Lt_9_13826;\n"
1000 " bra.uni $Lt_9_13314;\n"
1002 " mov.f32 %f1, 0f00000000; // 0\n"
1005 " mov.f32 %f5, %f1;\n"
1006 " mov.f32 %f6, %f5;\n"
1008 " mov.u64 %rd7, __smem;\n"
1009 " cvt.u64.u32 %rd8, %r3;\n"
1010 " mul.wide.u32 %rd9, %r3, 4;\n"
1011 " add.u64 %rd10, %rd7, %rd9;\n"
1012 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1015 " mov.u32 %r8, 63;\n"
1016 " setp.gt.u32 %p3, %r3, %r8;\n"
1017 " @%p3 bra $Lt_9_14338;\n"
1019 " ld.volatile.shared.f32 %f7, [%rd10+256];\n"
1020 " add.f32 %f6, %f7, %f5;\n"
1021 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1024 " mov.u32 %r9, 31;\n"
1025 " setp.gt.u32 %p4, %r3, %r9;\n"
1026 " @%p4 bra $Lt_9_14850;\n"
1028 " ld.volatile.shared.f32 %f8, [%rd10+128];\n"
1029 " add.f32 %f9, %f8, %f6;\n"
1030 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1032 " ld.volatile.shared.f32 %f10, [%rd10+64];\n"
1033 " add.f32 %f11, %f10, %f9;\n"
1034 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1036 " ld.volatile.shared.f32 %f12, [%rd10+32];\n"
1037 " add.f32 %f13, %f12, %f11;\n"
1038 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1040 " ld.volatile.shared.f32 %f14, [%rd10+16];\n"
1041 " add.f32 %f15, %f14, %f13;\n"
1042 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
1044 " ld.volatile.shared.f32 %f16, [%rd10+8];\n"
1045 " add.f32 %f17, %f16, %f15;\n"
1046 " st.volatile.shared.f32 [%rd10+0], %f17;\n"
1048 " ld.volatile.shared.f32 %f18, [%rd10+4];\n"
1049 " add.f32 %f6, %f18, %f17;\n"
1050 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1053 " mov.u32 %r10, 0;\n"
1054 " setp.ne.u32 %p5, %r3, %r10;\n"
1055 " @%p5 bra $Lt_9_15362;\n"
1057 " ld.shared.f32 %f19, [__smem+0];\n"
1058 " ld.param.u64 %rd11, [__cudaparm_reduce_float_128_true_g_odata];\n"
1059 " cvt.u64.u32 %rd12, %r1;\n"
1060 " mul.wide.u32 %rd13, %r1, 4;\n"
1061 " add.u64 %rd14, %rd11, %rd13;\n"
1062 " st.global.f32 [%rd14+0], %f19;\n"
1066 "$LDWend_reduce_float_128_true:\n"
1067 " } // reduce_float_128_true\n"
1069 " .entry reduce_float_256_true (\n"
1070 " .param .u64 __cudaparm_reduce_float_256_true_g_idata,\n"
1071 " .param .u64 __cudaparm_reduce_float_256_true_g_odata,\n"
1072 " .param .u32 __cudaparm_reduce_float_256_true_n)\n"
1074 " .reg .u16 %rh<3>;\n"
1075 " .reg .u32 %r<13>;\n"
1076 " .reg .u64 %rd<16>;\n"
1077 " .reg .f32 %f<22>;\n"
1078 " .reg .pred %p<8>;\n"
1080 "$LDWbegin_reduce_float_256_true:\n"
1082 " cvt.u32.u16 %r1, %ctaid.x;\n"
1083 " mul.lo.u32 %r2, %r1, 512;\n"
1084 " cvt.u32.u16 %r3, %tid.x;\n"
1085 " add.u32 %r4, %r2, %r3;\n"
1086 " mov.s32 %r5, %r4;\n"
1087 " ld.param.u32 %r6, [__cudaparm_reduce_float_256_true_n];\n"
1088 " setp.ge.u32 %p1, %r4, %r6;\n"
1089 " @%p1 bra $Lt_10_16130;\n"
1090 " mov.u16 %rh1, %nctaid.x;\n"
1091 " mul.wide.u16 %r7, %rh1, 512;\n"
1092 " cvt.s64.u32 %rd1, %r7;\n"
1093 " ld.param.u64 %rd2, [__cudaparm_reduce_float_256_true_g_idata];\n"
1094 " cvt.u64.u32 %rd3, %r4;\n"
1095 " mul.wide.u32 %rd4, %r4, 4;\n"
1096 " add.u64 %rd5, %rd2, %rd4;\n"
1097 " mul.wide.u32 %rd6, %r7, 4;\n"
1098 " mov.f32 %f1, 0f00000000; // 0\n"
1100 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1102 " ld.global.f32 %f2, [%rd5+0];\n"
1103 " add.f32 %f3, %f2, %f1;\n"
1105 " ld.global.f32 %f4, [%rd5+1024];\n"
1106 " add.f32 %f1, %f4, %f3;\n"
1107 " add.u32 %r5, %r7, %r5;\n"
1108 " add.u64 %rd5, %rd5, %rd6;\n"
1110 " ld.param.u32 %r6, [__cudaparm_reduce_float_256_true_n];\n"
1112 " setp.lt.u32 %p2, %r5, %r6;\n"
1113 " @%p2 bra $Lt_10_13570;\n"
1114 " bra.uni $Lt_10_13058;\n"
1116 " mov.f32 %f1, 0f00000000; // 0\n"
1119 " mov.f32 %f5, %f1;\n"
1120 " mov.f32 %f6, %f5;\n"
1122 " mov.u64 %rd7, __smem;\n"
1123 " cvt.u64.u32 %rd8, %r3;\n"
1124 " mul.wide.u32 %rd9, %r3, 4;\n"
1125 " add.u64 %rd10, %rd7, %rd9;\n"
1126 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1129 " mov.u32 %r8, 127;\n"
1130 " setp.gt.u32 %p3, %r3, %r8;\n"
1131 " @%p3 bra $Lt_10_14082;\n"
1133 " ld.volatile.shared.f32 %f7, [%rd10+512];\n"
1134 " add.f32 %f6, %f7, %f5;\n"
1135 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1138 " mov.u32 %r9, 63;\n"
1139 " setp.gt.u32 %p4, %r3, %r9;\n"
1140 " @%p4 bra $Lt_10_14594;\n"
1142 " ld.volatile.shared.f32 %f8, [%rd10+256];\n"
1143 " add.f32 %f6, %f8, %f6;\n"
1144 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1147 " mov.u32 %r10, 31;\n"
1148 " setp.gt.u32 %p5, %r3, %r10;\n"
1149 " @%p5 bra $Lt_10_15106;\n"
1151 " ld.volatile.shared.f32 %f9, [%rd10+128];\n"
1152 " add.f32 %f10, %f9, %f6;\n"
1153 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
1155 " ld.volatile.shared.f32 %f11, [%rd10+64];\n"
1156 " add.f32 %f12, %f11, %f10;\n"
1157 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
1159 " ld.volatile.shared.f32 %f13, [%rd10+32];\n"
1160 " add.f32 %f14, %f13, %f12;\n"
1161 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
1163 " ld.volatile.shared.f32 %f15, [%rd10+16];\n"
1164 " add.f32 %f16, %f15, %f14;\n"
1165 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
1167 " ld.volatile.shared.f32 %f17, [%rd10+8];\n"
1168 " add.f32 %f18, %f17, %f16;\n"
1169 " st.volatile.shared.f32 [%rd10+0], %f18;\n"
1171 " ld.volatile.shared.f32 %f19, [%rd10+4];\n"
1172 " add.f32 %f6, %f19, %f18;\n"
1173 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1176 " mov.u32 %r11, 0;\n"
1177 " setp.ne.u32 %p6, %r3, %r11;\n"
1178 " @%p6 bra $Lt_10_15618;\n"
1180 " ld.shared.f32 %f20, [__smem+0];\n"
1181 " ld.param.u64 %rd11, [__cudaparm_reduce_float_256_true_g_odata];\n"
1182 " cvt.u64.u32 %rd12, %r1;\n"
1183 " mul.wide.u32 %rd13, %r1, 4;\n"
1184 " add.u64 %rd14, %rd11, %rd13;\n"
1185 " st.global.f32 [%rd14+0], %f20;\n"
1189 "$LDWend_reduce_float_256_true:\n"
1190 " } // reduce_float_256_true\n"
1192 " .entry reduce_float_512_true (\n"
1193 " .param .u64 __cudaparm_reduce_float_512_true_g_idata,\n"
1194 " .param .u64 __cudaparm_reduce_float_512_true_g_odata,\n"
1195 " .param .u32 __cudaparm_reduce_float_512_true_n)\n"
1197 " .reg .u16 %rh<3>;\n"
1198 " .reg .u32 %r<14>;\n"
1199 " .reg .u64 %rd<16>;\n"
1200 " .reg .f32 %f<23>;\n"
1201 " .reg .pred %p<9>;\n"
1203 "$LDWbegin_reduce_float_512_true:\n"
1205 " cvt.u32.u16 %r1, %ctaid.x;\n"
1206 " mul.lo.u32 %r2, %r1, 1024;\n"
1207 " cvt.u32.u16 %r3, %tid.x;\n"
1208 " add.u32 %r4, %r2, %r3;\n"
1209 " mov.s32 %r5, %r4;\n"
1210 " ld.param.u32 %r6, [__cudaparm_reduce_float_512_true_n];\n"
1211 " setp.ge.u32 %p1, %r4, %r6;\n"
1212 " @%p1 bra $Lt_11_16386;\n"
1213 " mov.u16 %rh1, %nctaid.x;\n"
1214 " mul.wide.u16 %r7, %rh1, 1024;\n"
1215 " cvt.s64.u32 %rd1, %r7;\n"
1216 " ld.param.u64 %rd2, [__cudaparm_reduce_float_512_true_g_idata];\n"
1217 " cvt.u64.u32 %rd3, %r4;\n"
1218 " mul.wide.u32 %rd4, %r4, 4;\n"
1219 " add.u64 %rd5, %rd2, %rd4;\n"
1220 " mul.wide.u32 %rd6, %r7, 4;\n"
1221 " mov.f32 %f1, 0f00000000; // 0\n"
1223 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1225 " ld.global.f32 %f2, [%rd5+0];\n"
1226 " add.f32 %f3, %f2, %f1;\n"
1228 " ld.global.f32 %f4, [%rd5+2048];\n"
1229 " add.f32 %f1, %f4, %f3;\n"
1230 " add.u32 %r5, %r7, %r5;\n"
1231 " add.u64 %rd5, %rd5, %rd6;\n"
1233 " ld.param.u32 %r6, [__cudaparm_reduce_float_512_true_n];\n"
1235 " setp.lt.u32 %p2, %r5, %r6;\n"
1236 " @%p2 bra $Lt_11_13314;\n"
1237 " bra.uni $Lt_11_12802;\n"
1239 " mov.f32 %f1, 0f00000000; // 0\n"
1242 " mov.f32 %f5, %f1;\n"
1243 " mov.f32 %f6, %f5;\n"
1245 " mov.u64 %rd7, __smem;\n"
1246 " cvt.u64.u32 %rd8, %r3;\n"
1247 " mul.wide.u32 %rd9, %r3, 4;\n"
1248 " add.u64 %rd10, %rd7, %rd9;\n"
1249 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1252 " mov.u32 %r8, 255;\n"
1253 " setp.gt.u32 %p3, %r3, %r8;\n"
1254 " @%p3 bra $Lt_11_13826;\n"
1256 " ld.volatile.shared.f32 %f7, [%rd10+1024];\n"
1257 " add.f32 %f6, %f7, %f5;\n"
1258 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1261 " mov.u32 %r9, 127;\n"
1262 " setp.gt.u32 %p4, %r3, %r9;\n"
1263 " @%p4 bra $Lt_11_14338;\n"
1265 " ld.volatile.shared.f32 %f8, [%rd10+512];\n"
1266 " add.f32 %f6, %f8, %f6;\n"
1267 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1270 " mov.u32 %r10, 63;\n"
1271 " setp.gt.u32 %p5, %r3, %r10;\n"
1272 " @%p5 bra $Lt_11_14850;\n"
1274 " ld.volatile.shared.f32 %f9, [%rd10+256];\n"
1275 " add.f32 %f6, %f9, %f6;\n"
1276 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1279 " mov.u32 %r11, 31;\n"
1280 " setp.gt.u32 %p6, %r3, %r11;\n"
1281 " @%p6 bra $Lt_11_15362;\n"
1283 " ld.volatile.shared.f32 %f10, [%rd10+128];\n"
1284 " add.f32 %f11, %f10, %f6;\n"
1285 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1287 " ld.volatile.shared.f32 %f12, [%rd10+64];\n"
1288 " add.f32 %f13, %f12, %f11;\n"
1289 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1291 " ld.volatile.shared.f32 %f14, [%rd10+32];\n"
1292 " add.f32 %f15, %f14, %f13;\n"
1293 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
1295 " ld.volatile.shared.f32 %f16, [%rd10+16];\n"
1296 " add.f32 %f17, %f16, %f15;\n"
1297 " st.volatile.shared.f32 [%rd10+0], %f17;\n"
1299 " ld.volatile.shared.f32 %f18, [%rd10+8];\n"
1300 " add.f32 %f19, %f18, %f17;\n"
1301 " st.volatile.shared.f32 [%rd10+0], %f19;\n"
1303 " ld.volatile.shared.f32 %f20, [%rd10+4];\n"
1304 " add.f32 %f6, %f20, %f19;\n"
1305 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1308 " mov.u32 %r12, 0;\n"
1309 " setp.ne.u32 %p7, %r3, %r12;\n"
1310 " @%p7 bra $Lt_11_15874;\n"
1312 " ld.shared.f32 %f21, [__smem+0];\n"
1313 " ld.param.u64 %rd11, [__cudaparm_reduce_float_512_true_g_odata];\n"
1314 " cvt.u64.u32 %rd12, %r1;\n"
1315 " mul.wide.u32 %rd13, %r1, 4;\n"
1316 " add.u64 %rd14, %rd11, %rd13;\n"
1317 " st.global.f32 [%rd14+0], %f21;\n"
1321 "$LDWend_reduce_float_512_true:\n"
1322 " } // reduce_float_512_true\n"
1324 " .entry reduce_float_1_false (\n"
1325 " .param .u64 __cudaparm_reduce_float_1_false_g_idata,\n"
1326 " .param .u64 __cudaparm_reduce_float_1_false_g_odata,\n"
1327 " .param .u32 __cudaparm_reduce_float_1_false_n)\n"
1329 " .reg .u16 %rh<3>;\n"
1330 " .reg .u32 %r<11>;\n"
1331 " .reg .u64 %rd<16>;\n"
1332 " .reg .f32 %f<6>;\n"
1333 " .reg .pred %p<6>;\n"
1335 "$LDWbegin_reduce_float_1_false:\n"
1337 " cvt.u32.u16 %r1, %ctaid.x;\n"
1338 " mul24.lo.u32 %r2, %r1, 2;\n"
1339 " cvt.u32.u16 %r3, %tid.x;\n"
1340 " add.u32 %r4, %r2, %r3;\n"
1341 " ld.param.u32 %r5, [__cudaparm_reduce_float_1_false_n];\n"
1342 " setp.ge.u32 %p1, %r4, %r5;\n"
1343 " @%p1 bra $Lt_12_17154;\n"
1344 " add.u32 %r6, %r4, 1;\n"
1345 " ld.param.u32 %r5, [__cudaparm_reduce_float_1_false_n];\n"
1346 " add.u32 %r7, %r5, 1;\n"
1347 " mov.u16 %rh1, %nctaid.x;\n"
1348 " mul.wide.u16 %r8, %rh1, 2;\n"
1349 " cvt.s64.u32 %rd1, %r8;\n"
1350 " ld.param.u64 %rd2, [__cudaparm_reduce_float_1_false_g_idata];\n"
1351 " cvt.u64.u32 %rd3, %r4;\n"
1352 " mul.wide.u32 %rd4, %r4, 4;\n"
1353 " add.u64 %rd5, %rd2, %rd4;\n"
1354 " mul.wide.u32 %rd6, %r8, 4;\n"
1355 " mov.f32 %f1, 0f00000000; // 0\n"
1357 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1359 " ld.global.f32 %f2, [%rd5+0];\n"
1360 " add.f32 %f1, %f2, %f1;\n"
1362 " ld.param.u32 %r5, [__cudaparm_reduce_float_1_false_n];\n"
1364 " setp.ge.u32 %p2, %r6, %r5;\n"
1365 " @%p2 bra $Lt_12_15874;\n"
1366 " //<loop> Part of loop body line 181, head labeled $Lt_12_15618\n"
1368 " ld.global.f32 %f3, [%rd5+4];\n"
1369 " add.f32 %f1, %f3, %f1;\n"
1371 " //<loop> Part of loop body line 181, head labeled $Lt_12_15618\n"
1372 " add.u32 %r6, %r6, %r8;\n"
1373 " add.u64 %rd5, %rd5, %rd6;\n"
1374 " setp.lt.u32 %p3, %r6, %r7;\n"
1375 " @%p3 bra $Lt_12_15618;\n"
1376 " bra.uni $Lt_12_15106;\n"
1378 " mov.f32 %f1, 0f00000000; // 0\n"
1381 " mov.u64 %rd7, __smem;\n"
1382 " cvt.u64.u32 %rd8, %r3;\n"
1383 " mul.wide.u32 %rd9, %r3, 4;\n"
1384 " add.u64 %rd10, %rd7, %rd9;\n"
1385 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1389 " mov.u32 %r9, 0;\n"
1390 " setp.ne.u32 %p4, %r3, %r9;\n"
1391 " @%p4 bra $Lt_12_16642;\n"
1393 " ld.shared.f32 %f4, [__smem+0];\n"
1394 " ld.param.u64 %rd11, [__cudaparm_reduce_float_1_false_g_odata];\n"
1395 " cvt.u64.u32 %rd12, %r1;\n"
1396 " mul.wide.u32 %rd13, %r1, 4;\n"
1397 " add.u64 %rd14, %rd11, %rd13;\n"
1398 " st.global.f32 [%rd14+0], %f4;\n"
1402 "$LDWend_reduce_float_1_false:\n"
1403 " } // reduce_float_1_false\n"
1405 " .entry reduce_float_2_false (\n"
1406 " .param .u64 __cudaparm_reduce_float_2_false_g_idata,\n"
1407 " .param .u64 __cudaparm_reduce_float_2_false_g_odata,\n"
1408 " .param .u32 __cudaparm_reduce_float_2_false_n)\n"
1410 " .reg .u16 %rh<3>;\n"
1411 " .reg .u32 %r<12>;\n"
1412 " .reg .u64 %rd<16>;\n"
1413 " .reg .f32 %f<8>;\n"
1414 " .reg .pred %p<7>;\n"
1416 "$LDWbegin_reduce_float_2_false:\n"
1418 " cvt.u32.u16 %r1, %ctaid.x;\n"
1419 " mul24.lo.u32 %r2, %r1, 4;\n"
1420 " cvt.u32.u16 %r3, %tid.x;\n"
1421 " add.u32 %r4, %r2, %r3;\n"
1422 " ld.param.u32 %r5, [__cudaparm_reduce_float_2_false_n];\n"
1423 " setp.ge.u32 %p1, %r4, %r5;\n"
1424 " @%p1 bra $Lt_13_17410;\n"
1425 " add.u32 %r6, %r4, 2;\n"
1426 " ld.param.u32 %r5, [__cudaparm_reduce_float_2_false_n];\n"
1427 " add.u32 %r7, %r5, 2;\n"
1428 " mov.u16 %rh1, %nctaid.x;\n"
1429 " mul.wide.u16 %r8, %rh1, 4;\n"
1430 " cvt.s64.u32 %rd1, %r8;\n"
1431 " ld.param.u64 %rd2, [__cudaparm_reduce_float_2_false_g_idata];\n"
1432 " cvt.u64.u32 %rd3, %r4;\n"
1433 " mul.wide.u32 %rd4, %r4, 4;\n"
1434 " add.u64 %rd5, %rd2, %rd4;\n"
1435 " mul.wide.u32 %rd6, %r8, 4;\n"
1436 " mov.f32 %f1, 0f00000000; // 0\n"
1438 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1440 " ld.global.f32 %f2, [%rd5+0];\n"
1441 " add.f32 %f1, %f2, %f1;\n"
1443 " ld.param.u32 %r5, [__cudaparm_reduce_float_2_false_n];\n"
1445 " setp.ge.u32 %p2, %r6, %r5;\n"
1446 " @%p2 bra $Lt_13_15618;\n"
1447 " //<loop> Part of loop body line 181, head labeled $Lt_13_15362\n"
1449 " ld.global.f32 %f3, [%rd5+8];\n"
1450 " add.f32 %f1, %f3, %f1;\n"
1452 " //<loop> Part of loop body line 181, head labeled $Lt_13_15362\n"
1453 " add.u32 %r6, %r6, %r8;\n"
1454 " add.u64 %rd5, %rd5, %rd6;\n"
1455 " setp.lt.u32 %p3, %r6, %r7;\n"
1456 " @%p3 bra $Lt_13_15362;\n"
1457 " bra.uni $Lt_13_14850;\n"
1459 " mov.f32 %f1, 0f00000000; // 0\n"
1462 " mov.u64 %rd7, __smem;\n"
1463 " cvt.u64.u32 %rd8, %r3;\n"
1464 " mul.wide.u32 %rd9, %r3, 4;\n"
1465 " add.u64 %rd10, %rd7, %rd9;\n"
1466 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1469 " mov.u32 %r9, 31;\n"
1470 " setp.gt.u32 %p4, %r3, %r9;\n"
1471 " @%p4 bra $Lt_13_16386;\n"
1473 " ld.volatile.shared.f32 %f4, [%rd10+4];\n"
1474 " add.f32 %f5, %f4, %f1;\n"
1475 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1478 " mov.u32 %r10, 0;\n"
1479 " setp.ne.u32 %p5, %r3, %r10;\n"
1480 " @%p5 bra $Lt_13_16898;\n"
1482 " ld.shared.f32 %f6, [__smem+0];\n"
1483 " ld.param.u64 %rd11, [__cudaparm_reduce_float_2_false_g_odata];\n"
1484 " cvt.u64.u32 %rd12, %r1;\n"
1485 " mul.wide.u32 %rd13, %r1, 4;\n"
1486 " add.u64 %rd14, %rd11, %rd13;\n"
1487 " st.global.f32 [%rd14+0], %f6;\n"
1491 "$LDWend_reduce_float_2_false:\n"
1492 " } // reduce_float_2_false\n"
1494 " .entry reduce_float_4_false (\n"
1495 " .param .u64 __cudaparm_reduce_float_4_false_g_idata,\n"
1496 " .param .u64 __cudaparm_reduce_float_4_false_g_odata,\n"
1497 " .param .u32 __cudaparm_reduce_float_4_false_n)\n"
1499 " .reg .u16 %rh<3>;\n"
1500 " .reg .u32 %r<12>;\n"
1501 " .reg .u64 %rd<16>;\n"
1502 " .reg .f32 %f<10>;\n"
1503 " .reg .pred %p<7>;\n"
1505 "$LDWbegin_reduce_float_4_false:\n"
1507 " cvt.u32.u16 %r1, %ctaid.x;\n"
1508 " mul24.lo.u32 %r2, %r1, 8;\n"
1509 " cvt.u32.u16 %r3, %tid.x;\n"
1510 " add.u32 %r4, %r2, %r3;\n"
1511 " ld.param.u32 %r5, [__cudaparm_reduce_float_4_false_n];\n"
1512 " setp.ge.u32 %p1, %r4, %r5;\n"
1513 " @%p1 bra $Lt_14_17154;\n"
1514 " add.u32 %r6, %r4, 4;\n"
1515 " ld.param.u32 %r5, [__cudaparm_reduce_float_4_false_n];\n"
1516 " add.u32 %r7, %r5, 4;\n"
1517 " mov.u16 %rh1, %nctaid.x;\n"
1518 " mul.wide.u16 %r8, %rh1, 8;\n"
1519 " cvt.s64.u32 %rd1, %r8;\n"
1520 " ld.param.u64 %rd2, [__cudaparm_reduce_float_4_false_g_idata];\n"
1521 " cvt.u64.u32 %rd3, %r4;\n"
1522 " mul.wide.u32 %rd4, %r4, 4;\n"
1523 " add.u64 %rd5, %rd2, %rd4;\n"
1524 " mul.wide.u32 %rd6, %r8, 4;\n"
1525 " mov.f32 %f1, 0f00000000; // 0\n"
1527 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1529 " ld.global.f32 %f2, [%rd5+0];\n"
1530 " add.f32 %f1, %f2, %f1;\n"
1532 " ld.param.u32 %r5, [__cudaparm_reduce_float_4_false_n];\n"
1534 " setp.ge.u32 %p2, %r6, %r5;\n"
1535 " @%p2 bra $Lt_14_15362;\n"
1536 " //<loop> Part of loop body line 181, head labeled $Lt_14_15106\n"
1538 " ld.global.f32 %f3, [%rd5+16];\n"
1539 " add.f32 %f1, %f3, %f1;\n"
1541 " //<loop> Part of loop body line 181, head labeled $Lt_14_15106\n"
1542 " add.u32 %r6, %r6, %r8;\n"
1543 " add.u64 %rd5, %rd5, %rd6;\n"
1544 " setp.lt.u32 %p3, %r6, %r7;\n"
1545 " @%p3 bra $Lt_14_15106;\n"
1546 " bra.uni $Lt_14_14594;\n"
1548 " mov.f32 %f1, 0f00000000; // 0\n"
1551 " mov.u64 %rd7, __smem;\n"
1552 " cvt.u64.u32 %rd8, %r3;\n"
1553 " mul.wide.u32 %rd9, %r3, 4;\n"
1554 " add.u64 %rd10, %rd7, %rd9;\n"
1555 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1558 " mov.u32 %r9, 31;\n"
1559 " setp.gt.u32 %p4, %r3, %r9;\n"
1560 " @%p4 bra $Lt_14_16130;\n"
1562 " ld.volatile.shared.f32 %f4, [%rd10+8];\n"
1563 " add.f32 %f5, %f4, %f1;\n"
1564 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1566 " ld.volatile.shared.f32 %f6, [%rd10+4];\n"
1567 " add.f32 %f7, %f6, %f5;\n"
1568 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1571 " mov.u32 %r10, 0;\n"
1572 " setp.ne.u32 %p5, %r3, %r10;\n"
1573 " @%p5 bra $Lt_14_16642;\n"
1575 " ld.shared.f32 %f8, [__smem+0];\n"
1576 " ld.param.u64 %rd11, [__cudaparm_reduce_float_4_false_g_odata];\n"
1577 " cvt.u64.u32 %rd12, %r1;\n"
1578 " mul.wide.u32 %rd13, %r1, 4;\n"
1579 " add.u64 %rd14, %rd11, %rd13;\n"
1580 " st.global.f32 [%rd14+0], %f8;\n"
1584 "$LDWend_reduce_float_4_false:\n"
1585 " } // reduce_float_4_false\n"
1587 " .entry reduce_float_8_false (\n"
1588 " .param .u64 __cudaparm_reduce_float_8_false_g_idata,\n"
1589 " .param .u64 __cudaparm_reduce_float_8_false_g_odata,\n"
1590 " .param .u32 __cudaparm_reduce_float_8_false_n)\n"
1592 " .reg .u16 %rh<3>;\n"
1593 " .reg .u32 %r<12>;\n"
1594 " .reg .u64 %rd<16>;\n"
1595 " .reg .f32 %f<12>;\n"
1596 " .reg .pred %p<7>;\n"
1598 "$LDWbegin_reduce_float_8_false:\n"
1600 " cvt.u32.u16 %r1, %ctaid.x;\n"
1601 " mul24.lo.u32 %r2, %r1, 16;\n"
1602 " cvt.u32.u16 %r3, %tid.x;\n"
1603 " add.u32 %r4, %r2, %r3;\n"
1604 " ld.param.u32 %r5, [__cudaparm_reduce_float_8_false_n];\n"
1605 " setp.ge.u32 %p1, %r4, %r5;\n"
1606 " @%p1 bra $Lt_15_16898;\n"
1607 " add.u32 %r6, %r4, 8;\n"
1608 " ld.param.u32 %r5, [__cudaparm_reduce_float_8_false_n];\n"
1609 " add.u32 %r7, %r5, 8;\n"
1610 " mov.u16 %rh1, %nctaid.x;\n"
1611 " mul.wide.u16 %r8, %rh1, 16;\n"
1612 " cvt.s64.u32 %rd1, %r8;\n"
1613 " ld.param.u64 %rd2, [__cudaparm_reduce_float_8_false_g_idata];\n"
1614 " cvt.u64.u32 %rd3, %r4;\n"
1615 " mul.wide.u32 %rd4, %r4, 4;\n"
1616 " add.u64 %rd5, %rd2, %rd4;\n"
1617 " mul.wide.u32 %rd6, %r8, 4;\n"
1618 " mov.f32 %f1, 0f00000000; // 0\n"
1620 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1622 " ld.global.f32 %f2, [%rd5+0];\n"
1623 " add.f32 %f1, %f2, %f1;\n"
1625 " ld.param.u32 %r5, [__cudaparm_reduce_float_8_false_n];\n"
1627 " setp.ge.u32 %p2, %r6, %r5;\n"
1628 " @%p2 bra $Lt_15_15106;\n"
1629 " //<loop> Part of loop body line 181, head labeled $Lt_15_14850\n"
1631 " ld.global.f32 %f3, [%rd5+32];\n"
1632 " add.f32 %f1, %f3, %f1;\n"
1634 " //<loop> Part of loop body line 181, head labeled $Lt_15_14850\n"
1635 " add.u32 %r6, %r6, %r8;\n"
1636 " add.u64 %rd5, %rd5, %rd6;\n"
1637 " setp.lt.u32 %p3, %r6, %r7;\n"
1638 " @%p3 bra $Lt_15_14850;\n"
1639 " bra.uni $Lt_15_14338;\n"
1641 " mov.f32 %f1, 0f00000000; // 0\n"
1644 " mov.u64 %rd7, __smem;\n"
1645 " cvt.u64.u32 %rd8, %r3;\n"
1646 " mul.wide.u32 %rd9, %r3, 4;\n"
1647 " add.u64 %rd10, %rd7, %rd9;\n"
1648 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1651 " mov.u32 %r9, 31;\n"
1652 " setp.gt.u32 %p4, %r3, %r9;\n"
1653 " @%p4 bra $Lt_15_15874;\n"
1655 " ld.volatile.shared.f32 %f4, [%rd10+16];\n"
1656 " add.f32 %f5, %f4, %f1;\n"
1657 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1659 " ld.volatile.shared.f32 %f6, [%rd10+8];\n"
1660 " add.f32 %f7, %f6, %f5;\n"
1661 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1663 " ld.volatile.shared.f32 %f8, [%rd10+4];\n"
1664 " add.f32 %f9, %f8, %f7;\n"
1665 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1668 " mov.u32 %r10, 0;\n"
1669 " setp.ne.u32 %p5, %r3, %r10;\n"
1670 " @%p5 bra $Lt_15_16386;\n"
1672 " ld.shared.f32 %f10, [__smem+0];\n"
1673 " ld.param.u64 %rd11, [__cudaparm_reduce_float_8_false_g_odata];\n"
1674 " cvt.u64.u32 %rd12, %r1;\n"
1675 " mul.wide.u32 %rd13, %r1, 4;\n"
1676 " add.u64 %rd14, %rd11, %rd13;\n"
1677 " st.global.f32 [%rd14+0], %f10;\n"
1681 "$LDWend_reduce_float_8_false:\n"
1682 " } // reduce_float_8_false\n"
1684 " .entry reduce_float_16_false (\n"
1685 " .param .u64 __cudaparm_reduce_float_16_false_g_idata,\n"
1686 " .param .u64 __cudaparm_reduce_float_16_false_g_odata,\n"
1687 " .param .u32 __cudaparm_reduce_float_16_false_n)\n"
1689 " .reg .u16 %rh<3>;\n"
1690 " .reg .u32 %r<12>;\n"
1691 " .reg .u64 %rd<16>;\n"
1692 " .reg .f32 %f<14>;\n"
1693 " .reg .pred %p<7>;\n"
1695 "$LDWbegin_reduce_float_16_false:\n"
1697 " cvt.u32.u16 %r1, %ctaid.x;\n"
1698 " mul24.lo.u32 %r2, %r1, 32;\n"
1699 " cvt.u32.u16 %r3, %tid.x;\n"
1700 " add.u32 %r4, %r2, %r3;\n"
1701 " ld.param.u32 %r5, [__cudaparm_reduce_float_16_false_n];\n"
1702 " setp.ge.u32 %p1, %r4, %r5;\n"
1703 " @%p1 bra $Lt_16_16642;\n"
1704 " add.u32 %r6, %r4, 16;\n"
1705 " ld.param.u32 %r5, [__cudaparm_reduce_float_16_false_n];\n"
1706 " add.u32 %r7, %r5, 16;\n"
1707 " mov.u16 %rh1, %nctaid.x;\n"
1708 " mul.wide.u16 %r8, %rh1, 32;\n"
1709 " cvt.s64.u32 %rd1, %r8;\n"
1710 " ld.param.u64 %rd2, [__cudaparm_reduce_float_16_false_g_idata];\n"
1711 " cvt.u64.u32 %rd3, %r4;\n"
1712 " mul.wide.u32 %rd4, %r4, 4;\n"
1713 " add.u64 %rd5, %rd2, %rd4;\n"
1714 " mul.wide.u32 %rd6, %r8, 4;\n"
1715 " mov.f32 %f1, 0f00000000; // 0\n"
1717 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1719 " ld.global.f32 %f2, [%rd5+0];\n"
1720 " add.f32 %f1, %f2, %f1;\n"
1722 " ld.param.u32 %r5, [__cudaparm_reduce_float_16_false_n];\n"
1724 " setp.ge.u32 %p2, %r6, %r5;\n"
1725 " @%p2 bra $Lt_16_14850;\n"
1726 " //<loop> Part of loop body line 181, head labeled $Lt_16_14594\n"
1728 " ld.global.f32 %f3, [%rd5+64];\n"
1729 " add.f32 %f1, %f3, %f1;\n"
1731 " //<loop> Part of loop body line 181, head labeled $Lt_16_14594\n"
1732 " add.u32 %r6, %r6, %r8;\n"
1733 " add.u64 %rd5, %rd5, %rd6;\n"
1734 " setp.lt.u32 %p3, %r6, %r7;\n"
1735 " @%p3 bra $Lt_16_14594;\n"
1736 " bra.uni $Lt_16_14082;\n"
1738 " mov.f32 %f1, 0f00000000; // 0\n"
1741 " mov.u64 %rd7, __smem;\n"
1742 " cvt.u64.u32 %rd8, %r3;\n"
1743 " mul.wide.u32 %rd9, %r3, 4;\n"
1744 " add.u64 %rd10, %rd7, %rd9;\n"
1745 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1748 " mov.u32 %r9, 31;\n"
1749 " setp.gt.u32 %p4, %r3, %r9;\n"
1750 " @%p4 bra $Lt_16_15618;\n"
1752 " ld.volatile.shared.f32 %f4, [%rd10+32];\n"
1753 " add.f32 %f5, %f4, %f1;\n"
1754 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1756 " ld.volatile.shared.f32 %f6, [%rd10+16];\n"
1757 " add.f32 %f7, %f6, %f5;\n"
1758 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1760 " ld.volatile.shared.f32 %f8, [%rd10+8];\n"
1761 " add.f32 %f9, %f8, %f7;\n"
1762 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1764 " ld.volatile.shared.f32 %f10, [%rd10+4];\n"
1765 " add.f32 %f11, %f10, %f9;\n"
1766 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1769 " mov.u32 %r10, 0;\n"
1770 " setp.ne.u32 %p5, %r3, %r10;\n"
1771 " @%p5 bra $Lt_16_16130;\n"
1773 " ld.shared.f32 %f12, [__smem+0];\n"
1774 " ld.param.u64 %rd11, [__cudaparm_reduce_float_16_false_g_odata];\n"
1775 " cvt.u64.u32 %rd12, %r1;\n"
1776 " mul.wide.u32 %rd13, %r1, 4;\n"
1777 " add.u64 %rd14, %rd11, %rd13;\n"
1778 " st.global.f32 [%rd14+0], %f12;\n"
1782 "$LDWend_reduce_float_16_false:\n"
1783 " } // reduce_float_16_false\n"
1785 " .entry reduce_float_32_false (\n"
1786 " .param .u64 __cudaparm_reduce_float_32_false_g_idata,\n"
1787 " .param .u64 __cudaparm_reduce_float_32_false_g_odata,\n"
1788 " .param .u32 __cudaparm_reduce_float_32_false_n)\n"
1790 " .reg .u16 %rh<3>;\n"
1791 " .reg .u32 %r<12>;\n"
1792 " .reg .u64 %rd<16>;\n"
1793 " .reg .f32 %f<16>;\n"
1794 " .reg .pred %p<7>;\n"
1796 "$LDWbegin_reduce_float_32_false:\n"
1798 " cvt.u32.u16 %r1, %ctaid.x;\n"
1799 " mul24.lo.u32 %r2, %r1, 64;\n"
1800 " cvt.u32.u16 %r3, %tid.x;\n"
1801 " add.u32 %r4, %r2, %r3;\n"
1802 " ld.param.u32 %r5, [__cudaparm_reduce_float_32_false_n];\n"
1803 " setp.ge.u32 %p1, %r4, %r5;\n"
1804 " @%p1 bra $Lt_17_16386;\n"
1805 " add.u32 %r6, %r4, 32;\n"
1806 " ld.param.u32 %r5, [__cudaparm_reduce_float_32_false_n];\n"
1807 " add.u32 %r7, %r5, 32;\n"
1808 " mov.u16 %rh1, %nctaid.x;\n"
1809 " mul.wide.u16 %r8, %rh1, 64;\n"
1810 " cvt.s64.u32 %rd1, %r8;\n"
1811 " ld.param.u64 %rd2, [__cudaparm_reduce_float_32_false_g_idata];\n"
1812 " cvt.u64.u32 %rd3, %r4;\n"
1813 " mul.wide.u32 %rd4, %r4, 4;\n"
1814 " add.u64 %rd5, %rd2, %rd4;\n"
1815 " mul.wide.u32 %rd6, %r8, 4;\n"
1816 " mov.f32 %f1, 0f00000000; // 0\n"
1818 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1820 " ld.global.f32 %f2, [%rd5+0];\n"
1821 " add.f32 %f1, %f2, %f1;\n"
1823 " ld.param.u32 %r5, [__cudaparm_reduce_float_32_false_n];\n"
1825 " setp.ge.u32 %p2, %r6, %r5;\n"
1826 " @%p2 bra $Lt_17_14594;\n"
1827 " //<loop> Part of loop body line 181, head labeled $Lt_17_14338\n"
1829 " ld.global.f32 %f3, [%rd5+128];\n"
1830 " add.f32 %f1, %f3, %f1;\n"
1832 " //<loop> Part of loop body line 181, head labeled $Lt_17_14338\n"
1833 " add.u32 %r6, %r6, %r8;\n"
1834 " add.u64 %rd5, %rd5, %rd6;\n"
1835 " setp.lt.u32 %p3, %r6, %r7;\n"
1836 " @%p3 bra $Lt_17_14338;\n"
1837 " bra.uni $Lt_17_13826;\n"
1839 " mov.f32 %f1, 0f00000000; // 0\n"
1842 " mov.u64 %rd7, __smem;\n"
1843 " cvt.u64.u32 %rd8, %r3;\n"
1844 " mul.wide.u32 %rd9, %r3, 4;\n"
1845 " add.u64 %rd10, %rd7, %rd9;\n"
1846 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1849 " mov.u32 %r9, 31;\n"
1850 " setp.gt.u32 %p4, %r3, %r9;\n"
1851 " @%p4 bra $Lt_17_15362;\n"
1853 " ld.volatile.shared.f32 %f4, [%rd10+64];\n"
1854 " add.f32 %f5, %f4, %f1;\n"
1855 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1857 " ld.volatile.shared.f32 %f6, [%rd10+32];\n"
1858 " add.f32 %f7, %f6, %f5;\n"
1859 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1861 " ld.volatile.shared.f32 %f8, [%rd10+16];\n"
1862 " add.f32 %f9, %f8, %f7;\n"
1863 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1865 " ld.volatile.shared.f32 %f10, [%rd10+8];\n"
1866 " add.f32 %f11, %f10, %f9;\n"
1867 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1869 " ld.volatile.shared.f32 %f12, [%rd10+4];\n"
1870 " add.f32 %f13, %f12, %f11;\n"
1871 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1874 " mov.u32 %r10, 0;\n"
1875 " setp.ne.u32 %p5, %r3, %r10;\n"
1876 " @%p5 bra $Lt_17_15874;\n"
1878 " ld.shared.f32 %f14, [__smem+0];\n"
1879 " ld.param.u64 %rd11, [__cudaparm_reduce_float_32_false_g_odata];\n"
1880 " cvt.u64.u32 %rd12, %r1;\n"
1881 " mul.wide.u32 %rd13, %r1, 4;\n"
1882 " add.u64 %rd14, %rd11, %rd13;\n"
1883 " st.global.f32 [%rd14+0], %f14;\n"
1887 "$LDWend_reduce_float_32_false:\n"
1888 " } // reduce_float_32_false\n"
1890 " .entry reduce_float_64_false (\n"
1891 " .param .u64 __cudaparm_reduce_float_64_false_g_idata,\n"
1892 " .param .u64 __cudaparm_reduce_float_64_false_g_odata,\n"
1893 " .param .u32 __cudaparm_reduce_float_64_false_n)\n"
1895 " .reg .u16 %rh<3>;\n"
1896 " .reg .u32 %r<12>;\n"
1897 " .reg .u64 %rd<16>;\n"
1898 " .reg .f32 %f<18>;\n"
1899 " .reg .pred %p<7>;\n"
1901 "$LDWbegin_reduce_float_64_false:\n"
1903 " cvt.u32.u16 %r1, %ctaid.x;\n"
1904 " mul24.lo.u32 %r2, %r1, 128;\n"
1905 " cvt.u32.u16 %r3, %tid.x;\n"
1906 " add.u32 %r4, %r2, %r3;\n"
1907 " ld.param.u32 %r5, [__cudaparm_reduce_float_64_false_n];\n"
1908 " setp.ge.u32 %p1, %r4, %r5;\n"
1909 " @%p1 bra $Lt_18_16130;\n"
1910 " add.u32 %r6, %r4, 64;\n"
1911 " ld.param.u32 %r5, [__cudaparm_reduce_float_64_false_n];\n"
1912 " add.u32 %r7, %r5, 64;\n"
1913 " mov.u16 %rh1, %nctaid.x;\n"
1914 " mul.wide.u16 %r8, %rh1, 128;\n"
1915 " cvt.s64.u32 %rd1, %r8;\n"
1916 " ld.param.u64 %rd2, [__cudaparm_reduce_float_64_false_g_idata];\n"
1917 " cvt.u64.u32 %rd3, %r4;\n"
1918 " mul.wide.u32 %rd4, %r4, 4;\n"
1919 " add.u64 %rd5, %rd2, %rd4;\n"
1920 " mul.wide.u32 %rd6, %r8, 4;\n"
1921 " mov.f32 %f1, 0f00000000; // 0\n"
1923 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1925 " ld.global.f32 %f2, [%rd5+0];\n"
1926 " add.f32 %f1, %f2, %f1;\n"
1928 " ld.param.u32 %r5, [__cudaparm_reduce_float_64_false_n];\n"
1930 " setp.ge.u32 %p2, %r6, %r5;\n"
1931 " @%p2 bra $Lt_18_14338;\n"
1932 " //<loop> Part of loop body line 181, head labeled $Lt_18_14082\n"
1934 " ld.global.f32 %f3, [%rd5+256];\n"
1935 " add.f32 %f1, %f3, %f1;\n"
1937 " //<loop> Part of loop body line 181, head labeled $Lt_18_14082\n"
1938 " add.u32 %r6, %r6, %r8;\n"
1939 " add.u64 %rd5, %rd5, %rd6;\n"
1940 " setp.lt.u32 %p3, %r6, %r7;\n"
1941 " @%p3 bra $Lt_18_14082;\n"
1942 " bra.uni $Lt_18_13570;\n"
1944 " mov.f32 %f1, 0f00000000; // 0\n"
1947 " mov.u64 %rd7, __smem;\n"
1948 " cvt.u64.u32 %rd8, %r3;\n"
1949 " mul.wide.u32 %rd9, %r3, 4;\n"
1950 " add.u64 %rd10, %rd7, %rd9;\n"
1951 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1954 " mov.u32 %r9, 31;\n"
1955 " setp.gt.u32 %p4, %r3, %r9;\n"
1956 " @%p4 bra $Lt_18_15106;\n"
1958 " ld.volatile.shared.f32 %f4, [%rd10+128];\n"
1959 " add.f32 %f5, %f4, %f1;\n"
1960 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1962 " ld.volatile.shared.f32 %f6, [%rd10+64];\n"
1963 " add.f32 %f7, %f6, %f5;\n"
1964 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1966 " ld.volatile.shared.f32 %f8, [%rd10+32];\n"
1967 " add.f32 %f9, %f8, %f7;\n"
1968 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1970 " ld.volatile.shared.f32 %f10, [%rd10+16];\n"
1971 " add.f32 %f11, %f10, %f9;\n"
1972 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1974 " ld.volatile.shared.f32 %f12, [%rd10+8];\n"
1975 " add.f32 %f13, %f12, %f11;\n"
1976 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1978 " ld.volatile.shared.f32 %f14, [%rd10+4];\n"
1979 " add.f32 %f15, %f14, %f13;\n"
1980 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
1983 " mov.u32 %r10, 0;\n"
1984 " setp.ne.u32 %p5, %r3, %r10;\n"
1985 " @%p5 bra $Lt_18_15618;\n"
1987 " ld.shared.f32 %f16, [__smem+0];\n"
1988 " ld.param.u64 %rd11, [__cudaparm_reduce_float_64_false_g_odata];\n"
1989 " cvt.u64.u32 %rd12, %r1;\n"
1990 " mul.wide.u32 %rd13, %r1, 4;\n"
1991 " add.u64 %rd14, %rd11, %rd13;\n"
1992 " st.global.f32 [%rd14+0], %f16;\n"
1996 "$LDWend_reduce_float_64_false:\n"
1997 " } // reduce_float_64_false\n"
1999 " .entry reduce_float_128_false (\n"
2000 " .param .u64 __cudaparm_reduce_float_128_false_g_idata,\n"
2001 " .param .u64 __cudaparm_reduce_float_128_false_g_odata,\n"
2002 " .param .u32 __cudaparm_reduce_float_128_false_n)\n"
2004 " .reg .u16 %rh<3>;\n"
2005 " .reg .u32 %r<13>;\n"
2006 " .reg .u64 %rd<16>;\n"
2007 " .reg .f32 %f<20>;\n"
2008 " .reg .pred %p<8>;\n"
2010 "$LDWbegin_reduce_float_128_false:\n"
2012 " cvt.u32.u16 %r1, %ctaid.x;\n"
2013 " mul.lo.u32 %r2, %r1, 256;\n"
2014 " cvt.u32.u16 %r3, %tid.x;\n"
2015 " add.u32 %r4, %r2, %r3;\n"
2016 " ld.param.u32 %r5, [__cudaparm_reduce_float_128_false_n];\n"
2017 " setp.ge.u32 %p1, %r4, %r5;\n"
2018 " @%p1 bra $Lt_19_16386;\n"
2019 " add.u32 %r6, %r4, 128;\n"
2020 " ld.param.u32 %r5, [__cudaparm_reduce_float_128_false_n];\n"
2021 " add.u32 %r7, %r5, 128;\n"
2022 " mov.u16 %rh1, %nctaid.x;\n"
2023 " mul.wide.u16 %r8, %rh1, 256;\n"
2024 " cvt.s64.u32 %rd1, %r8;\n"
2025 " ld.param.u64 %rd2, [__cudaparm_reduce_float_128_false_g_idata];\n"
2026 " cvt.u64.u32 %rd3, %r4;\n"
2027 " mul.wide.u32 %rd4, %r4, 4;\n"
2028 " add.u64 %rd5, %rd2, %rd4;\n"
2029 " mul.wide.u32 %rd6, %r8, 4;\n"
2030 " mov.f32 %f1, 0f00000000; // 0\n"
2032 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2034 " ld.global.f32 %f2, [%rd5+0];\n"
2035 " add.f32 %f1, %f2, %f1;\n"
2037 " ld.param.u32 %r5, [__cudaparm_reduce_float_128_false_n];\n"
2039 " setp.ge.u32 %p2, %r6, %r5;\n"
2040 " @%p2 bra $Lt_19_14082;\n"
2041 " //<loop> Part of loop body line 181, head labeled $Lt_19_13826\n"
2043 " ld.global.f32 %f3, [%rd5+512];\n"
2044 " add.f32 %f1, %f3, %f1;\n"
2046 " //<loop> Part of loop body line 181, head labeled $Lt_19_13826\n"
2047 " add.u32 %r6, %r6, %r8;\n"
2048 " add.u64 %rd5, %rd5, %rd6;\n"
2049 " setp.lt.u32 %p3, %r6, %r7;\n"
2050 " @%p3 bra $Lt_19_13826;\n"
2051 " bra.uni $Lt_19_13314;\n"
2053 " mov.f32 %f1, 0f00000000; // 0\n"
2056 " mov.f32 %f4, %f1;\n"
2057 " mov.f32 %f5, %f4;\n"
2059 " mov.u64 %rd7, __smem;\n"
2060 " cvt.u64.u32 %rd8, %r3;\n"
2061 " mul.wide.u32 %rd9, %r3, 4;\n"
2062 " add.u64 %rd10, %rd7, %rd9;\n"
2063 " st.volatile.shared.f32 [%rd10+0], %f4;\n"
2066 " mov.u32 %r9, 63;\n"
2067 " setp.gt.u32 %p4, %r3, %r9;\n"
2068 " @%p4 bra $Lt_19_14850;\n"
2070 " ld.volatile.shared.f32 %f6, [%rd10+256];\n"
2071 " add.f32 %f5, %f6, %f4;\n"
2072 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2075 " mov.u32 %r10, 31;\n"
2076 " setp.gt.u32 %p5, %r3, %r10;\n"
2077 " @%p5 bra $Lt_19_15362;\n"
2079 " ld.volatile.shared.f32 %f7, [%rd10+128];\n"
2080 " add.f32 %f8, %f7, %f5;\n"
2081 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
2083 " ld.volatile.shared.f32 %f9, [%rd10+64];\n"
2084 " add.f32 %f10, %f9, %f8;\n"
2085 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
2087 " ld.volatile.shared.f32 %f11, [%rd10+32];\n"
2088 " add.f32 %f12, %f11, %f10;\n"
2089 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
2091 " ld.volatile.shared.f32 %f13, [%rd10+16];\n"
2092 " add.f32 %f14, %f13, %f12;\n"
2093 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
2095 " ld.volatile.shared.f32 %f15, [%rd10+8];\n"
2096 " add.f32 %f16, %f15, %f14;\n"
2097 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
2099 " ld.volatile.shared.f32 %f17, [%rd10+4];\n"
2100 " add.f32 %f5, %f17, %f16;\n"
2101 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2104 " mov.u32 %r11, 0;\n"
2105 " setp.ne.u32 %p6, %r3, %r11;\n"
2106 " @%p6 bra $Lt_19_15874;\n"
2108 " ld.shared.f32 %f18, [__smem+0];\n"
2109 " ld.param.u64 %rd11, [__cudaparm_reduce_float_128_false_g_odata];\n"
2110 " cvt.u64.u32 %rd12, %r1;\n"
2111 " mul.wide.u32 %rd13, %r1, 4;\n"
2112 " add.u64 %rd14, %rd11, %rd13;\n"
2113 " st.global.f32 [%rd14+0], %f18;\n"
2117 "$LDWend_reduce_float_128_false:\n"
2118 " } // reduce_float_128_false\n"
2120 " .entry reduce_float_256_false (\n"
2121 " .param .u64 __cudaparm_reduce_float_256_false_g_idata,\n"
2122 " .param .u64 __cudaparm_reduce_float_256_false_g_odata,\n"
2123 " .param .u32 __cudaparm_reduce_float_256_false_n)\n"
2125 " .reg .u16 %rh<3>;\n"
2126 " .reg .u32 %r<14>;\n"
2127 " .reg .u64 %rd<16>;\n"
2128 " .reg .f32 %f<21>;\n"
2129 " .reg .pred %p<9>;\n"
2131 "$LDWbegin_reduce_float_256_false:\n"
2133 " cvt.u32.u16 %r1, %ctaid.x;\n"
2134 " mul.lo.u32 %r2, %r1, 512;\n"
2135 " cvt.u32.u16 %r3, %tid.x;\n"
2136 " add.u32 %r4, %r2, %r3;\n"
2137 " ld.param.u32 %r5, [__cudaparm_reduce_float_256_false_n];\n"
2138 " setp.ge.u32 %p1, %r4, %r5;\n"
2139 " @%p1 bra $Lt_20_16642;\n"
2140 " add.u32 %r6, %r4, 256;\n"
2141 " ld.param.u32 %r5, [__cudaparm_reduce_float_256_false_n];\n"
2142 " add.u32 %r7, %r5, 256;\n"
2143 " mov.u16 %rh1, %nctaid.x;\n"
2144 " mul.wide.u16 %r8, %rh1, 512;\n"
2145 " cvt.s64.u32 %rd1, %r8;\n"
2146 " ld.param.u64 %rd2, [__cudaparm_reduce_float_256_false_g_idata];\n"
2147 " cvt.u64.u32 %rd3, %r4;\n"
2148 " mul.wide.u32 %rd4, %r4, 4;\n"
2149 " add.u64 %rd5, %rd2, %rd4;\n"
2150 " mul.wide.u32 %rd6, %r8, 4;\n"
2151 " mov.f32 %f1, 0f00000000; // 0\n"
2153 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2155 " ld.global.f32 %f2, [%rd5+0];\n"
2156 " add.f32 %f1, %f2, %f1;\n"
2158 " ld.param.u32 %r5, [__cudaparm_reduce_float_256_false_n];\n"
2160 " setp.ge.u32 %p2, %r6, %r5;\n"
2161 " @%p2 bra $Lt_20_13826;\n"
2162 " //<loop> Part of loop body line 181, head labeled $Lt_20_13570\n"
2164 " ld.global.f32 %f3, [%rd5+1024];\n"
2165 " add.f32 %f1, %f3, %f1;\n"
2167 " //<loop> Part of loop body line 181, head labeled $Lt_20_13570\n"
2168 " add.u32 %r6, %r6, %r8;\n"
2169 " add.u64 %rd5, %rd5, %rd6;\n"
2170 " setp.lt.u32 %p3, %r6, %r7;\n"
2171 " @%p3 bra $Lt_20_13570;\n"
2172 " bra.uni $Lt_20_13058;\n"
2174 " mov.f32 %f1, 0f00000000; // 0\n"
2177 " mov.f32 %f4, %f1;\n"
2178 " mov.f32 %f5, %f4;\n"
2180 " mov.u64 %rd7, __smem;\n"
2181 " cvt.u64.u32 %rd8, %r3;\n"
2182 " mul.wide.u32 %rd9, %r3, 4;\n"
2183 " add.u64 %rd10, %rd7, %rd9;\n"
2184 " st.volatile.shared.f32 [%rd10+0], %f4;\n"
2187 " mov.u32 %r9, 127;\n"
2188 " setp.gt.u32 %p4, %r3, %r9;\n"
2189 " @%p4 bra $Lt_20_14594;\n"
2191 " ld.volatile.shared.f32 %f6, [%rd10+512];\n"
2192 " add.f32 %f5, %f6, %f4;\n"
2193 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2196 " mov.u32 %r10, 63;\n"
2197 " setp.gt.u32 %p5, %r3, %r10;\n"
2198 " @%p5 bra $Lt_20_15106;\n"
2200 " ld.volatile.shared.f32 %f7, [%rd10+256];\n"
2201 " add.f32 %f5, %f7, %f5;\n"
2202 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2205 " mov.u32 %r11, 31;\n"
2206 " setp.gt.u32 %p6, %r3, %r11;\n"
2207 " @%p6 bra $Lt_20_15618;\n"
2209 " ld.volatile.shared.f32 %f8, [%rd10+128];\n"
2210 " add.f32 %f9, %f8, %f5;\n"
2211 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
2213 " ld.volatile.shared.f32 %f10, [%rd10+64];\n"
2214 " add.f32 %f11, %f10, %f9;\n"
2215 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
2217 " ld.volatile.shared.f32 %f12, [%rd10+32];\n"
2218 " add.f32 %f13, %f12, %f11;\n"
2219 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
2221 " ld.volatile.shared.f32 %f14, [%rd10+16];\n"
2222 " add.f32 %f15, %f14, %f13;\n"
2223 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
2225 " ld.volatile.shared.f32 %f16, [%rd10+8];\n"
2226 " add.f32 %f17, %f16, %f15;\n"
2227 " st.volatile.shared.f32 [%rd10+0], %f17;\n"
2229 " ld.volatile.shared.f32 %f18, [%rd10+4];\n"
2230 " add.f32 %f5, %f18, %f17;\n"
2231 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2234 " mov.u32 %r12, 0;\n"
2235 " setp.ne.u32 %p7, %r3, %r12;\n"
2236 " @%p7 bra $Lt_20_16130;\n"
2238 " ld.shared.f32 %f19, [__smem+0];\n"
2239 " ld.param.u64 %rd11, [__cudaparm_reduce_float_256_false_g_odata];\n"
2240 " cvt.u64.u32 %rd12, %r1;\n"
2241 " mul.wide.u32 %rd13, %r1, 4;\n"
2242 " add.u64 %rd14, %rd11, %rd13;\n"
2243 " st.global.f32 [%rd14+0], %f19;\n"
2247 "$LDWend_reduce_float_256_false:\n"
2248 " } // reduce_float_256_false\n"
2250 " .entry reduce_float_512_false (\n"
2251 " .param .u64 __cudaparm_reduce_float_512_false_g_idata,\n"
2252 " .param .u64 __cudaparm_reduce_float_512_false_g_odata,\n"
2253 " .param .u32 __cudaparm_reduce_float_512_false_n)\n"
2255 " .reg .u16 %rh<3>;\n"
2256 " .reg .u32 %r<15>;\n"
2257 " .reg .u64 %rd<16>;\n"
2258 " .reg .f32 %f<22>;\n"
2259 " .reg .pred %p<10>;\n"
2261 "$LDWbegin_reduce_float_512_false:\n"
2263 " cvt.u32.u16 %r1, %ctaid.x;\n"
2264 " mul.lo.u32 %r2, %r1, 1024;\n"
2265 " cvt.u32.u16 %r3, %tid.x;\n"
2266 " add.u32 %r4, %r2, %r3;\n"
2267 " ld.param.u32 %r5, [__cudaparm_reduce_float_512_false_n];\n"
2268 " setp.ge.u32 %p1, %r4, %r5;\n"
2269 " @%p1 bra $Lt_21_16898;\n"
2270 " add.u32 %r6, %r4, 512;\n"
2271 " ld.param.u32 %r5, [__cudaparm_reduce_float_512_false_n];\n"
2272 " add.u32 %r7, %r5, 512;\n"
2273 " mov.u16 %rh1, %nctaid.x;\n"
2274 " mul.wide.u16 %r8, %rh1, 1024;\n"
2275 " cvt.s64.u32 %rd1, %r8;\n"
2276 " ld.param.u64 %rd2, [__cudaparm_reduce_float_512_false_g_idata];\n"
2277 " cvt.u64.u32 %rd3, %r4;\n"
2278 " mul.wide.u32 %rd4, %r4, 4;\n"
2279 " add.u64 %rd5, %rd2, %rd4;\n"
2280 " mul.wide.u32 %rd6, %r8, 4;\n"
2281 " mov.f32 %f1, 0f00000000; // 0\n"
2283 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2285 " ld.global.f32 %f2, [%rd5+0];\n"
2286 " add.f32 %f1, %f2, %f1;\n"
2288 " ld.param.u32 %r5, [__cudaparm_reduce_float_512_false_n];\n"
2290 " setp.ge.u32 %p2, %r6, %r5;\n"
2291 " @%p2 bra $Lt_21_13570;\n"
2292 " //<loop> Part of loop body line 181, head labeled $Lt_21_13314\n"
2294 " ld.global.f32 %f3, [%rd5+2048];\n"
2295 " add.f32 %f1, %f3, %f1;\n"
2297 " //<loop> Part of loop body line 181, head labeled $Lt_21_13314\n"
2298 " add.u32 %r6, %r6, %r8;\n"
2299 " add.u64 %rd5, %rd5, %rd6;\n"
2300 " setp.lt.u32 %p3, %r6, %r7;\n"
2301 " @%p3 bra $Lt_21_13314;\n"
2302 " bra.uni $Lt_21_12802;\n"
2304 " mov.f32 %f1, 0f00000000; // 0\n"
2307 " mov.f32 %f4, %f1;\n"
2308 " mov.f32 %f5, %f4;\n"
2310 " mov.u64 %rd7, __smem;\n"
2311 " cvt.u64.u32 %rd8, %r3;\n"
2312 " mul.wide.u32 %rd9, %r3, 4;\n"
2313 " add.u64 %rd10, %rd7, %rd9;\n"
2314 " st.volatile.shared.f32 [%rd10+0], %f4;\n"
2317 " mov.u32 %r9, 255;\n"
2318 " setp.gt.u32 %p4, %r3, %r9;\n"
2319 " @%p4 bra $Lt_21_14338;\n"
2321 " ld.volatile.shared.f32 %f6, [%rd10+1024];\n"
2322 " add.f32 %f5, %f6, %f4;\n"
2323 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2326 " mov.u32 %r10, 127;\n"
2327 " setp.gt.u32 %p5, %r3, %r10;\n"
2328 " @%p5 bra $Lt_21_14850;\n"
2330 " ld.volatile.shared.f32 %f7, [%rd10+512];\n"
2331 " add.f32 %f5, %f7, %f5;\n"
2332 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2335 " mov.u32 %r11, 63;\n"
2336 " setp.gt.u32 %p6, %r3, %r11;\n"
2337 " @%p6 bra $Lt_21_15362;\n"
2339 " ld.volatile.shared.f32 %f8, [%rd10+256];\n"
2340 " add.f32 %f5, %f8, %f5;\n"
2341 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2344 " mov.u32 %r12, 31;\n"
2345 " setp.gt.u32 %p7, %r3, %r12;\n"
2346 " @%p7 bra $Lt_21_15874;\n"
2348 " ld.volatile.shared.f32 %f9, [%rd10+128];\n"
2349 " add.f32 %f10, %f9, %f5;\n"
2350 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
2352 " ld.volatile.shared.f32 %f11, [%rd10+64];\n"
2353 " add.f32 %f12, %f11, %f10;\n"
2354 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
2356 " ld.volatile.shared.f32 %f13, [%rd10+32];\n"
2357 " add.f32 %f14, %f13, %f12;\n"
2358 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
2360 " ld.volatile.shared.f32 %f15, [%rd10+16];\n"
2361 " add.f32 %f16, %f15, %f14;\n"
2362 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
2364 " ld.volatile.shared.f32 %f17, [%rd10+8];\n"
2365 " add.f32 %f18, %f17, %f16;\n"
2366 " st.volatile.shared.f32 [%rd10+0], %f18;\n"
2368 " ld.volatile.shared.f32 %f19, [%rd10+4];\n"
2369 " add.f32 %f5, %f19, %f18;\n"
2370 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2373 " mov.u32 %r13, 0;\n"
2374 " setp.ne.u32 %p8, %r3, %r13;\n"
2375 " @%p8 bra $Lt_21_16386;\n"
2377 " ld.shared.f32 %f20, [__smem+0];\n"
2378 " ld.param.u64 %rd11, [__cudaparm_reduce_float_512_false_g_odata];\n"
2379 " cvt.u64.u32 %rd12, %r1;\n"
2380 " mul.wide.u32 %rd13, %r1, 4;\n"
2381 " add.u64 %rd14, %rd11, %rd13;\n"
2382 " st.global.f32 [%rd14+0], %f20;\n"
2386 "$LDWend_reduce_float_512_false:\n"
2387 " } // reduce_float_512_false\n"
2389 " .entry tex_reduce_256_false (\n"
2390 " .param .u64 __cudaparm_tex_reduce_256_false_g_odata,\n"
2391 " .param .u32 __cudaparm_tex_reduce_256_false_n,\n"
2392 " .param .u32 __cudaparm_tex_reduce_256_false_stride)\n"
2394 " .reg .u16 %rh<3>;\n"
2395 " .reg .u32 %r<19>;\n"
2396 " .reg .u64 %rd<10>;\n"
2397 " .reg .f32 %f<37>;\n"
2398 " .reg .pred %p<9>;\n"
2400 "$LDWbegin_tex_reduce_256_false:\n"
2402 " cvt.u32.u16 %r1, %ctaid.x;\n"
2403 " mul.lo.u32 %r2, %r1, 512;\n"
2404 " cvt.u32.u16 %r3, %tid.x;\n"
2405 " add.u32 %r4, %r2, %r3;\n"
2406 " mov.s32 %r5, %r4;\n"
2407 " ld.param.u32 %r6, [__cudaparm_tex_reduce_256_false_n];\n"
2408 " setp.ge.u32 %p1, %r4, %r6;\n"
2409 " @%p1 bra $Lt_22_16642;\n"
2410 " mov.u16 %rh1, %nctaid.x;\n"
2411 " mul.wide.u16 %r7, %rh1, 512;\n"
2412 " ld.param.u32 %r8, [__cudaparm_tex_reduce_256_false_stride];\n"
2413 " mov.f32 %f1, 0f00000000; // 0\n"
2415 " //<loop> Loop body line 240, nesting depth: 1, estimated iterations: unknown\n"
2416 " ld.param.u32 %r8, [__cudaparm_tex_reduce_256_false_stride];\n"
2417 " rem.u32 %r9, %r5, %r8;\n"
2418 " cvt.rn.f32.u32 %f2, %r9;\n"
2419 " div.u32 %r10, %r5, %r8;\n"
2420 " cvt.rn.f32.u32 %f3, %r10;\n"
2421 " mov.f32 %f4, 0f00000000; // 0\n"
2422 " mov.f32 %f5, 0f00000000; // 0\n"
2423 " tex.2d.v4.f32.f32 {%f6,%f7,%f8,%f9},[tex_ref_1,{%f2,%f3,%f4,%f5}];\n"
2424 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2426 " mov.f32 %f10, %f6;\n"
2427 " add.f32 %f1, %f10, %f1;\n"
2428 " add.u32 %r11, %r5, 256;\n"
2430 " ld.param.u32 %r6, [__cudaparm_tex_reduce_256_false_n];\n"
2432 " setp.ge.u32 %p2, %r11, %r6;\n"
2433 " @%p2 bra $Lt_22_13826;\n"
2434 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2436 " ld.param.u32 %r8, [__cudaparm_tex_reduce_256_false_stride];\n"
2438 " rem.u32 %r12, %r11, %r8;\n"
2439 " cvt.rn.f32.u32 %f11, %r12;\n"
2440 " div.u32 %r13, %r11, %r8;\n"
2441 " cvt.rn.f32.u32 %f12, %r13;\n"
2442 " mov.f32 %f13, 0f00000000; // 0\n"
2443 " mov.f32 %f14, 0f00000000; // 0\n"
2444 " tex.2d.v4.f32.f32 {%f15,%f16,%f17,%f18},[tex_ref_1,{%f11,%f12,%f13,%f14}];\n"
2445 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2447 " mov.f32 %f19, %f15;\n"
2448 " add.f32 %f1, %f19, %f1;\n"
2450 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2451 " add.u32 %r5, %r7, %r5;\n"
2453 " ld.param.u32 %r6, [__cudaparm_tex_reduce_256_false_n];\n"
2455 " setp.lt.u32 %p3, %r5, %r6;\n"
2456 " @%p3 bra $Lt_22_13570;\n"
2457 " bra.uni $Lt_22_13058;\n"
2459 " mov.f32 %f1, 0f00000000; // 0\n"
2462 " mov.f32 %f20, %f1;\n"
2463 " mov.f32 %f21, %f20;\n"
2465 " mov.u64 %rd1, __smem;\n"
2466 " cvt.u64.u32 %rd2, %r3;\n"
2467 " mul.wide.u32 %rd3, %r3, 4;\n"
2468 " add.u64 %rd4, %rd1, %rd3;\n"
2469 " st.volatile.shared.f32 [%rd4+0], %f20;\n"
2472 " mov.u32 %r14, 127;\n"
2473 " setp.gt.u32 %p4, %r3, %r14;\n"
2474 " @%p4 bra $Lt_22_14594;\n"
2476 " ld.volatile.shared.f32 %f22, [%rd4+512];\n"
2477 " add.f32 %f21, %f22, %f20;\n"
2478 " st.volatile.shared.f32 [%rd4+0], %f21;\n"
2481 " mov.u32 %r15, 63;\n"
2482 " setp.gt.u32 %p5, %r3, %r15;\n"
2483 " @%p5 bra $Lt_22_15106;\n"
2485 " ld.volatile.shared.f32 %f23, [%rd4+256];\n"
2486 " add.f32 %f21, %f23, %f21;\n"
2487 " st.volatile.shared.f32 [%rd4+0], %f21;\n"
2490 " mov.u32 %r16, 31;\n"
2491 " setp.gt.u32 %p6, %r3, %r16;\n"
2492 " @%p6 bra $Lt_22_15618;\n"
2494 " ld.volatile.shared.f32 %f24, [%rd4+128];\n"
2495 " add.f32 %f25, %f24, %f21;\n"
2496 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2498 " ld.volatile.shared.f32 %f26, [%rd4+64];\n"
2499 " add.f32 %f27, %f26, %f25;\n"
2500 " st.volatile.shared.f32 [%rd4+0], %f27;\n"
2502 " ld.volatile.shared.f32 %f28, [%rd4+32];\n"
2503 " add.f32 %f29, %f28, %f27;\n"
2504 " st.volatile.shared.f32 [%rd4+0], %f29;\n"
2506 " ld.volatile.shared.f32 %f30, [%rd4+16];\n"
2507 " add.f32 %f31, %f30, %f29;\n"
2508 " st.volatile.shared.f32 [%rd4+0], %f31;\n"
2510 " ld.volatile.shared.f32 %f32, [%rd4+8];\n"
2511 " add.f32 %f33, %f32, %f31;\n"
2512 " st.volatile.shared.f32 [%rd4+0], %f33;\n"
2514 " ld.volatile.shared.f32 %f34, [%rd4+4];\n"
2515 " add.f32 %f21, %f34, %f33;\n"
2516 " st.volatile.shared.f32 [%rd4+0], %f21;\n"
2519 " mov.u32 %r17, 0;\n"
2520 " setp.ne.u32 %p7, %r3, %r17;\n"
2521 " @%p7 bra $Lt_22_16130;\n"
2523 " ld.shared.f32 %f35, [__smem+0];\n"
2524 " ld.param.u64 %rd5, [__cudaparm_tex_reduce_256_false_g_odata];\n"
2525 " cvt.u64.u32 %rd6, %r1;\n"
2526 " mul.wide.u32 %rd7, %r1, 4;\n"
2527 " add.u64 %rd8, %rd5, %rd7;\n"
2528 " st.global.f32 [%rd8+0], %f35;\n"
2532 "$LDWend_tex_reduce_256_false:\n"
2533 " } // tex_reduce_256_false\n"
2535 " .entry tex_count_256_false (\n"
2536 " .param .u64 __cudaparm_tex_count_256_false_g_odata,\n"
2537 " .param .u32 __cudaparm_tex_count_256_false_n,\n"
2538 " .param .u32 __cudaparm_tex_count_256_false_stride)\n"
2540 " .reg .u16 %rh<3>;\n"
2541 " .reg .u32 %r<19>;\n"
2542 " .reg .u64 %rd<10>;\n"
2543 " .reg .f32 %f<41>;\n"
2544 " .reg .f64 %fd<6>;\n"
2545 " .reg .pred %p<11>;\n"
2547 "$LDWbegin_tex_count_256_false:\n"
2549 " cvt.u32.u16 %r1, %ctaid.x;\n"
2550 " mul.lo.u32 %r2, %r1, 512;\n"
2551 " cvt.u32.u16 %r3, %tid.x;\n"
2552 " add.u32 %r4, %r2, %r3;\n"
2553 " mov.s32 %r5, %r4;\n"
2554 " ld.param.u32 %r6, [__cudaparm_tex_count_256_false_n];\n"
2555 " setp.ge.u32 %p1, %r4, %r6;\n"
2556 " @%p1 bra $Lt_23_18178;\n"
2557 " mov.u16 %rh1, %nctaid.x;\n"
2558 " mul.wide.u16 %r7, %rh1, 512;\n"
2559 " ld.param.u32 %r8, [__cudaparm_tex_count_256_false_stride];\n"
2560 " mov.f32 %f1, 0f00000000; // 0\n"
2562 " //<loop> Loop body line 333, nesting depth: 1, estimated iterations: unknown\n"
2563 " ld.param.u32 %r8, [__cudaparm_tex_count_256_false_stride];\n"
2564 " rem.u32 %r9, %r5, %r8;\n"
2565 " cvt.rn.f32.u32 %f2, %r9;\n"
2566 " div.u32 %r10, %r5, %r8;\n"
2567 " cvt.rn.f32.u32 %f3, %r10;\n"
2568 " mov.f32 %f4, 0f00000000; // 0\n"
2569 " mov.f32 %f5, 0f00000000; // 0\n"
2570 " tex.2d.v4.f32.f32 {%f6,%f7,%f8,%f9},[tex_ref_1,{%f2,%f3,%f4,%f5}];\n"
2571 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2573 " mov.f32 %f10, %f6;\n"
2575 " mov.f32 %f11, 0f3f800000; // 1\n"
2576 " add.f32 %f12, %f1, %f11;\n"
2577 " cvt.f64.f32 %fd1, %f10;\n"
2578 " mov.f64 %fd2, 0d0000000000000000; // 0\n"
2579 " setp.ne.f64 %p2, %fd1, %fd2;\n"
2580 " selp.f32 %f1, %f12, %f1, %p2;\n"
2581 " add.u32 %r11, %r5, 256;\n"
2583 " ld.param.u32 %r6, [__cudaparm_tex_count_256_false_n];\n"
2585 " setp.ge.u32 %p3, %r11, %r6;\n"
2586 " @%p3 bra $Lt_23_15362;\n"
2587 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2589 " ld.param.u32 %r8, [__cudaparm_tex_count_256_false_stride];\n"
2591 " rem.u32 %r12, %r11, %r8;\n"
2592 " cvt.rn.f32.u32 %f13, %r12;\n"
2593 " div.u32 %r13, %r11, %r8;\n"
2594 " cvt.rn.f32.u32 %f14, %r13;\n"
2595 " mov.f32 %f15, 0f00000000; // 0\n"
2596 " mov.f32 %f16, 0f00000000; // 0\n"
2597 " tex.2d.v4.f32.f32 {%f17,%f18,%f19,%f20},[tex_ref_1,{%f13,%f14,%f15,%f16}];\n"
2598 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2600 " mov.f32 %f21, %f17;\n"
2602 " mov.f32 %f22, 0f3f800000; // 1\n"
2603 " add.f32 %f23, %f1, %f22;\n"
2604 " cvt.f64.f32 %fd3, %f21;\n"
2605 " mov.f64 %fd4, 0d0000000000000000; // 0\n"
2606 " setp.ne.f64 %p4, %fd3, %fd4;\n"
2607 " selp.f32 %f1, %f23, %f1, %p4;\n"
2609 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2610 " add.u32 %r5, %r7, %r5;\n"
2612 " ld.param.u32 %r6, [__cudaparm_tex_count_256_false_n];\n"
2614 " setp.lt.u32 %p5, %r5, %r6;\n"
2615 " @%p5 bra $Lt_23_15106;\n"
2616 " bra.uni $Lt_23_14594;\n"
2618 " mov.f32 %f1, 0f00000000; // 0\n"
2621 " mov.f32 %f24, %f1;\n"
2622 " mov.f32 %f25, %f24;\n"
2624 " mov.u64 %rd1, __smem;\n"
2625 " cvt.u64.u32 %rd2, %r3;\n"
2626 " mul.wide.u32 %rd3, %r3, 4;\n"
2627 " add.u64 %rd4, %rd1, %rd3;\n"
2628 " st.volatile.shared.f32 [%rd4+0], %f24;\n"
2631 " mov.u32 %r14, 127;\n"
2632 " setp.gt.u32 %p6, %r3, %r14;\n"
2633 " @%p6 bra $Lt_23_16130;\n"
2635 " ld.volatile.shared.f32 %f26, [%rd4+512];\n"
2636 " add.f32 %f25, %f26, %f24;\n"
2637 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2640 " mov.u32 %r15, 63;\n"
2641 " setp.gt.u32 %p7, %r3, %r15;\n"
2642 " @%p7 bra $Lt_23_16642;\n"
2644 " ld.volatile.shared.f32 %f27, [%rd4+256];\n"
2645 " add.f32 %f25, %f27, %f25;\n"
2646 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2649 " mov.u32 %r16, 31;\n"
2650 " setp.gt.u32 %p8, %r3, %r16;\n"
2651 " @%p8 bra $Lt_23_17154;\n"
2653 " ld.volatile.shared.f32 %f28, [%rd4+128];\n"
2654 " add.f32 %f29, %f28, %f25;\n"
2655 " st.volatile.shared.f32 [%rd4+0], %f29;\n"
2657 " ld.volatile.shared.f32 %f30, [%rd4+64];\n"
2658 " add.f32 %f31, %f30, %f29;\n"
2659 " st.volatile.shared.f32 [%rd4+0], %f31;\n"
2661 " ld.volatile.shared.f32 %f32, [%rd4+32];\n"
2662 " add.f32 %f33, %f32, %f31;\n"
2663 " st.volatile.shared.f32 [%rd4+0], %f33;\n"
2665 " ld.volatile.shared.f32 %f34, [%rd4+16];\n"
2666 " add.f32 %f35, %f34, %f33;\n"
2667 " st.volatile.shared.f32 [%rd4+0], %f35;\n"
2669 " ld.volatile.shared.f32 %f36, [%rd4+8];\n"
2670 " add.f32 %f37, %f36, %f35;\n"
2671 " st.volatile.shared.f32 [%rd4+0], %f37;\n"
2673 " ld.volatile.shared.f32 %f38, [%rd4+4];\n"
2674 " add.f32 %f25, %f38, %f37;\n"
2675 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2678 " mov.u32 %r17, 0;\n"
2679 " setp.ne.u32 %p9, %r3, %r17;\n"
2680 " @%p9 bra $Lt_23_17666;\n"
2682 " ld.shared.f32 %f39, [__smem+0];\n"
2683 " ld.param.u64 %rd5, [__cudaparm_tex_count_256_false_g_odata];\n"
2684 " cvt.u64.u32 %rd6, %r1;\n"
2685 " mul.wide.u32 %rd7, %r1, 4;\n"
2686 " add.u64 %rd8, %rd5, %rd7;\n"
2687 " st.global.f32 [%rd8+0], %f39;\n"
2691 "$LDWend_tex_count_256_false:\n"
2692 " } // tex_count_256_false\n"
2694 " .entry chamfer_reduce_256_false (\n"
2695 " .param .u64 __cudaparm_chamfer_reduce_256_false_g_odata,\n"
2696 " .param .u32 __cudaparm_chamfer_reduce_256_false_n,\n"
2697 " .param .u32 __cudaparm_chamfer_reduce_256_false_stride)\n"
2699 " .reg .u16 %rh<3>;\n"
2700 " .reg .u32 %r<19>;\n"
2701 " .reg .u64 %rd<10>;\n"
2702 " .reg .f32 %f<59>;\n"
2703 " .reg .pred %p<9>;\n"
2705 "$LDWbegin_chamfer_reduce_256_false:\n"
2707 " cvt.u32.u16 %r1, %ctaid.x;\n"
2708 " mul.lo.u32 %r2, %r1, 512;\n"
2709 " cvt.u32.u16 %r3, %tid.x;\n"
2710 " add.u32 %r4, %r2, %r3;\n"
2711 " mov.s32 %r5, %r4;\n"
2712 " ld.param.u32 %r6, [__cudaparm_chamfer_reduce_256_false_n];\n"
2713 " setp.ge.u32 %p1, %r4, %r6;\n"
2714 " @%p1 bra $Lt_24_16642;\n"
2715 " mov.u16 %rh1, %nctaid.x;\n"
2716 " mul.wide.u16 %r7, %rh1, 512;\n"
2717 " ld.param.u32 %r8, [__cudaparm_chamfer_reduce_256_false_stride];\n"
2718 " mov.f32 %f1, 0f00000000; // 0\n"
2720 " //<loop> Loop body line 287, nesting depth: 1, estimated iterations: unknown\n"
2721 " ld.param.u32 %r8, [__cudaparm_chamfer_reduce_256_false_stride];\n"
2722 " rem.u32 %r9, %r5, %r8;\n"
2723 " cvt.rn.f32.u32 %f2, %r9;\n"
2724 " div.u32 %r10, %r5, %r8;\n"
2725 " cvt.rn.f32.u32 %f3, %r10;\n"
2726 " mov.f32 %f4, %f2;\n"
2727 " mov.f32 %f5, %f3;\n"
2728 " mov.f32 %f6, 0f00000000; // 0\n"
2729 " mov.f32 %f7, 0f00000000; // 0\n"
2730 " tex.2d.v4.f32.f32 {%f8,%f9,%f10,%f11},[tex_ref_1,{%f4,%f5,%f6,%f7}];\n"
2731 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2733 " mov.f32 %f12, %f8;\n"
2734 " mov.f32 %f13, %f2;\n"
2735 " mov.f32 %f14, %f3;\n"
2736 " mov.f32 %f15, 0f00000000; // 0\n"
2737 " mov.f32 %f16, 0f00000000; // 0\n"
2738 " tex.2d.v4.f32.f32 {%f17,%f18,%f19,%f20},[tex_ref_2,{%f13,%f14,%f15,%f16}];\n"
2739 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2740 " mov.f32 %f21, %f17;\n"
2741 " mad.f32 %f1, %f12, %f21, %f1;\n"
2742 " add.u32 %r11, %r5, 256;\n"
2744 " ld.param.u32 %r6, [__cudaparm_chamfer_reduce_256_false_n];\n"
2746 " setp.ge.u32 %p2, %r11, %r6;\n"
2747 " @%p2 bra $Lt_24_13826;\n"
2748 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2750 " ld.param.u32 %r8, [__cudaparm_chamfer_reduce_256_false_stride];\n"
2752 " rem.u32 %r12, %r11, %r8;\n"
2753 " cvt.rn.f32.u32 %f22, %r12;\n"
2754 " div.u32 %r13, %r11, %r8;\n"
2755 " cvt.rn.f32.u32 %f23, %r13;\n"
2756 " mov.f32 %f24, %f22;\n"
2757 " mov.f32 %f25, %f23;\n"
2758 " mov.f32 %f26, 0f00000000; // 0\n"
2759 " mov.f32 %f27, 0f00000000; // 0\n"
2760 " tex.2d.v4.f32.f32 {%f28,%f29,%f30,%f31},[tex_ref_1,{%f24,%f25,%f26,%f27}];\n"
2761 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2763 " mov.f32 %f32, %f28;\n"
2764 " mov.f32 %f33, %f22;\n"
2765 " mov.f32 %f34, %f23;\n"
2766 " mov.f32 %f35, 0f00000000; // 0\n"
2767 " mov.f32 %f36, 0f00000000; // 0\n"
2768 " tex.2d.v4.f32.f32 {%f37,%f38,%f39,%f40},[tex_ref_2,{%f33,%f34,%f35,%f36}];\n"
2769 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2770 " mov.f32 %f41, %f37;\n"
2771 " mad.f32 %f1, %f32, %f41, %f1;\n"
2773 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2774 " add.u32 %r5, %r7, %r5;\n"
2776 " ld.param.u32 %r6, [__cudaparm_chamfer_reduce_256_false_n];\n"
2778 " setp.lt.u32 %p3, %r5, %r6;\n"
2779 " @%p3 bra $Lt_24_13570;\n"
2780 " bra.uni $Lt_24_13058;\n"
2782 " mov.f32 %f1, 0f00000000; // 0\n"
2785 " mov.f32 %f42, %f1;\n"
2786 " mov.f32 %f43, %f42;\n"
2788 " mov.u64 %rd1, __smem;\n"
2789 " cvt.u64.u32 %rd2, %r3;\n"
2790 " mul.wide.u32 %rd3, %r3, 4;\n"
2791 " add.u64 %rd4, %rd1, %rd3;\n"
2792 " st.volatile.shared.f32 [%rd4+0], %f42;\n"
2795 " mov.u32 %r14, 127;\n"
2796 " setp.gt.u32 %p4, %r3, %r14;\n"
2797 " @%p4 bra $Lt_24_14594;\n"
2799 " ld.volatile.shared.f32 %f44, [%rd4+512];\n"
2800 " add.f32 %f43, %f44, %f42;\n"
2801 " st.volatile.shared.f32 [%rd4+0], %f43;\n"
2804 " mov.u32 %r15, 63;\n"
2805 " setp.gt.u32 %p5, %r3, %r15;\n"
2806 " @%p5 bra $Lt_24_15106;\n"
2808 " ld.volatile.shared.f32 %f45, [%rd4+256];\n"
2809 " add.f32 %f43, %f45, %f43;\n"
2810 " st.volatile.shared.f32 [%rd4+0], %f43;\n"
2813 " mov.u32 %r16, 31;\n"
2814 " setp.gt.u32 %p6, %r3, %r16;\n"
2815 " @%p6 bra $Lt_24_15618;\n"
2817 " ld.volatile.shared.f32 %f46, [%rd4+128];\n"
2818 " add.f32 %f47, %f46, %f43;\n"
2819 " st.volatile.shared.f32 [%rd4+0], %f47;\n"
2821 " ld.volatile.shared.f32 %f48, [%rd4+64];\n"
2822 " add.f32 %f49, %f48, %f47;\n"
2823 " st.volatile.shared.f32 [%rd4+0], %f49;\n"
2825 " ld.volatile.shared.f32 %f50, [%rd4+32];\n"
2826 " add.f32 %f51, %f50, %f49;\n"
2827 " st.volatile.shared.f32 [%rd4+0], %f51;\n"
2829 " ld.volatile.shared.f32 %f52, [%rd4+16];\n"
2830 " add.f32 %f53, %f52, %f51;\n"
2831 " st.volatile.shared.f32 [%rd4+0], %f53;\n"
2833 " ld.volatile.shared.f32 %f54, [%rd4+8];\n"
2834 " add.f32 %f55, %f54, %f53;\n"
2835 " st.volatile.shared.f32 [%rd4+0], %f55;\n"
2837 " ld.volatile.shared.f32 %f56, [%rd4+4];\n"
2838 " add.f32 %f43, %f56, %f55;\n"
2839 " st.volatile.shared.f32 [%rd4+0], %f43;\n"
2842 " mov.u32 %r17, 0;\n"
2843 " setp.ne.u32 %p7, %r3, %r17;\n"
2844 " @%p7 bra $Lt_24_16130;\n"
2846 " ld.shared.f32 %f57, [__smem+0];\n"
2847 " ld.param.u64 %rd5, [__cudaparm_chamfer_reduce_256_false_g_odata];\n"
2848 " cvt.u64.u32 %rd6, %r1;\n"
2849 " mul.wide.u32 %rd7, %r1, 4;\n"
2850 " add.u64 %rd8, %rd5, %rd7;\n"
2851 " st.global.f32 [%rd8+0], %f57;\n"
2855 "$LDWend_chamfer_reduce_256_false:\n"
2856 " } // chamfer_reduce_256_false\n"
2858 " .entry reduce_uchar_1_true (\n"
2859 " .param .u64 __cudaparm_reduce_uchar_1_true_g_idata,\n"
2860 " .param .u64 __cudaparm_reduce_uchar_1_true_g_odata,\n"
2861 " .param .u32 __cudaparm_reduce_uchar_1_true_n)\n"
2863 " .reg .u16 %rh<3>;\n"
2864 " .reg .u32 %r<12>;\n"
2865 " .reg .u64 %rd<14>;\n"
2866 " .reg .f32 %f<7>;\n"
2867 " .reg .pred %p<5>;\n"
2869 "$LDWbegin_reduce_uchar_1_true:\n"
2871 " cvt.u32.u16 %r1, %ctaid.x;\n"
2872 " mul24.lo.u32 %r2, %r1, 2;\n"
2873 " cvt.u32.u16 %r3, %tid.x;\n"
2874 " add.u32 %r4, %r2, %r3;\n"
2875 " mov.s32 %r5, %r4;\n"
2876 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_1_true_n];\n"
2877 " setp.ge.u32 %p1, %r4, %r6;\n"
2878 " @%p1 bra $Lt_25_16642;\n"
2879 " mov.u16 %rh1, %nctaid.x;\n"
2880 " mul.wide.u16 %r7, %rh1, 2;\n"
2881 " cvt.u64.u32 %rd1, %r4;\n"
2882 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_1_true_g_idata];\n"
2883 " add.u64 %rd3, %rd1, %rd2;\n"
2884 " cvt.s64.u32 %rd4, %r7;\n"
2885 " mov.f32 %f1, 0f00000000; // 0\n"
2887 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2889 " ld.global.u8 %r8, [%rd3+0];\n"
2890 " cvt.rn.f32.u32 %f2, %r8;\n"
2891 " add.f32 %f3, %f2, %f1;\n"
2893 " ld.global.u8 %r9, [%rd3+1];\n"
2894 " cvt.rn.f32.u32 %f4, %r9;\n"
2895 " add.f32 %f1, %f4, %f3;\n"
2896 " add.u32 %r5, %r7, %r5;\n"
2897 " add.u64 %rd3, %rd4, %rd3;\n"
2899 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_1_true_n];\n"
2901 " setp.lt.u32 %p2, %r5, %r6;\n"
2902 " @%p2 bra $Lt_25_15618;\n"
2903 " bra.uni $Lt_25_15106;\n"
2905 " mov.f32 %f1, 0f00000000; // 0\n"
2908 " mov.u64 %rd5, __smem;\n"
2909 " cvt.u64.u32 %rd6, %r3;\n"
2910 " mul.wide.u32 %rd7, %r3, 4;\n"
2911 " add.u64 %rd8, %rd5, %rd7;\n"
2912 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
2916 " mov.u32 %r10, 0;\n"
2917 " setp.ne.u32 %p3, %r3, %r10;\n"
2918 " @%p3 bra $Lt_25_16130;\n"
2920 " ld.shared.f32 %f5, [__smem+0];\n"
2921 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_1_true_g_odata];\n"
2922 " cvt.u64.u32 %rd10, %r1;\n"
2923 " mul.wide.u32 %rd11, %r1, 4;\n"
2924 " add.u64 %rd12, %rd9, %rd11;\n"
2925 " st.global.f32 [%rd12+0], %f5;\n"
2929 "$LDWend_reduce_uchar_1_true:\n"
2930 " } // reduce_uchar_1_true\n"
2932 " .entry reduce_uchar_2_true (\n"
2933 " .param .u64 __cudaparm_reduce_uchar_2_true_g_idata,\n"
2934 " .param .u64 __cudaparm_reduce_uchar_2_true_g_odata,\n"
2935 " .param .u32 __cudaparm_reduce_uchar_2_true_n)\n"
2937 " .reg .u16 %rh<3>;\n"
2938 " .reg .u32 %r<13>;\n"
2939 " .reg .u64 %rd<14>;\n"
2940 " .reg .f32 %f<9>;\n"
2941 " .reg .pred %p<6>;\n"
2943 "$LDWbegin_reduce_uchar_2_true:\n"
2945 " cvt.u32.u16 %r1, %ctaid.x;\n"
2946 " mul24.lo.u32 %r2, %r1, 4;\n"
2947 " cvt.u32.u16 %r3, %tid.x;\n"
2948 " add.u32 %r4, %r2, %r3;\n"
2949 " mov.s32 %r5, %r4;\n"
2950 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_2_true_n];\n"
2951 " setp.ge.u32 %p1, %r4, %r6;\n"
2952 " @%p1 bra $Lt_26_16898;\n"
2953 " mov.u16 %rh1, %nctaid.x;\n"
2954 " mul.wide.u16 %r7, %rh1, 4;\n"
2955 " cvt.u64.u32 %rd1, %r4;\n"
2956 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_2_true_g_idata];\n"
2957 " add.u64 %rd3, %rd1, %rd2;\n"
2958 " cvt.s64.u32 %rd4, %r7;\n"
2959 " mov.f32 %f1, 0f00000000; // 0\n"
2961 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2963 " ld.global.u8 %r8, [%rd3+0];\n"
2964 " cvt.rn.f32.u32 %f2, %r8;\n"
2965 " add.f32 %f3, %f2, %f1;\n"
2967 " ld.global.u8 %r9, [%rd3+2];\n"
2968 " cvt.rn.f32.u32 %f4, %r9;\n"
2969 " add.f32 %f1, %f4, %f3;\n"
2970 " add.u32 %r5, %r7, %r5;\n"
2971 " add.u64 %rd3, %rd4, %rd3;\n"
2973 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_2_true_n];\n"
2975 " setp.lt.u32 %p2, %r5, %r6;\n"
2976 " @%p2 bra $Lt_26_15362;\n"
2977 " bra.uni $Lt_26_14850;\n"
2979 " mov.f32 %f1, 0f00000000; // 0\n"
2982 " mov.u64 %rd5, __smem;\n"
2983 " cvt.u64.u32 %rd6, %r3;\n"
2984 " mul.wide.u32 %rd7, %r3, 4;\n"
2985 " add.u64 %rd8, %rd5, %rd7;\n"
2986 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
2989 " mov.u32 %r10, 31;\n"
2990 " setp.gt.u32 %p3, %r3, %r10;\n"
2991 " @%p3 bra $Lt_26_15874;\n"
2993 " ld.volatile.shared.f32 %f5, [%rd8+4];\n"
2994 " add.f32 %f6, %f5, %f1;\n"
2995 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
2998 " mov.u32 %r11, 0;\n"
2999 " setp.ne.u32 %p4, %r3, %r11;\n"
3000 " @%p4 bra $Lt_26_16386;\n"
3002 " ld.shared.f32 %f7, [__smem+0];\n"
3003 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_2_true_g_odata];\n"
3004 " cvt.u64.u32 %rd10, %r1;\n"
3005 " mul.wide.u32 %rd11, %r1, 4;\n"
3006 " add.u64 %rd12, %rd9, %rd11;\n"
3007 " st.global.f32 [%rd12+0], %f7;\n"
3011 "$LDWend_reduce_uchar_2_true:\n"
3012 " } // reduce_uchar_2_true\n"
3014 " .entry reduce_uchar_4_true (\n"
3015 " .param .u64 __cudaparm_reduce_uchar_4_true_g_idata,\n"
3016 " .param .u64 __cudaparm_reduce_uchar_4_true_g_odata,\n"
3017 " .param .u32 __cudaparm_reduce_uchar_4_true_n)\n"
3019 " .reg .u16 %rh<3>;\n"
3020 " .reg .u32 %r<13>;\n"
3021 " .reg .u64 %rd<14>;\n"
3022 " .reg .f32 %f<11>;\n"
3023 " .reg .pred %p<6>;\n"
3025 "$LDWbegin_reduce_uchar_4_true:\n"
3027 " cvt.u32.u16 %r1, %ctaid.x;\n"
3028 " mul24.lo.u32 %r2, %r1, 8;\n"
3029 " cvt.u32.u16 %r3, %tid.x;\n"
3030 " add.u32 %r4, %r2, %r3;\n"
3031 " mov.s32 %r5, %r4;\n"
3032 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_4_true_n];\n"
3033 " setp.ge.u32 %p1, %r4, %r6;\n"
3034 " @%p1 bra $Lt_27_16642;\n"
3035 " mov.u16 %rh1, %nctaid.x;\n"
3036 " mul.wide.u16 %r7, %rh1, 8;\n"
3037 " cvt.u64.u32 %rd1, %r4;\n"
3038 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_4_true_g_idata];\n"
3039 " add.u64 %rd3, %rd1, %rd2;\n"
3040 " cvt.s64.u32 %rd4, %r7;\n"
3041 " mov.f32 %f1, 0f00000000; // 0\n"
3043 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3045 " ld.global.u8 %r8, [%rd3+0];\n"
3046 " cvt.rn.f32.u32 %f2, %r8;\n"
3047 " add.f32 %f3, %f2, %f1;\n"
3049 " ld.global.u8 %r9, [%rd3+4];\n"
3050 " cvt.rn.f32.u32 %f4, %r9;\n"
3051 " add.f32 %f1, %f4, %f3;\n"
3052 " add.u32 %r5, %r7, %r5;\n"
3053 " add.u64 %rd3, %rd4, %rd3;\n"
3055 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_4_true_n];\n"
3057 " setp.lt.u32 %p2, %r5, %r6;\n"
3058 " @%p2 bra $Lt_27_15106;\n"
3059 " bra.uni $Lt_27_14594;\n"
3061 " mov.f32 %f1, 0f00000000; // 0\n"
3064 " mov.u64 %rd5, __smem;\n"
3065 " cvt.u64.u32 %rd6, %r3;\n"
3066 " mul.wide.u32 %rd7, %r3, 4;\n"
3067 " add.u64 %rd8, %rd5, %rd7;\n"
3068 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3071 " mov.u32 %r10, 31;\n"
3072 " setp.gt.u32 %p3, %r3, %r10;\n"
3073 " @%p3 bra $Lt_27_15618;\n"
3075 " ld.volatile.shared.f32 %f5, [%rd8+8];\n"
3076 " add.f32 %f6, %f5, %f1;\n"
3077 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3079 " ld.volatile.shared.f32 %f7, [%rd8+4];\n"
3080 " add.f32 %f8, %f7, %f6;\n"
3081 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3084 " mov.u32 %r11, 0;\n"
3085 " setp.ne.u32 %p4, %r3, %r11;\n"
3086 " @%p4 bra $Lt_27_16130;\n"
3088 " ld.shared.f32 %f9, [__smem+0];\n"
3089 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_4_true_g_odata];\n"
3090 " cvt.u64.u32 %rd10, %r1;\n"
3091 " mul.wide.u32 %rd11, %r1, 4;\n"
3092 " add.u64 %rd12, %rd9, %rd11;\n"
3093 " st.global.f32 [%rd12+0], %f9;\n"
3097 "$LDWend_reduce_uchar_4_true:\n"
3098 " } // reduce_uchar_4_true\n"
3100 " .entry reduce_uchar_8_true (\n"
3101 " .param .u64 __cudaparm_reduce_uchar_8_true_g_idata,\n"
3102 " .param .u64 __cudaparm_reduce_uchar_8_true_g_odata,\n"
3103 " .param .u32 __cudaparm_reduce_uchar_8_true_n)\n"
3105 " .reg .u16 %rh<3>;\n"
3106 " .reg .u32 %r<13>;\n"
3107 " .reg .u64 %rd<14>;\n"
3108 " .reg .f32 %f<13>;\n"
3109 " .reg .pred %p<6>;\n"
3111 "$LDWbegin_reduce_uchar_8_true:\n"
3113 " cvt.u32.u16 %r1, %ctaid.x;\n"
3114 " mul24.lo.u32 %r2, %r1, 16;\n"
3115 " cvt.u32.u16 %r3, %tid.x;\n"
3116 " add.u32 %r4, %r2, %r3;\n"
3117 " mov.s32 %r5, %r4;\n"
3118 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_8_true_n];\n"
3119 " setp.ge.u32 %p1, %r4, %r6;\n"
3120 " @%p1 bra $Lt_28_16386;\n"
3121 " mov.u16 %rh1, %nctaid.x;\n"
3122 " mul.wide.u16 %r7, %rh1, 16;\n"
3123 " cvt.u64.u32 %rd1, %r4;\n"
3124 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_8_true_g_idata];\n"
3125 " add.u64 %rd3, %rd1, %rd2;\n"
3126 " cvt.s64.u32 %rd4, %r7;\n"
3127 " mov.f32 %f1, 0f00000000; // 0\n"
3129 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3131 " ld.global.u8 %r8, [%rd3+0];\n"
3132 " cvt.rn.f32.u32 %f2, %r8;\n"
3133 " add.f32 %f3, %f2, %f1;\n"
3135 " ld.global.u8 %r9, [%rd3+8];\n"
3136 " cvt.rn.f32.u32 %f4, %r9;\n"
3137 " add.f32 %f1, %f4, %f3;\n"
3138 " add.u32 %r5, %r7, %r5;\n"
3139 " add.u64 %rd3, %rd4, %rd3;\n"
3141 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_8_true_n];\n"
3143 " setp.lt.u32 %p2, %r5, %r6;\n"
3144 " @%p2 bra $Lt_28_14850;\n"
3145 " bra.uni $Lt_28_14338;\n"
3147 " mov.f32 %f1, 0f00000000; // 0\n"
3150 " mov.u64 %rd5, __smem;\n"
3151 " cvt.u64.u32 %rd6, %r3;\n"
3152 " mul.wide.u32 %rd7, %r3, 4;\n"
3153 " add.u64 %rd8, %rd5, %rd7;\n"
3154 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3157 " mov.u32 %r10, 31;\n"
3158 " setp.gt.u32 %p3, %r3, %r10;\n"
3159 " @%p3 bra $Lt_28_15362;\n"
3161 " ld.volatile.shared.f32 %f5, [%rd8+16];\n"
3162 " add.f32 %f6, %f5, %f1;\n"
3163 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3165 " ld.volatile.shared.f32 %f7, [%rd8+8];\n"
3166 " add.f32 %f8, %f7, %f6;\n"
3167 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3169 " ld.volatile.shared.f32 %f9, [%rd8+4];\n"
3170 " add.f32 %f10, %f9, %f8;\n"
3171 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3174 " mov.u32 %r11, 0;\n"
3175 " setp.ne.u32 %p4, %r3, %r11;\n"
3176 " @%p4 bra $Lt_28_15874;\n"
3178 " ld.shared.f32 %f11, [__smem+0];\n"
3179 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_8_true_g_odata];\n"
3180 " cvt.u64.u32 %rd10, %r1;\n"
3181 " mul.wide.u32 %rd11, %r1, 4;\n"
3182 " add.u64 %rd12, %rd9, %rd11;\n"
3183 " st.global.f32 [%rd12+0], %f11;\n"
3187 "$LDWend_reduce_uchar_8_true:\n"
3188 " } // reduce_uchar_8_true\n"
3190 " .entry reduce_uchar_16_true (\n"
3191 " .param .u64 __cudaparm_reduce_uchar_16_true_g_idata,\n"
3192 " .param .u64 __cudaparm_reduce_uchar_16_true_g_odata,\n"
3193 " .param .u32 __cudaparm_reduce_uchar_16_true_n)\n"
3195 " .reg .u16 %rh<3>;\n"
3196 " .reg .u32 %r<13>;\n"
3197 " .reg .u64 %rd<14>;\n"
3198 " .reg .f32 %f<15>;\n"
3199 " .reg .pred %p<6>;\n"
3201 "$LDWbegin_reduce_uchar_16_true:\n"
3203 " cvt.u32.u16 %r1, %ctaid.x;\n"
3204 " mul24.lo.u32 %r2, %r1, 32;\n"
3205 " cvt.u32.u16 %r3, %tid.x;\n"
3206 " add.u32 %r4, %r2, %r3;\n"
3207 " mov.s32 %r5, %r4;\n"
3208 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_16_true_n];\n"
3209 " setp.ge.u32 %p1, %r4, %r6;\n"
3210 " @%p1 bra $Lt_29_16130;\n"
3211 " mov.u16 %rh1, %nctaid.x;\n"
3212 " mul.wide.u16 %r7, %rh1, 32;\n"
3213 " cvt.u64.u32 %rd1, %r4;\n"
3214 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_16_true_g_idata];\n"
3215 " add.u64 %rd3, %rd1, %rd2;\n"
3216 " cvt.s64.u32 %rd4, %r7;\n"
3217 " mov.f32 %f1, 0f00000000; // 0\n"
3219 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3221 " ld.global.u8 %r8, [%rd3+0];\n"
3222 " cvt.rn.f32.u32 %f2, %r8;\n"
3223 " add.f32 %f3, %f2, %f1;\n"
3225 " ld.global.u8 %r9, [%rd3+16];\n"
3226 " cvt.rn.f32.u32 %f4, %r9;\n"
3227 " add.f32 %f1, %f4, %f3;\n"
3228 " add.u32 %r5, %r7, %r5;\n"
3229 " add.u64 %rd3, %rd4, %rd3;\n"
3231 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_16_true_n];\n"
3233 " setp.lt.u32 %p2, %r5, %r6;\n"
3234 " @%p2 bra $Lt_29_14594;\n"
3235 " bra.uni $Lt_29_14082;\n"
3237 " mov.f32 %f1, 0f00000000; // 0\n"
3240 " mov.u64 %rd5, __smem;\n"
3241 " cvt.u64.u32 %rd6, %r3;\n"
3242 " mul.wide.u32 %rd7, %r3, 4;\n"
3243 " add.u64 %rd8, %rd5, %rd7;\n"
3244 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3247 " mov.u32 %r10, 31;\n"
3248 " setp.gt.u32 %p3, %r3, %r10;\n"
3249 " @%p3 bra $Lt_29_15106;\n"
3251 " ld.volatile.shared.f32 %f5, [%rd8+32];\n"
3252 " add.f32 %f6, %f5, %f1;\n"
3253 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3255 " ld.volatile.shared.f32 %f7, [%rd8+16];\n"
3256 " add.f32 %f8, %f7, %f6;\n"
3257 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3259 " ld.volatile.shared.f32 %f9, [%rd8+8];\n"
3260 " add.f32 %f10, %f9, %f8;\n"
3261 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3263 " ld.volatile.shared.f32 %f11, [%rd8+4];\n"
3264 " add.f32 %f12, %f11, %f10;\n"
3265 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3268 " mov.u32 %r11, 0;\n"
3269 " setp.ne.u32 %p4, %r3, %r11;\n"
3270 " @%p4 bra $Lt_29_15618;\n"
3272 " ld.shared.f32 %f13, [__smem+0];\n"
3273 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_16_true_g_odata];\n"
3274 " cvt.u64.u32 %rd10, %r1;\n"
3275 " mul.wide.u32 %rd11, %r1, 4;\n"
3276 " add.u64 %rd12, %rd9, %rd11;\n"
3277 " st.global.f32 [%rd12+0], %f13;\n"
3281 "$LDWend_reduce_uchar_16_true:\n"
3282 " } // reduce_uchar_16_true\n"
3284 " .entry reduce_uchar_32_true (\n"
3285 " .param .u64 __cudaparm_reduce_uchar_32_true_g_idata,\n"
3286 " .param .u64 __cudaparm_reduce_uchar_32_true_g_odata,\n"
3287 " .param .u32 __cudaparm_reduce_uchar_32_true_n)\n"
3289 " .reg .u16 %rh<3>;\n"
3290 " .reg .u32 %r<13>;\n"
3291 " .reg .u64 %rd<14>;\n"
3292 " .reg .f32 %f<17>;\n"
3293 " .reg .pred %p<6>;\n"
3295 "$LDWbegin_reduce_uchar_32_true:\n"
3297 " cvt.u32.u16 %r1, %ctaid.x;\n"
3298 " mul24.lo.u32 %r2, %r1, 64;\n"
3299 " cvt.u32.u16 %r3, %tid.x;\n"
3300 " add.u32 %r4, %r2, %r3;\n"
3301 " mov.s32 %r5, %r4;\n"
3302 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_32_true_n];\n"
3303 " setp.ge.u32 %p1, %r4, %r6;\n"
3304 " @%p1 bra $Lt_30_15874;\n"
3305 " mov.u16 %rh1, %nctaid.x;\n"
3306 " mul.wide.u16 %r7, %rh1, 64;\n"
3307 " cvt.u64.u32 %rd1, %r4;\n"
3308 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_32_true_g_idata];\n"
3309 " add.u64 %rd3, %rd1, %rd2;\n"
3310 " cvt.s64.u32 %rd4, %r7;\n"
3311 " mov.f32 %f1, 0f00000000; // 0\n"
3313 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3315 " ld.global.u8 %r8, [%rd3+0];\n"
3316 " cvt.rn.f32.u32 %f2, %r8;\n"
3317 " add.f32 %f3, %f2, %f1;\n"
3319 " ld.global.u8 %r9, [%rd3+32];\n"
3320 " cvt.rn.f32.u32 %f4, %r9;\n"
3321 " add.f32 %f1, %f4, %f3;\n"
3322 " add.u32 %r5, %r7, %r5;\n"
3323 " add.u64 %rd3, %rd4, %rd3;\n"
3325 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_32_true_n];\n"
3327 " setp.lt.u32 %p2, %r5, %r6;\n"
3328 " @%p2 bra $Lt_30_14338;\n"
3329 " bra.uni $Lt_30_13826;\n"
3331 " mov.f32 %f1, 0f00000000; // 0\n"
3334 " mov.u64 %rd5, __smem;\n"
3335 " cvt.u64.u32 %rd6, %r3;\n"
3336 " mul.wide.u32 %rd7, %r3, 4;\n"
3337 " add.u64 %rd8, %rd5, %rd7;\n"
3338 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3341 " mov.u32 %r10, 31;\n"
3342 " setp.gt.u32 %p3, %r3, %r10;\n"
3343 " @%p3 bra $Lt_30_14850;\n"
3345 " ld.volatile.shared.f32 %f5, [%rd8+64];\n"
3346 " add.f32 %f6, %f5, %f1;\n"
3347 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3349 " ld.volatile.shared.f32 %f7, [%rd8+32];\n"
3350 " add.f32 %f8, %f7, %f6;\n"
3351 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3353 " ld.volatile.shared.f32 %f9, [%rd8+16];\n"
3354 " add.f32 %f10, %f9, %f8;\n"
3355 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3357 " ld.volatile.shared.f32 %f11, [%rd8+8];\n"
3358 " add.f32 %f12, %f11, %f10;\n"
3359 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3361 " ld.volatile.shared.f32 %f13, [%rd8+4];\n"
3362 " add.f32 %f14, %f13, %f12;\n"
3363 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
3366 " mov.u32 %r11, 0;\n"
3367 " setp.ne.u32 %p4, %r3, %r11;\n"
3368 " @%p4 bra $Lt_30_15362;\n"
3370 " ld.shared.f32 %f15, [__smem+0];\n"
3371 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_32_true_g_odata];\n"
3372 " cvt.u64.u32 %rd10, %r1;\n"
3373 " mul.wide.u32 %rd11, %r1, 4;\n"
3374 " add.u64 %rd12, %rd9, %rd11;\n"
3375 " st.global.f32 [%rd12+0], %f15;\n"
3379 "$LDWend_reduce_uchar_32_true:\n"
3380 " } // reduce_uchar_32_true\n"
3382 " .entry reduce_uchar_64_true (\n"
3383 " .param .u64 __cudaparm_reduce_uchar_64_true_g_idata,\n"
3384 " .param .u64 __cudaparm_reduce_uchar_64_true_g_odata,\n"
3385 " .param .u32 __cudaparm_reduce_uchar_64_true_n)\n"
3387 " .reg .u16 %rh<3>;\n"
3388 " .reg .u32 %r<13>;\n"
3389 " .reg .u64 %rd<14>;\n"
3390 " .reg .f32 %f<19>;\n"
3391 " .reg .pred %p<6>;\n"
3393 "$LDWbegin_reduce_uchar_64_true:\n"
3395 " cvt.u32.u16 %r1, %ctaid.x;\n"
3396 " mul24.lo.u32 %r2, %r1, 128;\n"
3397 " cvt.u32.u16 %r3, %tid.x;\n"
3398 " add.u32 %r4, %r2, %r3;\n"
3399 " mov.s32 %r5, %r4;\n"
3400 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_64_true_n];\n"
3401 " setp.ge.u32 %p1, %r4, %r6;\n"
3402 " @%p1 bra $Lt_31_15618;\n"
3403 " mov.u16 %rh1, %nctaid.x;\n"
3404 " mul.wide.u16 %r7, %rh1, 128;\n"
3405 " cvt.u64.u32 %rd1, %r4;\n"
3406 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_64_true_g_idata];\n"
3407 " add.u64 %rd3, %rd1, %rd2;\n"
3408 " cvt.s64.u32 %rd4, %r7;\n"
3409 " mov.f32 %f1, 0f00000000; // 0\n"
3411 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3413 " ld.global.u8 %r8, [%rd3+0];\n"
3414 " cvt.rn.f32.u32 %f2, %r8;\n"
3415 " add.f32 %f3, %f2, %f1;\n"
3417 " ld.global.u8 %r9, [%rd3+64];\n"
3418 " cvt.rn.f32.u32 %f4, %r9;\n"
3419 " add.f32 %f1, %f4, %f3;\n"
3420 " add.u32 %r5, %r7, %r5;\n"
3421 " add.u64 %rd3, %rd4, %rd3;\n"
3423 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_64_true_n];\n"
3425 " setp.lt.u32 %p2, %r5, %r6;\n"
3426 " @%p2 bra $Lt_31_14082;\n"
3427 " bra.uni $Lt_31_13570;\n"
3429 " mov.f32 %f1, 0f00000000; // 0\n"
3432 " mov.u64 %rd5, __smem;\n"
3433 " cvt.u64.u32 %rd6, %r3;\n"
3434 " mul.wide.u32 %rd7, %r3, 4;\n"
3435 " add.u64 %rd8, %rd5, %rd7;\n"
3436 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3439 " mov.u32 %r10, 31;\n"
3440 " setp.gt.u32 %p3, %r3, %r10;\n"
3441 " @%p3 bra $Lt_31_14594;\n"
3443 " ld.volatile.shared.f32 %f5, [%rd8+128];\n"
3444 " add.f32 %f6, %f5, %f1;\n"
3445 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3447 " ld.volatile.shared.f32 %f7, [%rd8+64];\n"
3448 " add.f32 %f8, %f7, %f6;\n"
3449 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3451 " ld.volatile.shared.f32 %f9, [%rd8+32];\n"
3452 " add.f32 %f10, %f9, %f8;\n"
3453 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3455 " ld.volatile.shared.f32 %f11, [%rd8+16];\n"
3456 " add.f32 %f12, %f11, %f10;\n"
3457 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3459 " ld.volatile.shared.f32 %f13, [%rd8+8];\n"
3460 " add.f32 %f14, %f13, %f12;\n"
3461 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
3463 " ld.volatile.shared.f32 %f15, [%rd8+4];\n"
3464 " add.f32 %f16, %f15, %f14;\n"
3465 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
3468 " mov.u32 %r11, 0;\n"
3469 " setp.ne.u32 %p4, %r3, %r11;\n"
3470 " @%p4 bra $Lt_31_15106;\n"
3472 " ld.shared.f32 %f17, [__smem+0];\n"
3473 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_64_true_g_odata];\n"
3474 " cvt.u64.u32 %rd10, %r1;\n"
3475 " mul.wide.u32 %rd11, %r1, 4;\n"
3476 " add.u64 %rd12, %rd9, %rd11;\n"
3477 " st.global.f32 [%rd12+0], %f17;\n"
3481 "$LDWend_reduce_uchar_64_true:\n"
3482 " } // reduce_uchar_64_true\n"
3484 " .entry reduce_uchar_128_true (\n"
3485 " .param .u64 __cudaparm_reduce_uchar_128_true_g_idata,\n"
3486 " .param .u64 __cudaparm_reduce_uchar_128_true_g_odata,\n"
3487 " .param .u32 __cudaparm_reduce_uchar_128_true_n)\n"
3489 " .reg .u16 %rh<3>;\n"
3490 " .reg .u32 %r<14>;\n"
3491 " .reg .u64 %rd<14>;\n"
3492 " .reg .f32 %f<21>;\n"
3493 " .reg .pred %p<7>;\n"
3495 "$LDWbegin_reduce_uchar_128_true:\n"
3497 " cvt.u32.u16 %r1, %ctaid.x;\n"
3498 " mul.lo.u32 %r2, %r1, 256;\n"
3499 " cvt.u32.u16 %r3, %tid.x;\n"
3500 " add.u32 %r4, %r2, %r3;\n"
3501 " mov.s32 %r5, %r4;\n"
3502 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_128_true_n];\n"
3503 " setp.ge.u32 %p1, %r4, %r6;\n"
3504 " @%p1 bra $Lt_32_15874;\n"
3505 " mov.u16 %rh1, %nctaid.x;\n"
3506 " mul.wide.u16 %r7, %rh1, 256;\n"
3507 " cvt.u64.u32 %rd1, %r4;\n"
3508 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_128_true_g_idata];\n"
3509 " add.u64 %rd3, %rd1, %rd2;\n"
3510 " cvt.s64.u32 %rd4, %r7;\n"
3511 " mov.f32 %f1, 0f00000000; // 0\n"
3513 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3515 " ld.global.u8 %r8, [%rd3+0];\n"
3516 " cvt.rn.f32.u32 %f2, %r8;\n"
3517 " add.f32 %f3, %f2, %f1;\n"
3519 " ld.global.u8 %r9, [%rd3+128];\n"
3520 " cvt.rn.f32.u32 %f4, %r9;\n"
3521 " add.f32 %f1, %f4, %f3;\n"
3522 " add.u32 %r5, %r7, %r5;\n"
3523 " add.u64 %rd3, %rd4, %rd3;\n"
3525 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_128_true_n];\n"
3527 " setp.lt.u32 %p2, %r5, %r6;\n"
3528 " @%p2 bra $Lt_32_13826;\n"
3529 " bra.uni $Lt_32_13314;\n"
3531 " mov.f32 %f1, 0f00000000; // 0\n"
3534 " mov.f32 %f5, %f1;\n"
3535 " mov.f32 %f6, %f5;\n"
3537 " mov.u64 %rd5, __smem;\n"
3538 " cvt.u64.u32 %rd6, %r3;\n"
3539 " mul.wide.u32 %rd7, %r3, 4;\n"
3540 " add.u64 %rd8, %rd5, %rd7;\n"
3541 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
3544 " mov.u32 %r10, 63;\n"
3545 " setp.gt.u32 %p3, %r3, %r10;\n"
3546 " @%p3 bra $Lt_32_14338;\n"
3548 " ld.volatile.shared.f32 %f7, [%rd8+256];\n"
3549 " add.f32 %f6, %f7, %f5;\n"
3550 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3553 " mov.u32 %r11, 31;\n"
3554 " setp.gt.u32 %p4, %r3, %r11;\n"
3555 " @%p4 bra $Lt_32_14850;\n"
3557 " ld.volatile.shared.f32 %f8, [%rd8+128];\n"
3558 " add.f32 %f9, %f8, %f6;\n"
3559 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
3561 " ld.volatile.shared.f32 %f10, [%rd8+64];\n"
3562 " add.f32 %f11, %f10, %f9;\n"
3563 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
3565 " ld.volatile.shared.f32 %f12, [%rd8+32];\n"
3566 " add.f32 %f13, %f12, %f11;\n"
3567 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
3569 " ld.volatile.shared.f32 %f14, [%rd8+16];\n"
3570 " add.f32 %f15, %f14, %f13;\n"
3571 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
3573 " ld.volatile.shared.f32 %f16, [%rd8+8];\n"
3574 " add.f32 %f17, %f16, %f15;\n"
3575 " st.volatile.shared.f32 [%rd8+0], %f17;\n"
3577 " ld.volatile.shared.f32 %f18, [%rd8+4];\n"
3578 " add.f32 %f6, %f18, %f17;\n"
3579 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3582 " mov.u32 %r12, 0;\n"
3583 " setp.ne.u32 %p5, %r3, %r12;\n"
3584 " @%p5 bra $Lt_32_15362;\n"
3586 " ld.shared.f32 %f19, [__smem+0];\n"
3587 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_128_true_g_odata];\n"
3588 " cvt.u64.u32 %rd10, %r1;\n"
3589 " mul.wide.u32 %rd11, %r1, 4;\n"
3590 " add.u64 %rd12, %rd9, %rd11;\n"
3591 " st.global.f32 [%rd12+0], %f19;\n"
3595 "$LDWend_reduce_uchar_128_true:\n"
3596 " } // reduce_uchar_128_true\n"
3598 " .entry reduce_uchar_256_true (\n"
3599 " .param .u64 __cudaparm_reduce_uchar_256_true_g_idata,\n"
3600 " .param .u64 __cudaparm_reduce_uchar_256_true_g_odata,\n"
3601 " .param .u32 __cudaparm_reduce_uchar_256_true_n)\n"
3603 " .reg .u16 %rh<3>;\n"
3604 " .reg .u32 %r<15>;\n"
3605 " .reg .u64 %rd<14>;\n"
3606 " .reg .f32 %f<22>;\n"
3607 " .reg .pred %p<8>;\n"
3609 "$LDWbegin_reduce_uchar_256_true:\n"
3611 " cvt.u32.u16 %r1, %ctaid.x;\n"
3612 " mul.lo.u32 %r2, %r1, 512;\n"
3613 " cvt.u32.u16 %r3, %tid.x;\n"
3614 " add.u32 %r4, %r2, %r3;\n"
3615 " mov.s32 %r5, %r4;\n"
3616 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_256_true_n];\n"
3617 " setp.ge.u32 %p1, %r4, %r6;\n"
3618 " @%p1 bra $Lt_33_16130;\n"
3619 " mov.u16 %rh1, %nctaid.x;\n"
3620 " mul.wide.u16 %r7, %rh1, 512;\n"
3621 " cvt.u64.u32 %rd1, %r4;\n"
3622 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_256_true_g_idata];\n"
3623 " add.u64 %rd3, %rd1, %rd2;\n"
3624 " cvt.s64.u32 %rd4, %r7;\n"
3625 " mov.f32 %f1, 0f00000000; // 0\n"
3627 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3629 " ld.global.u8 %r8, [%rd3+0];\n"
3630 " cvt.rn.f32.u32 %f2, %r8;\n"
3631 " add.f32 %f3, %f2, %f1;\n"
3633 " ld.global.u8 %r9, [%rd3+256];\n"
3634 " cvt.rn.f32.u32 %f4, %r9;\n"
3635 " add.f32 %f1, %f4, %f3;\n"
3636 " add.u32 %r5, %r7, %r5;\n"
3637 " add.u64 %rd3, %rd4, %rd3;\n"
3639 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_256_true_n];\n"
3641 " setp.lt.u32 %p2, %r5, %r6;\n"
3642 " @%p2 bra $Lt_33_13570;\n"
3643 " bra.uni $Lt_33_13058;\n"
3645 " mov.f32 %f1, 0f00000000; // 0\n"
3648 " mov.f32 %f5, %f1;\n"
3649 " mov.f32 %f6, %f5;\n"
3651 " mov.u64 %rd5, __smem;\n"
3652 " cvt.u64.u32 %rd6, %r3;\n"
3653 " mul.wide.u32 %rd7, %r3, 4;\n"
3654 " add.u64 %rd8, %rd5, %rd7;\n"
3655 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
3658 " mov.u32 %r10, 127;\n"
3659 " setp.gt.u32 %p3, %r3, %r10;\n"
3660 " @%p3 bra $Lt_33_14082;\n"
3662 " ld.volatile.shared.f32 %f7, [%rd8+512];\n"
3663 " add.f32 %f6, %f7, %f5;\n"
3664 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3667 " mov.u32 %r11, 63;\n"
3668 " setp.gt.u32 %p4, %r3, %r11;\n"
3669 " @%p4 bra $Lt_33_14594;\n"
3671 " ld.volatile.shared.f32 %f8, [%rd8+256];\n"
3672 " add.f32 %f6, %f8, %f6;\n"
3673 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3676 " mov.u32 %r12, 31;\n"
3677 " setp.gt.u32 %p5, %r3, %r12;\n"
3678 " @%p5 bra $Lt_33_15106;\n"
3680 " ld.volatile.shared.f32 %f9, [%rd8+128];\n"
3681 " add.f32 %f10, %f9, %f6;\n"
3682 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3684 " ld.volatile.shared.f32 %f11, [%rd8+64];\n"
3685 " add.f32 %f12, %f11, %f10;\n"
3686 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3688 " ld.volatile.shared.f32 %f13, [%rd8+32];\n"
3689 " add.f32 %f14, %f13, %f12;\n"
3690 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
3692 " ld.volatile.shared.f32 %f15, [%rd8+16];\n"
3693 " add.f32 %f16, %f15, %f14;\n"
3694 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
3696 " ld.volatile.shared.f32 %f17, [%rd8+8];\n"
3697 " add.f32 %f18, %f17, %f16;\n"
3698 " st.volatile.shared.f32 [%rd8+0], %f18;\n"
3700 " ld.volatile.shared.f32 %f19, [%rd8+4];\n"
3701 " add.f32 %f6, %f19, %f18;\n"
3702 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3705 " mov.u32 %r13, 0;\n"
3706 " setp.ne.u32 %p6, %r3, %r13;\n"
3707 " @%p6 bra $Lt_33_15618;\n"
3709 " ld.shared.f32 %f20, [__smem+0];\n"
3710 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_256_true_g_odata];\n"
3711 " cvt.u64.u32 %rd10, %r1;\n"
3712 " mul.wide.u32 %rd11, %r1, 4;\n"
3713 " add.u64 %rd12, %rd9, %rd11;\n"
3714 " st.global.f32 [%rd12+0], %f20;\n"
3718 "$LDWend_reduce_uchar_256_true:\n"
3719 " } // reduce_uchar_256_true\n"
3721 " .entry reduce_uchar_512_true (\n"
3722 " .param .u64 __cudaparm_reduce_uchar_512_true_g_idata,\n"
3723 " .param .u64 __cudaparm_reduce_uchar_512_true_g_odata,\n"
3724 " .param .u32 __cudaparm_reduce_uchar_512_true_n)\n"
3726 " .reg .u16 %rh<3>;\n"
3727 " .reg .u32 %r<16>;\n"
3728 " .reg .u64 %rd<14>;\n"
3729 " .reg .f32 %f<23>;\n"
3730 " .reg .pred %p<9>;\n"
3732 "$LDWbegin_reduce_uchar_512_true:\n"
3734 " cvt.u32.u16 %r1, %ctaid.x;\n"
3735 " mul.lo.u32 %r2, %r1, 1024;\n"
3736 " cvt.u32.u16 %r3, %tid.x;\n"
3737 " add.u32 %r4, %r2, %r3;\n"
3738 " mov.s32 %r5, %r4;\n"
3739 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_512_true_n];\n"
3740 " setp.ge.u32 %p1, %r4, %r6;\n"
3741 " @%p1 bra $Lt_34_16386;\n"
3742 " mov.u16 %rh1, %nctaid.x;\n"
3743 " mul.wide.u16 %r7, %rh1, 1024;\n"
3744 " cvt.u64.u32 %rd1, %r4;\n"
3745 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_512_true_g_idata];\n"
3746 " add.u64 %rd3, %rd1, %rd2;\n"
3747 " cvt.s64.u32 %rd4, %r7;\n"
3748 " mov.f32 %f1, 0f00000000; // 0\n"
3750 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3752 " ld.global.u8 %r8, [%rd3+0];\n"
3753 " cvt.rn.f32.u32 %f2, %r8;\n"
3754 " add.f32 %f3, %f2, %f1;\n"
3756 " ld.global.u8 %r9, [%rd3+512];\n"
3757 " cvt.rn.f32.u32 %f4, %r9;\n"
3758 " add.f32 %f1, %f4, %f3;\n"
3759 " add.u32 %r5, %r7, %r5;\n"
3760 " add.u64 %rd3, %rd4, %rd3;\n"
3762 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_512_true_n];\n"
3764 " setp.lt.u32 %p2, %r5, %r6;\n"
3765 " @%p2 bra $Lt_34_13314;\n"
3766 " bra.uni $Lt_34_12802;\n"
3768 " mov.f32 %f1, 0f00000000; // 0\n"
3771 " mov.f32 %f5, %f1;\n"
3772 " mov.f32 %f6, %f5;\n"
3774 " mov.u64 %rd5, __smem;\n"
3775 " cvt.u64.u32 %rd6, %r3;\n"
3776 " mul.wide.u32 %rd7, %r3, 4;\n"
3777 " add.u64 %rd8, %rd5, %rd7;\n"
3778 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
3781 " mov.u32 %r10, 255;\n"
3782 " setp.gt.u32 %p3, %r3, %r10;\n"
3783 " @%p3 bra $Lt_34_13826;\n"
3785 " ld.volatile.shared.f32 %f7, [%rd8+1024];\n"
3786 " add.f32 %f6, %f7, %f5;\n"
3787 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3790 " mov.u32 %r11, 127;\n"
3791 " setp.gt.u32 %p4, %r3, %r11;\n"
3792 " @%p4 bra $Lt_34_14338;\n"
3794 " ld.volatile.shared.f32 %f8, [%rd8+512];\n"
3795 " add.f32 %f6, %f8, %f6;\n"
3796 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3799 " mov.u32 %r12, 63;\n"
3800 " setp.gt.u32 %p5, %r3, %r12;\n"
3801 " @%p5 bra $Lt_34_14850;\n"
3803 " ld.volatile.shared.f32 %f9, [%rd8+256];\n"
3804 " add.f32 %f6, %f9, %f6;\n"
3805 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3808 " mov.u32 %r13, 31;\n"
3809 " setp.gt.u32 %p6, %r3, %r13;\n"
3810 " @%p6 bra $Lt_34_15362;\n"
3812 " ld.volatile.shared.f32 %f10, [%rd8+128];\n"
3813 " add.f32 %f11, %f10, %f6;\n"
3814 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
3816 " ld.volatile.shared.f32 %f12, [%rd8+64];\n"
3817 " add.f32 %f13, %f12, %f11;\n"
3818 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
3820 " ld.volatile.shared.f32 %f14, [%rd8+32];\n"
3821 " add.f32 %f15, %f14, %f13;\n"
3822 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
3824 " ld.volatile.shared.f32 %f16, [%rd8+16];\n"
3825 " add.f32 %f17, %f16, %f15;\n"
3826 " st.volatile.shared.f32 [%rd8+0], %f17;\n"
3828 " ld.volatile.shared.f32 %f18, [%rd8+8];\n"
3829 " add.f32 %f19, %f18, %f17;\n"
3830 " st.volatile.shared.f32 [%rd8+0], %f19;\n"
3832 " ld.volatile.shared.f32 %f20, [%rd8+4];\n"
3833 " add.f32 %f6, %f20, %f19;\n"
3834 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3837 " mov.u32 %r14, 0;\n"
3838 " setp.ne.u32 %p7, %r3, %r14;\n"
3839 " @%p7 bra $Lt_34_15874;\n"
3841 " ld.shared.f32 %f21, [__smem+0];\n"
3842 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_512_true_g_odata];\n"
3843 " cvt.u64.u32 %rd10, %r1;\n"
3844 " mul.wide.u32 %rd11, %r1, 4;\n"
3845 " add.u64 %rd12, %rd9, %rd11;\n"
3846 " st.global.f32 [%rd12+0], %f21;\n"
3850 "$LDWend_reduce_uchar_512_true:\n"
3851 " } // reduce_uchar_512_true\n"
3853 " .entry reduce_uchar_1_false (\n"
3854 " .param .u64 __cudaparm_reduce_uchar_1_false_g_idata,\n"
3855 " .param .u64 __cudaparm_reduce_uchar_1_false_g_odata,\n"
3856 " .param .u32 __cudaparm_reduce_uchar_1_false_n)\n"
3858 " .reg .u16 %rh<3>;\n"
3859 " .reg .u32 %r<13>;\n"
3860 " .reg .u64 %rd<14>;\n"
3861 " .reg .f32 %f<6>;\n"
3862 " .reg .pred %p<6>;\n"
3864 "$LDWbegin_reduce_uchar_1_false:\n"
3866 " cvt.u32.u16 %r1, %ctaid.x;\n"
3867 " mul24.lo.u32 %r2, %r1, 2;\n"
3868 " cvt.u32.u16 %r3, %tid.x;\n"
3869 " add.u32 %r4, %r2, %r3;\n"
3870 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_1_false_n];\n"
3871 " setp.ge.u32 %p1, %r4, %r5;\n"
3872 " @%p1 bra $Lt_35_17154;\n"
3873 " add.u32 %r6, %r4, 1;\n"
3874 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_1_false_n];\n"
3875 " add.u32 %r7, %r5, 1;\n"
3876 " mov.u16 %rh1, %nctaid.x;\n"
3877 " mul.wide.u16 %r8, %rh1, 2;\n"
3878 " cvt.u64.u32 %rd1, %r4;\n"
3879 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_1_false_g_idata];\n"
3880 " add.u64 %rd3, %rd1, %rd2;\n"
3881 " cvt.s64.u32 %rd4, %r8;\n"
3882 " mov.f32 %f1, 0f00000000; // 0\n"
3884 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3886 " ld.global.u8 %r9, [%rd3+0];\n"
3887 " cvt.rn.f32.u32 %f2, %r9;\n"
3888 " add.f32 %f1, %f2, %f1;\n"
3890 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_1_false_n];\n"
3892 " setp.ge.u32 %p2, %r6, %r5;\n"
3893 " @%p2 bra $Lt_35_15874;\n"
3894 " //<loop> Part of loop body line 181, head labeled $Lt_35_15618\n"
3896 " ld.global.u8 %r10, [%rd3+1];\n"
3897 " cvt.rn.f32.u32 %f3, %r10;\n"
3898 " add.f32 %f1, %f3, %f1;\n"
3900 " //<loop> Part of loop body line 181, head labeled $Lt_35_15618\n"
3901 " add.u32 %r6, %r6, %r8;\n"
3902 " add.u64 %rd3, %rd4, %rd3;\n"
3903 " setp.lt.u32 %p3, %r6, %r7;\n"
3904 " @%p3 bra $Lt_35_15618;\n"
3905 " bra.uni $Lt_35_15106;\n"
3907 " mov.f32 %f1, 0f00000000; // 0\n"
3910 " mov.u64 %rd5, __smem;\n"
3911 " cvt.u64.u32 %rd6, %r3;\n"
3912 " mul.wide.u32 %rd7, %r3, 4;\n"
3913 " add.u64 %rd8, %rd5, %rd7;\n"
3914 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3918 " mov.u32 %r11, 0;\n"
3919 " setp.ne.u32 %p4, %r3, %r11;\n"
3920 " @%p4 bra $Lt_35_16642;\n"
3922 " ld.shared.f32 %f4, [__smem+0];\n"
3923 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_1_false_g_odata];\n"
3924 " cvt.u64.u32 %rd10, %r1;\n"
3925 " mul.wide.u32 %rd11, %r1, 4;\n"
3926 " add.u64 %rd12, %rd9, %rd11;\n"
3927 " st.global.f32 [%rd12+0], %f4;\n"
3931 "$LDWend_reduce_uchar_1_false:\n"
3932 " } // reduce_uchar_1_false\n"
3934 " .entry reduce_uchar_2_false (\n"
3935 " .param .u64 __cudaparm_reduce_uchar_2_false_g_idata,\n"
3936 " .param .u64 __cudaparm_reduce_uchar_2_false_g_odata,\n"
3937 " .param .u32 __cudaparm_reduce_uchar_2_false_n)\n"
3939 " .reg .u16 %rh<3>;\n"
3940 " .reg .u32 %r<14>;\n"
3941 " .reg .u64 %rd<14>;\n"
3942 " .reg .f32 %f<8>;\n"
3943 " .reg .pred %p<7>;\n"
3945 "$LDWbegin_reduce_uchar_2_false:\n"
3947 " cvt.u32.u16 %r1, %ctaid.x;\n"
3948 " mul24.lo.u32 %r2, %r1, 4;\n"
3949 " cvt.u32.u16 %r3, %tid.x;\n"
3950 " add.u32 %r4, %r2, %r3;\n"
3951 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_2_false_n];\n"
3952 " setp.ge.u32 %p1, %r4, %r5;\n"
3953 " @%p1 bra $Lt_36_17410;\n"
3954 " add.u32 %r6, %r4, 2;\n"
3955 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_2_false_n];\n"
3956 " add.u32 %r7, %r5, 2;\n"
3957 " mov.u16 %rh1, %nctaid.x;\n"
3958 " mul.wide.u16 %r8, %rh1, 4;\n"
3959 " cvt.u64.u32 %rd1, %r4;\n"
3960 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_2_false_g_idata];\n"
3961 " add.u64 %rd3, %rd1, %rd2;\n"
3962 " cvt.s64.u32 %rd4, %r8;\n"
3963 " mov.f32 %f1, 0f00000000; // 0\n"
3965 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3967 " ld.global.u8 %r9, [%rd3+0];\n"
3968 " cvt.rn.f32.u32 %f2, %r9;\n"
3969 " add.f32 %f1, %f2, %f1;\n"
3971 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_2_false_n];\n"
3973 " setp.ge.u32 %p2, %r6, %r5;\n"
3974 " @%p2 bra $Lt_36_15618;\n"
3975 " //<loop> Part of loop body line 181, head labeled $Lt_36_15362\n"
3977 " ld.global.u8 %r10, [%rd3+2];\n"
3978 " cvt.rn.f32.u32 %f3, %r10;\n"
3979 " add.f32 %f1, %f3, %f1;\n"
3981 " //<loop> Part of loop body line 181, head labeled $Lt_36_15362\n"
3982 " add.u32 %r6, %r6, %r8;\n"
3983 " add.u64 %rd3, %rd4, %rd3;\n"
3984 " setp.lt.u32 %p3, %r6, %r7;\n"
3985 " @%p3 bra $Lt_36_15362;\n"
3986 " bra.uni $Lt_36_14850;\n"
3988 " mov.f32 %f1, 0f00000000; // 0\n"
3991 " mov.u64 %rd5, __smem;\n"
3992 " cvt.u64.u32 %rd6, %r3;\n"
3993 " mul.wide.u32 %rd7, %r3, 4;\n"
3994 " add.u64 %rd8, %rd5, %rd7;\n"
3995 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3998 " mov.u32 %r11, 31;\n"
3999 " setp.gt.u32 %p4, %r3, %r11;\n"
4000 " @%p4 bra $Lt_36_16386;\n"
4002 " ld.volatile.shared.f32 %f4, [%rd8+4];\n"
4003 " add.f32 %f5, %f4, %f1;\n"
4004 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4007 " mov.u32 %r12, 0;\n"
4008 " setp.ne.u32 %p5, %r3, %r12;\n"
4009 " @%p5 bra $Lt_36_16898;\n"
4011 " ld.shared.f32 %f6, [__smem+0];\n"
4012 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_2_false_g_odata];\n"
4013 " cvt.u64.u32 %rd10, %r1;\n"
4014 " mul.wide.u32 %rd11, %r1, 4;\n"
4015 " add.u64 %rd12, %rd9, %rd11;\n"
4016 " st.global.f32 [%rd12+0], %f6;\n"
4020 "$LDWend_reduce_uchar_2_false:\n"
4021 " } // reduce_uchar_2_false\n"
4023 " .entry reduce_uchar_4_false (\n"
4024 " .param .u64 __cudaparm_reduce_uchar_4_false_g_idata,\n"
4025 " .param .u64 __cudaparm_reduce_uchar_4_false_g_odata,\n"
4026 " .param .u32 __cudaparm_reduce_uchar_4_false_n)\n"
4028 " .reg .u16 %rh<3>;\n"
4029 " .reg .u32 %r<14>;\n"
4030 " .reg .u64 %rd<14>;\n"
4031 " .reg .f32 %f<10>;\n"
4032 " .reg .pred %p<7>;\n"
4034 "$LDWbegin_reduce_uchar_4_false:\n"
4036 " cvt.u32.u16 %r1, %ctaid.x;\n"
4037 " mul24.lo.u32 %r2, %r1, 8;\n"
4038 " cvt.u32.u16 %r3, %tid.x;\n"
4039 " add.u32 %r4, %r2, %r3;\n"
4040 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_4_false_n];\n"
4041 " setp.ge.u32 %p1, %r4, %r5;\n"
4042 " @%p1 bra $Lt_37_17154;\n"
4043 " add.u32 %r6, %r4, 4;\n"
4044 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_4_false_n];\n"
4045 " add.u32 %r7, %r5, 4;\n"
4046 " mov.u16 %rh1, %nctaid.x;\n"
4047 " mul.wide.u16 %r8, %rh1, 8;\n"
4048 " cvt.u64.u32 %rd1, %r4;\n"
4049 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_4_false_g_idata];\n"
4050 " add.u64 %rd3, %rd1, %rd2;\n"
4051 " cvt.s64.u32 %rd4, %r8;\n"
4052 " mov.f32 %f1, 0f00000000; // 0\n"
4054 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4056 " ld.global.u8 %r9, [%rd3+0];\n"
4057 " cvt.rn.f32.u32 %f2, %r9;\n"
4058 " add.f32 %f1, %f2, %f1;\n"
4060 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_4_false_n];\n"
4062 " setp.ge.u32 %p2, %r6, %r5;\n"
4063 " @%p2 bra $Lt_37_15362;\n"
4064 " //<loop> Part of loop body line 181, head labeled $Lt_37_15106\n"
4066 " ld.global.u8 %r10, [%rd3+4];\n"
4067 " cvt.rn.f32.u32 %f3, %r10;\n"
4068 " add.f32 %f1, %f3, %f1;\n"
4070 " //<loop> Part of loop body line 181, head labeled $Lt_37_15106\n"
4071 " add.u32 %r6, %r6, %r8;\n"
4072 " add.u64 %rd3, %rd4, %rd3;\n"
4073 " setp.lt.u32 %p3, %r6, %r7;\n"
4074 " @%p3 bra $Lt_37_15106;\n"
4075 " bra.uni $Lt_37_14594;\n"
4077 " mov.f32 %f1, 0f00000000; // 0\n"
4080 " mov.u64 %rd5, __smem;\n"
4081 " cvt.u64.u32 %rd6, %r3;\n"
4082 " mul.wide.u32 %rd7, %r3, 4;\n"
4083 " add.u64 %rd8, %rd5, %rd7;\n"
4084 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4087 " mov.u32 %r11, 31;\n"
4088 " setp.gt.u32 %p4, %r3, %r11;\n"
4089 " @%p4 bra $Lt_37_16130;\n"
4091 " ld.volatile.shared.f32 %f4, [%rd8+8];\n"
4092 " add.f32 %f5, %f4, %f1;\n"
4093 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4095 " ld.volatile.shared.f32 %f6, [%rd8+4];\n"
4096 " add.f32 %f7, %f6, %f5;\n"
4097 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4100 " mov.u32 %r12, 0;\n"
4101 " setp.ne.u32 %p5, %r3, %r12;\n"
4102 " @%p5 bra $Lt_37_16642;\n"
4104 " ld.shared.f32 %f8, [__smem+0];\n"
4105 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_4_false_g_odata];\n"
4106 " cvt.u64.u32 %rd10, %r1;\n"
4107 " mul.wide.u32 %rd11, %r1, 4;\n"
4108 " add.u64 %rd12, %rd9, %rd11;\n"
4109 " st.global.f32 [%rd12+0], %f8;\n"
4113 "$LDWend_reduce_uchar_4_false:\n"
4114 " } // reduce_uchar_4_false\n"
4116 " .entry reduce_uchar_8_false (\n"
4117 " .param .u64 __cudaparm_reduce_uchar_8_false_g_idata,\n"
4118 " .param .u64 __cudaparm_reduce_uchar_8_false_g_odata,\n"
4119 " .param .u32 __cudaparm_reduce_uchar_8_false_n)\n"
4121 " .reg .u16 %rh<3>;\n"
4122 " .reg .u32 %r<14>;\n"
4123 " .reg .u64 %rd<14>;\n"
4124 " .reg .f32 %f<12>;\n"
4125 " .reg .pred %p<7>;\n"
4127 "$LDWbegin_reduce_uchar_8_false:\n"
4129 " cvt.u32.u16 %r1, %ctaid.x;\n"
4130 " mul24.lo.u32 %r2, %r1, 16;\n"
4131 " cvt.u32.u16 %r3, %tid.x;\n"
4132 " add.u32 %r4, %r2, %r3;\n"
4133 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_8_false_n];\n"
4134 " setp.ge.u32 %p1, %r4, %r5;\n"
4135 " @%p1 bra $Lt_38_16898;\n"
4136 " add.u32 %r6, %r4, 8;\n"
4137 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_8_false_n];\n"
4138 " add.u32 %r7, %r5, 8;\n"
4139 " mov.u16 %rh1, %nctaid.x;\n"
4140 " mul.wide.u16 %r8, %rh1, 16;\n"
4141 " cvt.u64.u32 %rd1, %r4;\n"
4142 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_8_false_g_idata];\n"
4143 " add.u64 %rd3, %rd1, %rd2;\n"
4144 " cvt.s64.u32 %rd4, %r8;\n"
4145 " mov.f32 %f1, 0f00000000; // 0\n"
4147 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4149 " ld.global.u8 %r9, [%rd3+0];\n"
4150 " cvt.rn.f32.u32 %f2, %r9;\n"
4151 " add.f32 %f1, %f2, %f1;\n"
4153 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_8_false_n];\n"
4155 " setp.ge.u32 %p2, %r6, %r5;\n"
4156 " @%p2 bra $Lt_38_15106;\n"
4157 " //<loop> Part of loop body line 181, head labeled $Lt_38_14850\n"
4159 " ld.global.u8 %r10, [%rd3+8];\n"
4160 " cvt.rn.f32.u32 %f3, %r10;\n"
4161 " add.f32 %f1, %f3, %f1;\n"
4163 " //<loop> Part of loop body line 181, head labeled $Lt_38_14850\n"
4164 " add.u32 %r6, %r6, %r8;\n"
4165 " add.u64 %rd3, %rd4, %rd3;\n"
4166 " setp.lt.u32 %p3, %r6, %r7;\n"
4167 " @%p3 bra $Lt_38_14850;\n"
4168 " bra.uni $Lt_38_14338;\n"
4170 " mov.f32 %f1, 0f00000000; // 0\n"
4173 " mov.u64 %rd5, __smem;\n"
4174 " cvt.u64.u32 %rd6, %r3;\n"
4175 " mul.wide.u32 %rd7, %r3, 4;\n"
4176 " add.u64 %rd8, %rd5, %rd7;\n"
4177 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4180 " mov.u32 %r11, 31;\n"
4181 " setp.gt.u32 %p4, %r3, %r11;\n"
4182 " @%p4 bra $Lt_38_15874;\n"
4184 " ld.volatile.shared.f32 %f4, [%rd8+16];\n"
4185 " add.f32 %f5, %f4, %f1;\n"
4186 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4188 " ld.volatile.shared.f32 %f6, [%rd8+8];\n"
4189 " add.f32 %f7, %f6, %f5;\n"
4190 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4192 " ld.volatile.shared.f32 %f8, [%rd8+4];\n"
4193 " add.f32 %f9, %f8, %f7;\n"
4194 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4197 " mov.u32 %r12, 0;\n"
4198 " setp.ne.u32 %p5, %r3, %r12;\n"
4199 " @%p5 bra $Lt_38_16386;\n"
4201 " ld.shared.f32 %f10, [__smem+0];\n"
4202 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_8_false_g_odata];\n"
4203 " cvt.u64.u32 %rd10, %r1;\n"
4204 " mul.wide.u32 %rd11, %r1, 4;\n"
4205 " add.u64 %rd12, %rd9, %rd11;\n"
4206 " st.global.f32 [%rd12+0], %f10;\n"
4210 "$LDWend_reduce_uchar_8_false:\n"
4211 " } // reduce_uchar_8_false\n"
// PTX entry reduce_uchar_16_false (compiler output stored in the reduce_ptx string).
// Params: g_idata (u64 address, loaded as u8), g_odata (u64 address, stored as f32), n (u32).
// Visible behaviour: each thread accumulates g_idata bytes, converted to f32, starting at
// index ctaid.x*32 + tid.x and stepping by nctaid.x*32, also taking index+16 when that is < n;
// the sum is stored to __smem[tid.x], threads with tid.x <= 31 then add __smem[tid.x+8],
// [tid.x+4], [tid.x+2], [tid.x+1] in turn, and thread 0 stores __smem[0] to g_odata[ctaid.x].
// NOTE(review): the numbers embedded in this listing are not contiguous (e.g. 4217 and 4243 are
// absent), so branch-target labels and possibly other statements are not visible here; the
// control-flow notes below are inferred from the visible instructions - confirm against reduce.cu.
4213 " .entry reduce_uchar_16_false (\n"
4214 " .param .u64 __cudaparm_reduce_uchar_16_false_g_idata,\n"
4215 " .param .u64 __cudaparm_reduce_uchar_16_false_g_odata,\n"
4216 " .param .u32 __cudaparm_reduce_uchar_16_false_n)\n"
4218 " .reg .u16 %rh<3>;\n"
4219 " .reg .u32 %r<14>;\n"
4220 " .reg .u64 %rd<14>;\n"
4221 " .reg .f32 %f<14>;\n"
4222 " .reg .pred %p<7>;\n"
4224 "$LDWbegin_reduce_uchar_16_false:\n"
// r4 = ctaid.x * 32 + tid.x (first element index); branch away when r4 >= n.
4226 " cvt.u32.u16 %r1, %ctaid.x;\n"
4227 " mul24.lo.u32 %r2, %r1, 32;\n"
4228 " cvt.u32.u16 %r3, %tid.x;\n"
4229 " add.u32 %r4, %r2, %r3;\n"
4230 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_16_false_n];\n"
4231 " setp.ge.u32 %p1, %r4, %r5;\n"
4232 " @%p1 bra $Lt_39_16642;\n"
// r6 = r4 + 16 (second element index), r7 = n + 16 (bound compared against r6),
// r8 = nctaid.x * 32 (stride), rd3 = g_idata + r4, f1 = 0 (accumulator).
4233 " add.u32 %r6, %r4, 16;\n"
4234 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_16_false_n];\n"
4235 " add.u32 %r7, %r5, 16;\n"
4236 " mov.u16 %rh1, %nctaid.x;\n"
4237 " mul.wide.u16 %r8, %rh1, 32;\n"
4238 " cvt.u64.u32 %rd1, %r4;\n"
4239 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_16_false_g_idata];\n"
4240 " add.u64 %rd3, %rd1, %rd2;\n"
4241 " cvt.s64.u32 %rd4, %r8;\n"
4242 " mov.f32 %f1, 0f00000000; // 0\n"
// Loop: f1 += (f32) byte at rd3; when r6 < n also f1 += (f32) byte at rd3+16;
// then r6 and rd3 advance by the stride and the loop repeats while r6 < r7.
4244 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4246 " ld.global.u8 %r9, [%rd3+0];\n"
4247 " cvt.rn.f32.u32 %f2, %r9;\n"
4248 " add.f32 %f1, %f2, %f1;\n"
4250 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_16_false_n];\n"
4252 " setp.ge.u32 %p2, %r6, %r5;\n"
4253 " @%p2 bra $Lt_39_14850;\n"
4254 " //<loop> Part of loop body line 181, head labeled $Lt_39_14594\n"
4256 " ld.global.u8 %r10, [%rd3+16];\n"
4257 " cvt.rn.f32.u32 %f3, %r10;\n"
4258 " add.f32 %f1, %f3, %f1;\n"
4260 " //<loop> Part of loop body line 181, head labeled $Lt_39_14594\n"
4261 " add.u32 %r6, %r6, %r8;\n"
4262 " add.u64 %rd3, %rd4, %rd3;\n"
4263 " setp.lt.u32 %p3, %r6, %r7;\n"
4264 " @%p3 bra $Lt_39_14594;\n"
4265 " bra.uni $Lt_39_14082;\n"
// f1 = 0 - presumably the path reached when r4 >= n (its label is not visible here).
4267 " mov.f32 %f1, 0f00000000; // 0\n"
// rd8 = __smem + tid.x * 4; store this thread's partial sum there.
4270 " mov.u64 %rd5, __smem;\n"
4271 " cvt.u64.u32 %rd6, %r3;\n"
4272 " mul.wide.u32 %rd7, %r3, 4;\n"
4273 " add.u64 %rd8, %rd5, %rd7;\n"
4274 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
// Threads with tid.x > 31 branch past the unrolled volatile shared-memory adds.
4277 " mov.u32 %r11, 31;\n"
4278 " setp.gt.u32 %p4, %r3, %r11;\n"
4279 " @%p4 bra $Lt_39_15618;\n"
// Each step: load a neighbour slot, add it to the running value, write back to own slot.
4281 " ld.volatile.shared.f32 %f4, [%rd8+32];\n"
4282 " add.f32 %f5, %f4, %f1;\n"
4283 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4285 " ld.volatile.shared.f32 %f6, [%rd8+16];\n"
4286 " add.f32 %f7, %f6, %f5;\n"
4287 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4289 " ld.volatile.shared.f32 %f8, [%rd8+8];\n"
4290 " add.f32 %f9, %f8, %f7;\n"
4291 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4293 " ld.volatile.shared.f32 %f10, [%rd8+4];\n"
4294 " add.f32 %f11, %f10, %f9;\n"
4295 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
// Only tid.x == 0 continues: copy __smem[0] to g_odata + ctaid.x * 4.
4298 " mov.u32 %r12, 0;\n"
4299 " setp.ne.u32 %p5, %r3, %r12;\n"
4300 " @%p5 bra $Lt_39_16130;\n"
4302 " ld.shared.f32 %f12, [__smem+0];\n"
4303 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_16_false_g_odata];\n"
4304 " cvt.u64.u32 %rd10, %r1;\n"
4305 " mul.wide.u32 %rd11, %r1, 4;\n"
4306 " add.u64 %rd12, %rd9, %rd11;\n"
4307 " st.global.f32 [%rd12+0], %f12;\n"
4311 "$LDWend_reduce_uchar_16_false:\n"
4312 " } // reduce_uchar_16_false\n"
// PTX entry reduce_uchar_32_false (compiler output stored in the reduce_ptx string).
// Params: g_idata (u64 address, loaded as u8), g_odata (u64 address, stored as f32), n (u32).
// Visible behaviour: each thread accumulates g_idata bytes, converted to f32, starting at
// index ctaid.x*64 + tid.x and stepping by nctaid.x*64, also taking index+32 when that is < n;
// the sum is stored to __smem[tid.x], threads with tid.x <= 31 then add __smem[tid.x+16],
// [tid.x+8], [tid.x+4], [tid.x+2], [tid.x+1] in turn, and thread 0 stores __smem[0] to
// g_odata[ctaid.x].
// NOTE(review): the numbers embedded in this listing are not contiguous (e.g. 4318 and 4344 are
// absent), so branch-target labels and possibly other statements are not visible here; the
// control-flow notes below are inferred from the visible instructions - confirm against reduce.cu.
4314 " .entry reduce_uchar_32_false (\n"
4315 " .param .u64 __cudaparm_reduce_uchar_32_false_g_idata,\n"
4316 " .param .u64 __cudaparm_reduce_uchar_32_false_g_odata,\n"
4317 " .param .u32 __cudaparm_reduce_uchar_32_false_n)\n"
4319 " .reg .u16 %rh<3>;\n"
4320 " .reg .u32 %r<14>;\n"
4321 " .reg .u64 %rd<14>;\n"
4322 " .reg .f32 %f<16>;\n"
4323 " .reg .pred %p<7>;\n"
4325 "$LDWbegin_reduce_uchar_32_false:\n"
// r4 = ctaid.x * 64 + tid.x (first element index); branch away when r4 >= n.
4327 " cvt.u32.u16 %r1, %ctaid.x;\n"
4328 " mul24.lo.u32 %r2, %r1, 64;\n"
4329 " cvt.u32.u16 %r3, %tid.x;\n"
4330 " add.u32 %r4, %r2, %r3;\n"
4331 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_32_false_n];\n"
4332 " setp.ge.u32 %p1, %r4, %r5;\n"
4333 " @%p1 bra $Lt_40_16386;\n"
// r6 = r4 + 32 (second element index), r7 = n + 32 (bound compared against r6),
// r8 = nctaid.x * 64 (stride), rd3 = g_idata + r4, f1 = 0 (accumulator).
4334 " add.u32 %r6, %r4, 32;\n"
4335 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_32_false_n];\n"
4336 " add.u32 %r7, %r5, 32;\n"
4337 " mov.u16 %rh1, %nctaid.x;\n"
4338 " mul.wide.u16 %r8, %rh1, 64;\n"
4339 " cvt.u64.u32 %rd1, %r4;\n"
4340 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_32_false_g_idata];\n"
4341 " add.u64 %rd3, %rd1, %rd2;\n"
4342 " cvt.s64.u32 %rd4, %r8;\n"
4343 " mov.f32 %f1, 0f00000000; // 0\n"
// Loop: f1 += (f32) byte at rd3; when r6 < n also f1 += (f32) byte at rd3+32;
// then r6 and rd3 advance by the stride and the loop repeats while r6 < r7.
4345 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4347 " ld.global.u8 %r9, [%rd3+0];\n"
4348 " cvt.rn.f32.u32 %f2, %r9;\n"
4349 " add.f32 %f1, %f2, %f1;\n"
4351 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_32_false_n];\n"
4353 " setp.ge.u32 %p2, %r6, %r5;\n"
4354 " @%p2 bra $Lt_40_14594;\n"
4355 " //<loop> Part of loop body line 181, head labeled $Lt_40_14338\n"
4357 " ld.global.u8 %r10, [%rd3+32];\n"
4358 " cvt.rn.f32.u32 %f3, %r10;\n"
4359 " add.f32 %f1, %f3, %f1;\n"
4361 " //<loop> Part of loop body line 181, head labeled $Lt_40_14338\n"
4362 " add.u32 %r6, %r6, %r8;\n"
4363 " add.u64 %rd3, %rd4, %rd3;\n"
4364 " setp.lt.u32 %p3, %r6, %r7;\n"
4365 " @%p3 bra $Lt_40_14338;\n"
4366 " bra.uni $Lt_40_13826;\n"
// f1 = 0 - presumably the path reached when r4 >= n (its label is not visible here).
4368 " mov.f32 %f1, 0f00000000; // 0\n"
// rd8 = __smem + tid.x * 4; store this thread's partial sum there.
4371 " mov.u64 %rd5, __smem;\n"
4372 " cvt.u64.u32 %rd6, %r3;\n"
4373 " mul.wide.u32 %rd7, %r3, 4;\n"
4374 " add.u64 %rd8, %rd5, %rd7;\n"
4375 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
// Threads with tid.x > 31 branch past the unrolled volatile shared-memory adds.
4378 " mov.u32 %r11, 31;\n"
4379 " setp.gt.u32 %p4, %r3, %r11;\n"
4380 " @%p4 bra $Lt_40_15362;\n"
// Each step: load a neighbour slot, add it to the running value, write back to own slot.
4382 " ld.volatile.shared.f32 %f4, [%rd8+64];\n"
4383 " add.f32 %f5, %f4, %f1;\n"
4384 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4386 " ld.volatile.shared.f32 %f6, [%rd8+32];\n"
4387 " add.f32 %f7, %f6, %f5;\n"
4388 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4390 " ld.volatile.shared.f32 %f8, [%rd8+16];\n"
4391 " add.f32 %f9, %f8, %f7;\n"
4392 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4394 " ld.volatile.shared.f32 %f10, [%rd8+8];\n"
4395 " add.f32 %f11, %f10, %f9;\n"
4396 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4398 " ld.volatile.shared.f32 %f12, [%rd8+4];\n"
4399 " add.f32 %f13, %f12, %f11;\n"
4400 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
// Only tid.x == 0 continues: copy __smem[0] to g_odata + ctaid.x * 4.
4403 " mov.u32 %r12, 0;\n"
4404 " setp.ne.u32 %p5, %r3, %r12;\n"
4405 " @%p5 bra $Lt_40_15874;\n"
4407 " ld.shared.f32 %f14, [__smem+0];\n"
4408 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_32_false_g_odata];\n"
4409 " cvt.u64.u32 %rd10, %r1;\n"
4410 " mul.wide.u32 %rd11, %r1, 4;\n"
4411 " add.u64 %rd12, %rd9, %rd11;\n"
4412 " st.global.f32 [%rd12+0], %f14;\n"
4416 "$LDWend_reduce_uchar_32_false:\n"
4417 " } // reduce_uchar_32_false\n"
// PTX entry reduce_uchar_64_false (compiler output stored in the reduce_ptx string).
// Params: g_idata (u64 address, loaded as u8), g_odata (u64 address, stored as f32), n (u32).
// Visible behaviour: each thread accumulates g_idata bytes, converted to f32, starting at
// index ctaid.x*128 + tid.x and stepping by nctaid.x*128, also taking index+64 when that is < n;
// the sum is stored to __smem[tid.x], threads with tid.x <= 31 then add __smem[tid.x+32],
// [tid.x+16], [tid.x+8], [tid.x+4], [tid.x+2], [tid.x+1] in turn, and thread 0 stores
// __smem[0] to g_odata[ctaid.x].
// NOTE(review): the numbers embedded in this listing are not contiguous (e.g. 4423 and 4449 are
// absent), so branch-target labels and possibly other statements are not visible here; the
// control-flow notes below are inferred from the visible instructions - confirm against reduce.cu.
4419 " .entry reduce_uchar_64_false (\n"
4420 " .param .u64 __cudaparm_reduce_uchar_64_false_g_idata,\n"
4421 " .param .u64 __cudaparm_reduce_uchar_64_false_g_odata,\n"
4422 " .param .u32 __cudaparm_reduce_uchar_64_false_n)\n"
4424 " .reg .u16 %rh<3>;\n"
4425 " .reg .u32 %r<14>;\n"
4426 " .reg .u64 %rd<14>;\n"
4427 " .reg .f32 %f<18>;\n"
4428 " .reg .pred %p<7>;\n"
4430 "$LDWbegin_reduce_uchar_64_false:\n"
// r4 = ctaid.x * 128 + tid.x (first element index); branch away when r4 >= n.
4432 " cvt.u32.u16 %r1, %ctaid.x;\n"
4433 " mul24.lo.u32 %r2, %r1, 128;\n"
4434 " cvt.u32.u16 %r3, %tid.x;\n"
4435 " add.u32 %r4, %r2, %r3;\n"
4436 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_64_false_n];\n"
4437 " setp.ge.u32 %p1, %r4, %r5;\n"
4438 " @%p1 bra $Lt_41_16130;\n"
// r6 = r4 + 64 (second element index), r7 = n + 64 (bound compared against r6),
// r8 = nctaid.x * 128 (stride), rd3 = g_idata + r4, f1 = 0 (accumulator).
4439 " add.u32 %r6, %r4, 64;\n"
4440 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_64_false_n];\n"
4441 " add.u32 %r7, %r5, 64;\n"
4442 " mov.u16 %rh1, %nctaid.x;\n"
4443 " mul.wide.u16 %r8, %rh1, 128;\n"
4444 " cvt.u64.u32 %rd1, %r4;\n"
4445 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_64_false_g_idata];\n"
4446 " add.u64 %rd3, %rd1, %rd2;\n"
4447 " cvt.s64.u32 %rd4, %r8;\n"
4448 " mov.f32 %f1, 0f00000000; // 0\n"
// Loop: f1 += (f32) byte at rd3; when r6 < n also f1 += (f32) byte at rd3+64;
// then r6 and rd3 advance by the stride and the loop repeats while r6 < r7.
4450 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4452 " ld.global.u8 %r9, [%rd3+0];\n"
4453 " cvt.rn.f32.u32 %f2, %r9;\n"
4454 " add.f32 %f1, %f2, %f1;\n"
4456 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_64_false_n];\n"
4458 " setp.ge.u32 %p2, %r6, %r5;\n"
4459 " @%p2 bra $Lt_41_14338;\n"
4460 " //<loop> Part of loop body line 181, head labeled $Lt_41_14082\n"
4462 " ld.global.u8 %r10, [%rd3+64];\n"
4463 " cvt.rn.f32.u32 %f3, %r10;\n"
4464 " add.f32 %f1, %f3, %f1;\n"
4466 " //<loop> Part of loop body line 181, head labeled $Lt_41_14082\n"
4467 " add.u32 %r6, %r6, %r8;\n"
4468 " add.u64 %rd3, %rd4, %rd3;\n"
4469 " setp.lt.u32 %p3, %r6, %r7;\n"
4470 " @%p3 bra $Lt_41_14082;\n"
4471 " bra.uni $Lt_41_13570;\n"
// f1 = 0 - presumably the path reached when r4 >= n (its label is not visible here).
4473 " mov.f32 %f1, 0f00000000; // 0\n"
// rd8 = __smem + tid.x * 4; store this thread's partial sum there.
4476 " mov.u64 %rd5, __smem;\n"
4477 " cvt.u64.u32 %rd6, %r3;\n"
4478 " mul.wide.u32 %rd7, %r3, 4;\n"
4479 " add.u64 %rd8, %rd5, %rd7;\n"
4480 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
// Threads with tid.x > 31 branch past the unrolled volatile shared-memory adds.
4483 " mov.u32 %r11, 31;\n"
4484 " setp.gt.u32 %p4, %r3, %r11;\n"
4485 " @%p4 bra $Lt_41_15106;\n"
// Each step: load a neighbour slot, add it to the running value, write back to own slot.
4487 " ld.volatile.shared.f32 %f4, [%rd8+128];\n"
4488 " add.f32 %f5, %f4, %f1;\n"
4489 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4491 " ld.volatile.shared.f32 %f6, [%rd8+64];\n"
4492 " add.f32 %f7, %f6, %f5;\n"
4493 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4495 " ld.volatile.shared.f32 %f8, [%rd8+32];\n"
4496 " add.f32 %f9, %f8, %f7;\n"
4497 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4499 " ld.volatile.shared.f32 %f10, [%rd8+16];\n"
4500 " add.f32 %f11, %f10, %f9;\n"
4501 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4503 " ld.volatile.shared.f32 %f12, [%rd8+8];\n"
4504 " add.f32 %f13, %f12, %f11;\n"
4505 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
4507 " ld.volatile.shared.f32 %f14, [%rd8+4];\n"
4508 " add.f32 %f15, %f14, %f13;\n"
4509 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
// Only tid.x == 0 continues: copy __smem[0] to g_odata + ctaid.x * 4.
4512 " mov.u32 %r12, 0;\n"
4513 " setp.ne.u32 %p5, %r3, %r12;\n"
4514 " @%p5 bra $Lt_41_15618;\n"
4516 " ld.shared.f32 %f16, [__smem+0];\n"
4517 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_64_false_g_odata];\n"
4518 " cvt.u64.u32 %rd10, %r1;\n"
4519 " mul.wide.u32 %rd11, %r1, 4;\n"
4520 " add.u64 %rd12, %rd9, %rd11;\n"
4521 " st.global.f32 [%rd12+0], %f16;\n"
4525 "$LDWend_reduce_uchar_64_false:\n"
4526 " } // reduce_uchar_64_false\n"
// PTX entry reduce_uchar_128_false (compiler output stored in the reduce_ptx string).
// Params: g_idata (u64 address, loaded as u8), g_odata (u64 address, stored as f32), n (u32).
// Visible behaviour: each thread accumulates g_idata bytes, converted to f32, starting at
// index ctaid.x*256 + tid.x and stepping by nctaid.x*256, also taking index+128 when that is < n;
// the sum is stored to __smem[tid.x]; threads with tid.x <= 63 add __smem[tid.x+64], threads
// with tid.x <= 31 then add __smem[tid.x+32], [+16], [+8], [+4], [+2], [+1] in turn, and
// thread 0 stores __smem[0] to g_odata[ctaid.x].
// NOTE(review): the numbers embedded in this listing are not contiguous (e.g. 4532 and 4558 are
// absent), so branch-target labels and possibly other statements are not visible here; the
// control-flow notes below are inferred from the visible instructions - confirm against reduce.cu.
4528 " .entry reduce_uchar_128_false (\n"
4529 " .param .u64 __cudaparm_reduce_uchar_128_false_g_idata,\n"
4530 " .param .u64 __cudaparm_reduce_uchar_128_false_g_odata,\n"
4531 " .param .u32 __cudaparm_reduce_uchar_128_false_n)\n"
4533 " .reg .u16 %rh<3>;\n"
4534 " .reg .u32 %r<15>;\n"
4535 " .reg .u64 %rd<14>;\n"
4536 " .reg .f32 %f<20>;\n"
4537 " .reg .pred %p<8>;\n"
4539 "$LDWbegin_reduce_uchar_128_false:\n"
// r4 = ctaid.x * 256 + tid.x (first element index); branch away when r4 >= n.
4541 " cvt.u32.u16 %r1, %ctaid.x;\n"
4542 " mul.lo.u32 %r2, %r1, 256;\n"
4543 " cvt.u32.u16 %r3, %tid.x;\n"
4544 " add.u32 %r4, %r2, %r3;\n"
4545 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_128_false_n];\n"
4546 " setp.ge.u32 %p1, %r4, %r5;\n"
4547 " @%p1 bra $Lt_42_16386;\n"
// r6 = r4 + 128 (second element index), r7 = n + 128 (bound compared against r6),
// r8 = nctaid.x * 256 (stride), rd3 = g_idata + r4, f1 = 0 (accumulator).
4548 " add.u32 %r6, %r4, 128;\n"
4549 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_128_false_n];\n"
4550 " add.u32 %r7, %r5, 128;\n"
4551 " mov.u16 %rh1, %nctaid.x;\n"
4552 " mul.wide.u16 %r8, %rh1, 256;\n"
4553 " cvt.u64.u32 %rd1, %r4;\n"
4554 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_128_false_g_idata];\n"
4555 " add.u64 %rd3, %rd1, %rd2;\n"
4556 " cvt.s64.u32 %rd4, %r8;\n"
4557 " mov.f32 %f1, 0f00000000; // 0\n"
// Loop: f1 += (f32) byte at rd3; when r6 < n also f1 += (f32) byte at rd3+128;
// then r6 and rd3 advance by the stride and the loop repeats while r6 < r7.
4559 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4561 " ld.global.u8 %r9, [%rd3+0];\n"
4562 " cvt.rn.f32.u32 %f2, %r9;\n"
4563 " add.f32 %f1, %f2, %f1;\n"
4565 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_128_false_n];\n"
4567 " setp.ge.u32 %p2, %r6, %r5;\n"
4568 " @%p2 bra $Lt_42_14082;\n"
4569 " //<loop> Part of loop body line 181, head labeled $Lt_42_13826\n"
4571 " ld.global.u8 %r10, [%rd3+128];\n"
4572 " cvt.rn.f32.u32 %f3, %r10;\n"
4573 " add.f32 %f1, %f3, %f1;\n"
4575 " //<loop> Part of loop body line 181, head labeled $Lt_42_13826\n"
4576 " add.u32 %r6, %r6, %r8;\n"
4577 " add.u64 %rd3, %rd4, %rd3;\n"
4578 " setp.lt.u32 %p3, %r6, %r7;\n"
4579 " @%p3 bra $Lt_42_13826;\n"
4580 " bra.uni $Lt_42_13314;\n"
// f1 = 0 - presumably the path reached when r4 >= n (its label is not visible here).
4582 " mov.f32 %f1, 0f00000000; // 0\n"
// f4 and f5 take over the partial sum for the shared-memory stage.
4585 " mov.f32 %f4, %f1;\n"
4586 " mov.f32 %f5, %f4;\n"
// rd8 = __smem + tid.x * 4; store this thread's partial sum there.
4588 " mov.u64 %rd5, __smem;\n"
4589 " cvt.u64.u32 %rd6, %r3;\n"
4590 " mul.wide.u32 %rd7, %r3, 4;\n"
4591 " add.u64 %rd8, %rd5, %rd7;\n"
4592 " st.volatile.shared.f32 [%rd8+0], %f4;\n"
// Threads with tid.x <= 63 add the slot 64 elements (256 bytes) above their own.
4595 " mov.u32 %r11, 63;\n"
4596 " setp.gt.u32 %p4, %r3, %r11;\n"
4597 " @%p4 bra $Lt_42_14850;\n"
4599 " ld.volatile.shared.f32 %f6, [%rd8+256];\n"
4600 " add.f32 %f5, %f6, %f4;\n"
4601 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
// Threads with tid.x > 31 branch past the unrolled volatile shared-memory adds.
4604 " mov.u32 %r12, 31;\n"
4605 " setp.gt.u32 %p5, %r3, %r12;\n"
4606 " @%p5 bra $Lt_42_15362;\n"
// Each step: load a neighbour slot, add it to the running value, write back to own slot.
4608 " ld.volatile.shared.f32 %f7, [%rd8+128];\n"
4609 " add.f32 %f8, %f7, %f5;\n"
4610 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
4612 " ld.volatile.shared.f32 %f9, [%rd8+64];\n"
4613 " add.f32 %f10, %f9, %f8;\n"
4614 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
4616 " ld.volatile.shared.f32 %f11, [%rd8+32];\n"
4617 " add.f32 %f12, %f11, %f10;\n"
4618 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
4620 " ld.volatile.shared.f32 %f13, [%rd8+16];\n"
4621 " add.f32 %f14, %f13, %f12;\n"
4622 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
4624 " ld.volatile.shared.f32 %f15, [%rd8+8];\n"
4625 " add.f32 %f16, %f15, %f14;\n"
4626 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
4628 " ld.volatile.shared.f32 %f17, [%rd8+4];\n"
4629 " add.f32 %f5, %f17, %f16;\n"
4630 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
// Only tid.x == 0 continues: copy __smem[0] to g_odata + ctaid.x * 4.
4633 " mov.u32 %r13, 0;\n"
4634 " setp.ne.u32 %p6, %r3, %r13;\n"
4635 " @%p6 bra $Lt_42_15874;\n"
4637 " ld.shared.f32 %f18, [__smem+0];\n"
4638 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_128_false_g_odata];\n"
4639 " cvt.u64.u32 %rd10, %r1;\n"
4640 " mul.wide.u32 %rd11, %r1, 4;\n"
4641 " add.u64 %rd12, %rd9, %rd11;\n"
4642 " st.global.f32 [%rd12+0], %f18;\n"
4646 "$LDWend_reduce_uchar_128_false:\n"
4647 " } // reduce_uchar_128_false\n"
// PTX entry reduce_uchar_256_false (compiler output stored in the reduce_ptx string).
// Params: g_idata (u64 address, loaded as u8), g_odata (u64 address, stored as f32), n (u32).
// Visible behaviour: each thread accumulates g_idata bytes, converted to f32, starting at
// index ctaid.x*512 + tid.x and stepping by nctaid.x*512, also taking index+256 when that is < n;
// the sum is stored to __smem[tid.x]; threads with tid.x <= 127 add __smem[tid.x+128], threads
// with tid.x <= 63 add __smem[tid.x+64], threads with tid.x <= 31 then add __smem[tid.x+32],
// [+16], [+8], [+4], [+2], [+1] in turn, and thread 0 stores __smem[0] to g_odata[ctaid.x].
// NOTE(review): the numbers embedded in this listing are not contiguous (e.g. 4653 and 4679 are
// absent), so branch-target labels and possibly other statements are not visible here; the
// control-flow notes below are inferred from the visible instructions - confirm against reduce.cu.
4649 " .entry reduce_uchar_256_false (\n"
4650 " .param .u64 __cudaparm_reduce_uchar_256_false_g_idata,\n"
4651 " .param .u64 __cudaparm_reduce_uchar_256_false_g_odata,\n"
4652 " .param .u32 __cudaparm_reduce_uchar_256_false_n)\n"
4654 " .reg .u16 %rh<3>;\n"
4655 " .reg .u32 %r<16>;\n"
4656 " .reg .u64 %rd<14>;\n"
4657 " .reg .f32 %f<21>;\n"
4658 " .reg .pred %p<9>;\n"
4660 "$LDWbegin_reduce_uchar_256_false:\n"
// r4 = ctaid.x * 512 + tid.x (first element index); branch away when r4 >= n.
4662 " cvt.u32.u16 %r1, %ctaid.x;\n"
4663 " mul.lo.u32 %r2, %r1, 512;\n"
4664 " cvt.u32.u16 %r3, %tid.x;\n"
4665 " add.u32 %r4, %r2, %r3;\n"
4666 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_256_false_n];\n"
4667 " setp.ge.u32 %p1, %r4, %r5;\n"
4668 " @%p1 bra $Lt_43_16642;\n"
// r6 = r4 + 256 (second element index), r7 = n + 256 (bound compared against r6),
// r8 = nctaid.x * 512 (stride), rd3 = g_idata + r4, f1 = 0 (accumulator).
4669 " add.u32 %r6, %r4, 256;\n"
4670 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_256_false_n];\n"
4671 " add.u32 %r7, %r5, 256;\n"
4672 " mov.u16 %rh1, %nctaid.x;\n"
4673 " mul.wide.u16 %r8, %rh1, 512;\n"
4674 " cvt.u64.u32 %rd1, %r4;\n"
4675 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_256_false_g_idata];\n"
4676 " add.u64 %rd3, %rd1, %rd2;\n"
4677 " cvt.s64.u32 %rd4, %r8;\n"
4678 " mov.f32 %f1, 0f00000000; // 0\n"
// Loop: f1 += (f32) byte at rd3; when r6 < n also f1 += (f32) byte at rd3+256;
// then r6 and rd3 advance by the stride and the loop repeats while r6 < r7.
4680 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4682 " ld.global.u8 %r9, [%rd3+0];\n"
4683 " cvt.rn.f32.u32 %f2, %r9;\n"
4684 " add.f32 %f1, %f2, %f1;\n"
4686 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_256_false_n];\n"
4688 " setp.ge.u32 %p2, %r6, %r5;\n"
4689 " @%p2 bra $Lt_43_13826;\n"
4690 " //<loop> Part of loop body line 181, head labeled $Lt_43_13570\n"
4692 " ld.global.u8 %r10, [%rd3+256];\n"
4693 " cvt.rn.f32.u32 %f3, %r10;\n"
4694 " add.f32 %f1, %f3, %f1;\n"
4696 " //<loop> Part of loop body line 181, head labeled $Lt_43_13570\n"
4697 " add.u32 %r6, %r6, %r8;\n"
4698 " add.u64 %rd3, %rd4, %rd3;\n"
4699 " setp.lt.u32 %p3, %r6, %r7;\n"
4700 " @%p3 bra $Lt_43_13570;\n"
4701 " bra.uni $Lt_43_13058;\n"
// f1 = 0 - presumably the path reached when r4 >= n (its label is not visible here).
4703 " mov.f32 %f1, 0f00000000; // 0\n"
// f4 and f5 take over the partial sum for the shared-memory stage.
4706 " mov.f32 %f4, %f1;\n"
4707 " mov.f32 %f5, %f4;\n"
// rd8 = __smem + tid.x * 4; store this thread's partial sum there.
4709 " mov.u64 %rd5, __smem;\n"
4710 " cvt.u64.u32 %rd6, %r3;\n"
4711 " mul.wide.u32 %rd7, %r3, 4;\n"
4712 " add.u64 %rd8, %rd5, %rd7;\n"
4713 " st.volatile.shared.f32 [%rd8+0], %f4;\n"
// Threads with tid.x <= 127 add the slot 128 elements (512 bytes) above their own.
4716 " mov.u32 %r11, 127;\n"
4717 " setp.gt.u32 %p4, %r3, %r11;\n"
4718 " @%p4 bra $Lt_43_14594;\n"
4720 " ld.volatile.shared.f32 %f6, [%rd8+512];\n"
4721 " add.f32 %f5, %f6, %f4;\n"
4722 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
// Threads with tid.x <= 63 add the slot 64 elements (256 bytes) above their own.
4725 " mov.u32 %r12, 63;\n"
4726 " setp.gt.u32 %p5, %r3, %r12;\n"
4727 " @%p5 bra $Lt_43_15106;\n"
4729 " ld.volatile.shared.f32 %f7, [%rd8+256];\n"
4730 " add.f32 %f5, %f7, %f5;\n"
4731 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
// Threads with tid.x > 31 branch past the unrolled volatile shared-memory adds.
4734 " mov.u32 %r13, 31;\n"
4735 " setp.gt.u32 %p6, %r3, %r13;\n"
4736 " @%p6 bra $Lt_43_15618;\n"
// Each step: load a neighbour slot, add it to the running value, write back to own slot.
4738 " ld.volatile.shared.f32 %f8, [%rd8+128];\n"
4739 " add.f32 %f9, %f8, %f5;\n"
4740 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4742 " ld.volatile.shared.f32 %f10, [%rd8+64];\n"
4743 " add.f32 %f11, %f10, %f9;\n"
4744 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4746 " ld.volatile.shared.f32 %f12, [%rd8+32];\n"
4747 " add.f32 %f13, %f12, %f11;\n"
4748 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
4750 " ld.volatile.shared.f32 %f14, [%rd8+16];\n"
4751 " add.f32 %f15, %f14, %f13;\n"
4752 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
4754 " ld.volatile.shared.f32 %f16, [%rd8+8];\n"
4755 " add.f32 %f17, %f16, %f15;\n"
4756 " st.volatile.shared.f32 [%rd8+0], %f17;\n"
4758 " ld.volatile.shared.f32 %f18, [%rd8+4];\n"
4759 " add.f32 %f5, %f18, %f17;\n"
4760 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
// Only tid.x == 0 continues: copy __smem[0] to g_odata + ctaid.x * 4.
4763 " mov.u32 %r14, 0;\n"
4764 " setp.ne.u32 %p7, %r3, %r14;\n"
4765 " @%p7 bra $Lt_43_16130;\n"
4767 " ld.shared.f32 %f19, [__smem+0];\n"
4768 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_256_false_g_odata];\n"
4769 " cvt.u64.u32 %rd10, %r1;\n"
4770 " mul.wide.u32 %rd11, %r1, 4;\n"
4771 " add.u64 %rd12, %rd9, %rd11;\n"
4772 " st.global.f32 [%rd12+0], %f19;\n"
4776 "$LDWend_reduce_uchar_256_false:\n"
4777 " } // reduce_uchar_256_false\n"
// PTX entry reduce_uchar_512_false (compiler output stored in the reduce_ptx string).
// Params: g_idata (u64 address, loaded as u8), g_odata (u64 address, stored as f32), n (u32).
// Visible behaviour: each thread accumulates g_idata bytes, converted to f32, starting at
// index ctaid.x*1024 + tid.x and stepping by nctaid.x*1024, also taking index+512 when that is
// < n; the sum is stored to __smem[tid.x]; threads with tid.x <= 255 add __smem[tid.x+256],
// tid.x <= 127 add __smem[tid.x+128], tid.x <= 63 add __smem[tid.x+64], threads with
// tid.x <= 31 then add __smem[tid.x+32], [+16], [+8], [+4], [+2], [+1] in turn, and thread 0
// stores __smem[0] to g_odata[ctaid.x].
// NOTE(review): the numbers embedded in this listing are not contiguous (e.g. 4783 and 4809 are
// absent), so branch-target labels and possibly other statements are not visible here; the
// control-flow notes below are inferred from the visible instructions - confirm against reduce.cu.
4779 " .entry reduce_uchar_512_false (\n"
4780 " .param .u64 __cudaparm_reduce_uchar_512_false_g_idata,\n"
4781 " .param .u64 __cudaparm_reduce_uchar_512_false_g_odata,\n"
4782 " .param .u32 __cudaparm_reduce_uchar_512_false_n)\n"
4784 " .reg .u16 %rh<3>;\n"
4785 " .reg .u32 %r<17>;\n"
4786 " .reg .u64 %rd<14>;\n"
4787 " .reg .f32 %f<22>;\n"
4788 " .reg .pred %p<10>;\n"
4790 "$LDWbegin_reduce_uchar_512_false:\n"
// r4 = ctaid.x * 1024 + tid.x (first element index); branch away when r4 >= n.
4792 " cvt.u32.u16 %r1, %ctaid.x;\n"
4793 " mul.lo.u32 %r2, %r1, 1024;\n"
4794 " cvt.u32.u16 %r3, %tid.x;\n"
4795 " add.u32 %r4, %r2, %r3;\n"
4796 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_512_false_n];\n"
4797 " setp.ge.u32 %p1, %r4, %r5;\n"
4798 " @%p1 bra $Lt_44_16898;\n"
// r6 = r4 + 512 (second element index), r7 = n + 512 (bound compared against r6),
// r8 = nctaid.x * 1024 (stride), rd3 = g_idata + r4, f1 = 0 (accumulator).
4799 " add.u32 %r6, %r4, 512;\n"
4800 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_512_false_n];\n"
4801 " add.u32 %r7, %r5, 512;\n"
4802 " mov.u16 %rh1, %nctaid.x;\n"
4803 " mul.wide.u16 %r8, %rh1, 1024;\n"
4804 " cvt.u64.u32 %rd1, %r4;\n"
4805 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_512_false_g_idata];\n"
4806 " add.u64 %rd3, %rd1, %rd2;\n"
4807 " cvt.s64.u32 %rd4, %r8;\n"
4808 " mov.f32 %f1, 0f00000000; // 0\n"
// Loop: f1 += (f32) byte at rd3; when r6 < n also f1 += (f32) byte at rd3+512;
// then r6 and rd3 advance by the stride and the loop repeats while r6 < r7.
4810 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4812 " ld.global.u8 %r9, [%rd3+0];\n"
4813 " cvt.rn.f32.u32 %f2, %r9;\n"
4814 " add.f32 %f1, %f2, %f1;\n"
4816 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_512_false_n];\n"
4818 " setp.ge.u32 %p2, %r6, %r5;\n"
4819 " @%p2 bra $Lt_44_13570;\n"
4820 " //<loop> Part of loop body line 181, head labeled $Lt_44_13314\n"
4822 " ld.global.u8 %r10, [%rd3+512];\n"
4823 " cvt.rn.f32.u32 %f3, %r10;\n"
4824 " add.f32 %f1, %f3, %f1;\n"
4826 " //<loop> Part of loop body line 181, head labeled $Lt_44_13314\n"
4827 " add.u32 %r6, %r6, %r8;\n"
4828 " add.u64 %rd3, %rd4, %rd3;\n"
4829 " setp.lt.u32 %p3, %r6, %r7;\n"
4830 " @%p3 bra $Lt_44_13314;\n"
4831 " bra.uni $Lt_44_12802;\n"
// f1 = 0 - presumably the path reached when r4 >= n (its label is not visible here).
4833 " mov.f32 %f1, 0f00000000; // 0\n"
// f4 and f5 take over the partial sum for the shared-memory stage.
4836 " mov.f32 %f4, %f1;\n"
4837 " mov.f32 %f5, %f4;\n"
// rd8 = __smem + tid.x * 4; store this thread's partial sum there.
4839 " mov.u64 %rd5, __smem;\n"
4840 " cvt.u64.u32 %rd6, %r3;\n"
4841 " mul.wide.u32 %rd7, %r3, 4;\n"
4842 " add.u64 %rd8, %rd5, %rd7;\n"
4843 " st.volatile.shared.f32 [%rd8+0], %f4;\n"
// Threads with tid.x <= 255 add the slot 256 elements (1024 bytes) above their own.
4846 " mov.u32 %r11, 255;\n"
4847 " setp.gt.u32 %p4, %r3, %r11;\n"
4848 " @%p4 bra $Lt_44_14338;\n"
4850 " ld.volatile.shared.f32 %f6, [%rd8+1024];\n"
4851 " add.f32 %f5, %f6, %f4;\n"
4852 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
// Threads with tid.x <= 127 add the slot 128 elements (512 bytes) above their own.
4855 " mov.u32 %r12, 127;\n"
4856 " setp.gt.u32 %p5, %r3, %r12;\n"
4857 " @%p5 bra $Lt_44_14850;\n"
4859 " ld.volatile.shared.f32 %f7, [%rd8+512];\n"
4860 " add.f32 %f5, %f7, %f5;\n"
4861 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
// Threads with tid.x <= 63 add the slot 64 elements (256 bytes) above their own.
4864 " mov.u32 %r13, 63;\n"
4865 " setp.gt.u32 %p6, %r3, %r13;\n"
4866 " @%p6 bra $Lt_44_15362;\n"
4868 " ld.volatile.shared.f32 %f8, [%rd8+256];\n"
4869 " add.f32 %f5, %f8, %f5;\n"
4870 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
// Threads with tid.x > 31 branch past the unrolled volatile shared-memory adds.
4873 " mov.u32 %r14, 31;\n"
4874 " setp.gt.u32 %p7, %r3, %r14;\n"
4875 " @%p7 bra $Lt_44_15874;\n"
// Each step: load a neighbour slot, add it to the running value, write back to own slot.
4877 " ld.volatile.shared.f32 %f9, [%rd8+128];\n"
4878 " add.f32 %f10, %f9, %f5;\n"
4879 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
4881 " ld.volatile.shared.f32 %f11, [%rd8+64];\n"
4882 " add.f32 %f12, %f11, %f10;\n"
4883 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
4885 " ld.volatile.shared.f32 %f13, [%rd8+32];\n"
4886 " add.f32 %f14, %f13, %f12;\n"
4887 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
4889 " ld.volatile.shared.f32 %f15, [%rd8+16];\n"
4890 " add.f32 %f16, %f15, %f14;\n"
4891 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
4893 " ld.volatile.shared.f32 %f17, [%rd8+8];\n"
4894 " add.f32 %f18, %f17, %f16;\n"
4895 " st.volatile.shared.f32 [%rd8+0], %f18;\n"
4897 " ld.volatile.shared.f32 %f19, [%rd8+4];\n"
4898 " add.f32 %f5, %f19, %f18;\n"
4899 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
// Only tid.x == 0 continues: copy __smem[0] to g_odata + ctaid.x * 4.
4902 " mov.u32 %r15, 0;\n"
4903 " setp.ne.u32 %p8, %r3, %r15;\n"
4904 " @%p8 bra $Lt_44_16386;\n"
4906 " ld.shared.f32 %f20, [__smem+0];\n"
4907 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_512_false_g_odata];\n"
4908 " cvt.u64.u32 %rd10, %r1;\n"
4909 " mul.wide.u32 %rd11, %r1, 4;\n"
4910 " add.u64 %rd12, %rd9, %rd11;\n"
4911 " st.global.f32 [%rd12+0], %f20;\n"
4915 "$LDWend_reduce_uchar_512_false:\n"
4916 " } // reduce_uchar_512_false\n"
// PTX entry packed_float_reduce_1_false_false (compiler output stored in the reduce_ptx string).
// Params: g_idata (u64 address, loaded as u8), g_odata (u64 address, stored as f32), n (u32).
// Visible behaviour: r4 = ctaid.x*2 + tid.x; when r4 < n a loop copies four consecutive
// g_idata bytes into a 4-byte .local buffer in reversed byte order; after the loop the buffer
// is read back as one f32, stored to __smem[tid.x], and thread 0 stores __smem[0] to
// g_odata[ctaid.x].
// NOTE(review): the byte offset is narrowed with cvt.u8.u32 before it is added to g_idata, so
// only its low 8 bits select the source bytes, and no add.f32 is visible in the loop (each pass
// overwrites the local buffer). Both look suspicious for a reduction - confirm against reduce.cu.
// NOTE(review): the numbers embedded in this listing are not contiguous (e.g. 4922 and 4947 are
// absent), so branch-target labels and possibly other statements are not visible here.
4918 " .entry packed_float_reduce_1_false_false (\n"
4919 " .param .u64 __cudaparm_packed_float_reduce_1_false_false_g_idata,\n"
4920 " .param .u64 __cudaparm_packed_float_reduce_1_false_false_g_odata,\n"
4921 " .param .u32 __cudaparm_packed_float_reduce_1_false_false_n)\n"
4923 " .reg .u16 %rh<7>;\n"
4924 " .reg .u32 %r<14>;\n"
4925 " .reg .u64 %rd<13>;\n"
4926 " .reg .f32 %f<4>;\n"
4927 " .reg .pred %p<5>;\n"
4928 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
4930 "$LDWbegin_packed_float_reduce_1_false_false:\n"
// r4 = ctaid.x * 2 + tid.x; branch away when r4 >= n.
4932 " cvt.u32.u16 %r1, %ctaid.x;\n"
4933 " mul24.lo.u32 %r2, %r1, 2;\n"
4934 " cvt.u32.u16 %r3, %tid.x;\n"
4935 " add.u32 %r4, %r2, %r3;\n"
4936 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_false_n];\n"
4937 " setp.ge.u32 %p1, %r4, %r5;\n"
4938 " @%p1 bra $Lt_45_18178;\n"
// r9 = r4*4 + 4 (running byte offset), r10 = n*4 + 4 (bound), r8 = nctaid.x * 8 (byte stride).
4939 " mul.lo.u32 %r6, %r4, 4;\n"
4940 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_false_n];\n"
4941 " mul.lo.u32 %r7, %r5, 4;\n"
4942 " mov.u16 %rh1, %nctaid.x;\n"
4943 " mul.wide.u16 %r8, %rh1, 8;\n"
4944 " add.u32 %r9, %r6, 4;\n"
4945 " add.u32 %r10, %r7, 4;\n"
4946 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_false_g_idata];\n"
// Loop: rd3 = g_idata + (r9 narrowed to u8); bytes rd3[0..3] go to local bytes [3..0];
// r9 advances by r8 and the loop repeats while r9 < r10.
4948 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
4950 " cvt.u8.u32 %r11, %r9;\n"
4951 " cvt.u64.u32 %rd2, %r11;\n"
4953 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_false_g_idata];\n"
4955 " add.u64 %rd3, %rd2, %rd1;\n"
4956 " ld.global.u8 %rh2, [%rd3+0];\n"
4957 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
4959 " ld.global.u8 %rh3, [%rd3+1];\n"
4960 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
4962 " ld.global.u8 %rh4, [%rd3+2];\n"
4963 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
4965 " ld.global.u8 %rh5, [%rd3+3];\n"
4966 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
4967 " add.u32 %r9, %r8, %r9;\n"
4968 " setp.lt.u32 %p2, %r9, %r10;\n"
4969 " @%p2 bra $Lt_45_17154;\n"
// After the loop: reinterpret the 4 local bytes as one f32.
4970 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
4971 " bra.uni $Lt_45_16642;\n"
// f1 = 0 - presumably the path reached when r4 >= n (its label is not visible here).
4973 " mov.f32 %f1, 0f00000000; // 0\n"
// Store f1 to __smem + tid.x * 4.
4976 " mov.u64 %rd4, __smem;\n"
4977 " cvt.u64.u32 %rd5, %r3;\n"
4978 " mul.wide.u32 %rd6, %r3, 4;\n"
4979 " add.u64 %rd7, %rd4, %rd6;\n"
4980 " st.shared.f32 [%rd7+0], %f1;\n"
// Only tid.x == 0 continues: copy __smem[0] to g_odata + ctaid.x * 4.
4983 " mov.u32 %r12, 0;\n"
4984 " setp.ne.u32 %p3, %r3, %r12;\n"
4985 " @%p3 bra $Lt_45_17666;\n"
4987 " ld.shared.f32 %f2, [__smem+0];\n"
4988 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_false_false_g_odata];\n"
4989 " cvt.u64.u32 %rd9, %r1;\n"
4990 " mul.wide.u32 %rd10, %r1, 4;\n"
4991 " add.u64 %rd11, %rd8, %rd10;\n"
4992 " st.global.f32 [%rd11+0], %f2;\n"
4996 "$LDWend_packed_float_reduce_1_false_false:\n"
4997 " } // packed_float_reduce_1_false_false\n"
// PTX entry packed_float_reduce_1_false_true (compiler output stored in the reduce_ptx string).
// Params: g_idata (u64 address, loaded as u8), g_odata (u64 address, stored as f32), n (u32).
// Visible behaviour: r4 = ctaid.x*2 + tid.x; when r4 < n a loop copies four consecutive
// g_idata bytes into a 4-byte .local buffer in reversed byte order; after the loop the buffer
// is read back as one f32, stored to __smem[tid.x], and thread 0 stores __smem[0] to
// g_odata[ctaid.x].
// NOTE(review): the byte offset is narrowed with cvt.u8.u32 before it is added to g_idata, so
// only its low 8 bits select the source bytes, and no add.f32 is visible in the loop (each pass
// overwrites the local buffer). Both look suspicious for a reduction - confirm against reduce.cu.
// NOTE(review): the numbers embedded in this listing are not contiguous (e.g. 5003 and 5028 are
// absent), so branch-target labels and possibly other statements are not visible here.
4999 " .entry packed_float_reduce_1_false_true (\n"
5000 " .param .u64 __cudaparm_packed_float_reduce_1_false_true_g_idata,\n"
5001 " .param .u64 __cudaparm_packed_float_reduce_1_false_true_g_odata,\n"
5002 " .param .u32 __cudaparm_packed_float_reduce_1_false_true_n)\n"
5004 " .reg .u16 %rh<7>;\n"
5005 " .reg .u32 %r<14>;\n"
5006 " .reg .u64 %rd<13>;\n"
5007 " .reg .f32 %f<4>;\n"
5008 " .reg .pred %p<5>;\n"
5009 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5011 "$LDWbegin_packed_float_reduce_1_false_true:\n"
// r4 = ctaid.x * 2 + tid.x; branch away when r4 >= n.
5013 " cvt.u32.u16 %r1, %ctaid.x;\n"
5014 " mul24.lo.u32 %r2, %r1, 2;\n"
5015 " cvt.u32.u16 %r3, %tid.x;\n"
5016 " add.u32 %r4, %r2, %r3;\n"
5017 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_true_n];\n"
5018 " setp.ge.u32 %p1, %r4, %r5;\n"
5019 " @%p1 bra $Lt_46_18178;\n"
// r9 = r4*4 + 4 (running byte offset), r10 = n*4 + 4 (bound), r8 = nctaid.x * 8 (byte stride).
5020 " mul.lo.u32 %r6, %r4, 4;\n"
5021 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_true_n];\n"
5022 " mul.lo.u32 %r7, %r5, 4;\n"
5023 " mov.u16 %rh1, %nctaid.x;\n"
5024 " mul.wide.u16 %r8, %rh1, 8;\n"
5025 " add.u32 %r9, %r6, 4;\n"
5026 " add.u32 %r10, %r7, 4;\n"
5027 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_true_g_idata];\n"
// Loop: rd3 = g_idata + (r9 narrowed to u8); bytes rd3[0..3] go to local bytes [3..0];
// r9 advances by r8 and the loop repeats while r9 < r10.
5029 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5031 " cvt.u8.u32 %r11, %r9;\n"
5032 " cvt.u64.u32 %rd2, %r11;\n"
5034 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_true_g_idata];\n"
5036 " add.u64 %rd3, %rd2, %rd1;\n"
5037 " ld.global.u8 %rh2, [%rd3+0];\n"
5038 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5040 " ld.global.u8 %rh3, [%rd3+1];\n"
5041 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5043 " ld.global.u8 %rh4, [%rd3+2];\n"
5044 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5046 " ld.global.u8 %rh5, [%rd3+3];\n"
5047 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5048 " add.u32 %r9, %r8, %r9;\n"
5049 " setp.lt.u32 %p2, %r9, %r10;\n"
5050 " @%p2 bra $Lt_46_17154;\n"
// After the loop: reinterpret the 4 local bytes as one f32.
5051 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5052 " bra.uni $Lt_46_16642;\n"
// f1 = 0 - presumably the path reached when r4 >= n (its label is not visible here).
5054 " mov.f32 %f1, 0f00000000; // 0\n"
// Store f1 to __smem + tid.x * 4.
5057 " mov.u64 %rd4, __smem;\n"
5058 " cvt.u64.u32 %rd5, %r3;\n"
5059 " mul.wide.u32 %rd6, %r3, 4;\n"
5060 " add.u64 %rd7, %rd4, %rd6;\n"
5061 " st.shared.f32 [%rd7+0], %f1;\n"
// Only tid.x == 0 continues: copy __smem[0] to g_odata + ctaid.x * 4.
5064 " mov.u32 %r12, 0;\n"
5065 " setp.ne.u32 %p3, %r3, %r12;\n"
5066 " @%p3 bra $Lt_46_17666;\n"
5068 " ld.shared.f32 %f2, [__smem+0];\n"
5069 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_false_true_g_odata];\n"
5070 " cvt.u64.u32 %rd9, %r1;\n"
5071 " mul.wide.u32 %rd10, %r1, 4;\n"
5072 " add.u64 %rd11, %rd8, %rd10;\n"
5073 " st.global.f32 [%rd11+0], %f2;\n"
5077 "$LDWend_packed_float_reduce_1_false_true:\n"
5078 " } // packed_float_reduce_1_false_true\n"
// PTX entry packed_float_reduce_1_true_false (compiler output stored in the reduce_ptx string).
// Params: g_idata (u64 address, loaded as u8), g_odata (u64 address, stored as f32), n (u32).
// Visible behaviour: r4 = ctaid.x*2 + tid.x; when r4 < n a loop copies four consecutive
// g_idata bytes into a 4-byte .local buffer in reversed byte order; after the loop the buffer
// is read back as one f32, stored to __smem[tid.x], and thread 0 stores __smem[0] to
// g_odata[ctaid.x].
// NOTE(review): the byte offset is narrowed with cvt.u8.u32 before it is added to g_idata, so
// only its low 8 bits select the source bytes, and no add.f32 is visible in the loop (each pass
// overwrites the local buffer). Both look suspicious for a reduction - confirm against reduce.cu.
// NOTE(review): the numbers embedded in this listing are not contiguous (e.g. 5084 and 5109 are
// absent), so branch-target labels and possibly other statements are not visible here.
5080 " .entry packed_float_reduce_1_true_false (\n"
5081 " .param .u64 __cudaparm_packed_float_reduce_1_true_false_g_idata,\n"
5082 " .param .u64 __cudaparm_packed_float_reduce_1_true_false_g_odata,\n"
5083 " .param .u32 __cudaparm_packed_float_reduce_1_true_false_n)\n"
5085 " .reg .u16 %rh<7>;\n"
5086 " .reg .u32 %r<14>;\n"
5087 " .reg .u64 %rd<13>;\n"
5088 " .reg .f32 %f<4>;\n"
5089 " .reg .pred %p<5>;\n"
5090 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5092 "$LDWbegin_packed_float_reduce_1_true_false:\n"
// r4 = ctaid.x * 2 + tid.x; branch away when r4 >= n.
5094 " cvt.u32.u16 %r1, %ctaid.x;\n"
5095 " mul24.lo.u32 %r2, %r1, 2;\n"
5096 " cvt.u32.u16 %r3, %tid.x;\n"
5097 " add.u32 %r4, %r2, %r3;\n"
5098 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_false_n];\n"
5099 " setp.ge.u32 %p1, %r4, %r5;\n"
5100 " @%p1 bra $Lt_47_18178;\n"
// r9 = r4*4 + 4 (running byte offset), r10 = n*4 + 4 (bound), r8 = nctaid.x * 8 (byte stride).
5101 " mul.lo.u32 %r6, %r4, 4;\n"
5102 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_false_n];\n"
5103 " mul.lo.u32 %r7, %r5, 4;\n"
5104 " mov.u16 %rh1, %nctaid.x;\n"
5105 " mul.wide.u16 %r8, %rh1, 8;\n"
5106 " add.u32 %r9, %r6, 4;\n"
5107 " add.u32 %r10, %r7, 4;\n"
5108 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_false_g_idata];\n"
// Loop: rd3 = g_idata + (r9 narrowed to u8); bytes rd3[0..3] go to local bytes [3..0];
// r9 advances by r8 and the loop repeats while r9 < r10.
5110 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5112 " cvt.u8.u32 %r11, %r9;\n"
5113 " cvt.u64.u32 %rd2, %r11;\n"
5115 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_false_g_idata];\n"
5117 " add.u64 %rd3, %rd2, %rd1;\n"
5118 " ld.global.u8 %rh2, [%rd3+0];\n"
5119 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5121 " ld.global.u8 %rh3, [%rd3+1];\n"
5122 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5124 " ld.global.u8 %rh4, [%rd3+2];\n"
5125 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5127 " ld.global.u8 %rh5, [%rd3+3];\n"
5128 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5129 " add.u32 %r9, %r8, %r9;\n"
5130 " setp.lt.u32 %p2, %r9, %r10;\n"
5131 " @%p2 bra $Lt_47_17154;\n"
// After the loop: reinterpret the 4 local bytes as one f32.
5132 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5133 " bra.uni $Lt_47_16642;\n"
// f1 = 0 - presumably the path reached when r4 >= n (its label is not visible here).
5135 " mov.f32 %f1, 0f00000000; // 0\n"
// Store f1 to __smem + tid.x * 4.
5138 " mov.u64 %rd4, __smem;\n"
5139 " cvt.u64.u32 %rd5, %r3;\n"
5140 " mul.wide.u32 %rd6, %r3, 4;\n"
5141 " add.u64 %rd7, %rd4, %rd6;\n"
5142 " st.shared.f32 [%rd7+0], %f1;\n"
// Only tid.x == 0 continues: copy __smem[0] to g_odata + ctaid.x * 4.
5145 " mov.u32 %r12, 0;\n"
5146 " setp.ne.u32 %p3, %r3, %r12;\n"
5147 " @%p3 bra $Lt_47_17666;\n"
5149 " ld.shared.f32 %f2, [__smem+0];\n"
5150 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_true_false_g_odata];\n"
5151 " cvt.u64.u32 %rd9, %r1;\n"
5152 " mul.wide.u32 %rd10, %r1, 4;\n"
5153 " add.u64 %rd11, %rd8, %rd10;\n"
5154 " st.global.f32 [%rd11+0], %f2;\n"
5158 "$LDWend_packed_float_reduce_1_true_false:\n"
5159 " } // packed_float_reduce_1_true_false\n"
// PTX entry packed_float_reduce_1_true_true (compiler output stored in the reduce_ptx string).
// Params: g_idata (u64 address, loaded as u8), g_odata (u64 address, stored as f32), n (u32).
// Visible behaviour: r4 = ctaid.x*2 + tid.x; when r4 < n a loop copies four consecutive
// g_idata bytes into a 4-byte .local buffer in reversed byte order; after the loop the buffer
// is read back as one f32, stored to __smem[tid.x], and thread 0 stores __smem[0] to
// g_odata[ctaid.x].
// NOTE(review): the byte offset is narrowed with cvt.u8.u32 before it is added to g_idata, so
// only its low 8 bits select the source bytes, and no add.f32 is visible in the loop (each pass
// overwrites the local buffer). Both look suspicious for a reduction - confirm against reduce.cu.
// NOTE(review): the numbers embedded in this listing are not contiguous (e.g. 5165 and 5190 are
// absent), so branch-target labels and possibly other statements are not visible here.
5161 " .entry packed_float_reduce_1_true_true (\n"
5162 " .param .u64 __cudaparm_packed_float_reduce_1_true_true_g_idata,\n"
5163 " .param .u64 __cudaparm_packed_float_reduce_1_true_true_g_odata,\n"
5164 " .param .u32 __cudaparm_packed_float_reduce_1_true_true_n)\n"
5166 " .reg .u16 %rh<7>;\n"
5167 " .reg .u32 %r<14>;\n"
5168 " .reg .u64 %rd<13>;\n"
5169 " .reg .f32 %f<4>;\n"
5170 " .reg .pred %p<5>;\n"
5171 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5173 "$LDWbegin_packed_float_reduce_1_true_true:\n"
// r4 = ctaid.x * 2 + tid.x; branch away when r4 >= n.
5175 " cvt.u32.u16 %r1, %ctaid.x;\n"
5176 " mul24.lo.u32 %r2, %r1, 2;\n"
5177 " cvt.u32.u16 %r3, %tid.x;\n"
5178 " add.u32 %r4, %r2, %r3;\n"
5179 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_true_n];\n"
5180 " setp.ge.u32 %p1, %r4, %r5;\n"
5181 " @%p1 bra $Lt_48_18178;\n"
// r9 = r4*4 + 4 (running byte offset), r10 = n*4 + 4 (bound), r8 = nctaid.x * 8 (byte stride).
5182 " mul.lo.u32 %r6, %r4, 4;\n"
5183 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_true_n];\n"
5184 " mul.lo.u32 %r7, %r5, 4;\n"
5185 " mov.u16 %rh1, %nctaid.x;\n"
5186 " mul.wide.u16 %r8, %rh1, 8;\n"
5187 " add.u32 %r9, %r6, 4;\n"
5188 " add.u32 %r10, %r7, 4;\n"
5189 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_true_g_idata];\n"
// Loop: rd3 = g_idata + (r9 narrowed to u8); bytes rd3[0..3] go to local bytes [3..0];
// r9 advances by r8 and the loop repeats while r9 < r10.
5191 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5193 " cvt.u8.u32 %r11, %r9;\n"
5194 " cvt.u64.u32 %rd2, %r11;\n"
5196 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_true_g_idata];\n"
5198 " add.u64 %rd3, %rd2, %rd1;\n"
5199 " ld.global.u8 %rh2, [%rd3+0];\n"
5200 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5202 " ld.global.u8 %rh3, [%rd3+1];\n"
5203 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5205 " ld.global.u8 %rh4, [%rd3+2];\n"
5206 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5208 " ld.global.u8 %rh5, [%rd3+3];\n"
5209 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5210 " add.u32 %r9, %r8, %r9;\n"
5211 " setp.lt.u32 %p2, %r9, %r10;\n"
5212 " @%p2 bra $Lt_48_17154;\n"
// After the loop: reinterpret the 4 local bytes as one f32.
5213 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5214 " bra.uni $Lt_48_16642;\n"
// f1 = 0 - presumably the path reached when r4 >= n (its label is not visible here).
5216 " mov.f32 %f1, 0f00000000; // 0\n"
// Store f1 to __smem + tid.x * 4.
5219 " mov.u64 %rd4, __smem;\n"
5220 " cvt.u64.u32 %rd5, %r3;\n"
5221 " mul.wide.u32 %rd6, %r3, 4;\n"
5222 " add.u64 %rd7, %rd4, %rd6;\n"
5223 " st.shared.f32 [%rd7+0], %f1;\n"
// Only tid.x == 0 continues: copy __smem[0] to g_odata + ctaid.x * 4.
5226 " mov.u32 %r12, 0;\n"
5227 " setp.ne.u32 %p3, %r3, %r12;\n"
5228 " @%p3 bra $Lt_48_17666;\n"
5230 " ld.shared.f32 %f2, [__smem+0];\n"
5231 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_true_true_g_odata];\n"
5232 " cvt.u64.u32 %rd9, %r1;\n"
5233 " mul.wide.u32 %rd10, %r1, 4;\n"
5234 " add.u64 %rd11, %rd8, %rd10;\n"
5235 " st.global.f32 [%rd11+0], %f2;\n"
5239 "$LDWend_packed_float_reduce_1_true_true:\n"
5240 " } // packed_float_reduce_1_true_true\n"
5242 " .entry packed_float_reduce_2_false_false (\n"
5243 " .param .u64 __cudaparm_packed_float_reduce_2_false_false_g_idata,\n"
5244 " .param .u64 __cudaparm_packed_float_reduce_2_false_false_g_odata,\n"
5245 " .param .u32 __cudaparm_packed_float_reduce_2_false_false_n)\n"
5247 " .reg .u16 %rh<7>;\n"
5248 " .reg .u32 %r<15>;\n"
5249 " .reg .u64 %rd<13>;\n"
5250 " .reg .f32 %f<5>;\n"
5251 " .reg .pred %p<6>;\n"
5252 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5254 "$LDWbegin_packed_float_reduce_2_false_false:\n"
5256 " cvt.u32.u16 %r1, %ctaid.x;\n"
5257 " mul24.lo.u32 %r2, %r1, 4;\n"
5258 " cvt.u32.u16 %r3, %tid.x;\n"
5259 " add.u32 %r4, %r2, %r3;\n"
5260 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_false_n];\n"
5261 " setp.ge.u32 %p1, %r4, %r5;\n"
5262 " @%p1 bra $Lt_49_18434;\n"
5263 " mul.lo.u32 %r6, %r4, 4;\n"
5264 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_false_n];\n"
5265 " mul.lo.u32 %r7, %r5, 4;\n"
5266 " mov.u16 %rh1, %nctaid.x;\n"
5267 " mul.wide.u16 %r8, %rh1, 16;\n"
5268 " add.u32 %r9, %r6, 8;\n"
5269 " add.u32 %r10, %r7, 8;\n"
5270 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_false_g_idata];\n"
5272 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5274 " cvt.u8.u32 %r11, %r9;\n"
5275 " cvt.u64.u32 %rd2, %r11;\n"
5277 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_false_g_idata];\n"
5279 " add.u64 %rd3, %rd2, %rd1;\n"
5280 " ld.global.u8 %rh2, [%rd3+0];\n"
5281 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5283 " ld.global.u8 %rh3, [%rd3+1];\n"
5284 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5286 " ld.global.u8 %rh4, [%rd3+2];\n"
5287 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5289 " ld.global.u8 %rh5, [%rd3+3];\n"
5290 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5291 " add.u32 %r9, %r8, %r9;\n"
5292 " setp.lt.u32 %p2, %r9, %r10;\n"
5293 " @%p2 bra $Lt_49_16898;\n"
5294 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5295 " bra.uni $Lt_49_16386;\n"
5297 " mov.f32 %f1, 0f00000000; // 0\n"
5300 " mov.u64 %rd4, __smem;\n"
5301 " cvt.u64.u32 %rd5, %r3;\n"
5302 " mul.wide.u32 %rd6, %r3, 4;\n"
5303 " add.u64 %rd7, %rd4, %rd6;\n"
5304 " st.shared.f32 [%rd7+0], %f1;\n"
5307 " mov.u32 %r12, 31;\n"
5308 " setp.gt.u32 %p3, %r3, %r12;\n"
5309 " @%p3 bra $Lt_49_17410;\n"
5311 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5312 " add.f32 %f1, %f2, %f1;\n"
5313 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5315 " mov.u32 %r13, 0;\n"
5316 " setp.ne.u32 %p4, %r3, %r13;\n"
5317 " @%p4 bra $Lt_49_17922;\n"
5319 " ld.shared.f32 %f3, [__smem+0];\n"
5320 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_false_false_g_odata];\n"
5321 " cvt.u64.u32 %rd9, %r1;\n"
5322 " mul.wide.u32 %rd10, %r1, 4;\n"
5323 " add.u64 %rd11, %rd8, %rd10;\n"
5324 " st.global.f32 [%rd11+0], %f3;\n"
5328 "$LDWend_packed_float_reduce_2_false_false:\n"
5329 " } // packed_float_reduce_2_false_false\n"
5331 " .entry packed_float_reduce_2_false_true (\n"
5332 " .param .u64 __cudaparm_packed_float_reduce_2_false_true_g_idata,\n"
5333 " .param .u64 __cudaparm_packed_float_reduce_2_false_true_g_odata,\n"
5334 " .param .u32 __cudaparm_packed_float_reduce_2_false_true_n)\n"
5336 " .reg .u16 %rh<7>;\n"
5337 " .reg .u32 %r<15>;\n"
5338 " .reg .u64 %rd<13>;\n"
5339 " .reg .f32 %f<5>;\n"
5340 " .reg .pred %p<6>;\n"
5341 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5343 "$LDWbegin_packed_float_reduce_2_false_true:\n"
5345 " cvt.u32.u16 %r1, %ctaid.x;\n"
5346 " mul24.lo.u32 %r2, %r1, 4;\n"
5347 " cvt.u32.u16 %r3, %tid.x;\n"
5348 " add.u32 %r4, %r2, %r3;\n"
5349 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_true_n];\n"
5350 " setp.ge.u32 %p1, %r4, %r5;\n"
5351 " @%p1 bra $Lt_50_18434;\n"
5352 " mul.lo.u32 %r6, %r4, 4;\n"
5353 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_true_n];\n"
5354 " mul.lo.u32 %r7, %r5, 4;\n"
5355 " mov.u16 %rh1, %nctaid.x;\n"
5356 " mul.wide.u16 %r8, %rh1, 16;\n"
5357 " add.u32 %r9, %r6, 8;\n"
5358 " add.u32 %r10, %r7, 8;\n"
5359 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_true_g_idata];\n"
5361 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5363 " cvt.u8.u32 %r11, %r9;\n"
5364 " cvt.u64.u32 %rd2, %r11;\n"
5366 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_true_g_idata];\n"
5368 " add.u64 %rd3, %rd2, %rd1;\n"
5369 " ld.global.u8 %rh2, [%rd3+0];\n"
5370 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5372 " ld.global.u8 %rh3, [%rd3+1];\n"
5373 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5375 " ld.global.u8 %rh4, [%rd3+2];\n"
5376 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5378 " ld.global.u8 %rh5, [%rd3+3];\n"
5379 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5380 " add.u32 %r9, %r8, %r9;\n"
5381 " setp.lt.u32 %p2, %r9, %r10;\n"
5382 " @%p2 bra $Lt_50_16898;\n"
5383 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5384 " bra.uni $Lt_50_16386;\n"
5386 " mov.f32 %f1, 0f00000000; // 0\n"
5389 " mov.u64 %rd4, __smem;\n"
5390 " cvt.u64.u32 %rd5, %r3;\n"
5391 " mul.wide.u32 %rd6, %r3, 4;\n"
5392 " add.u64 %rd7, %rd4, %rd6;\n"
5393 " st.shared.f32 [%rd7+0], %f1;\n"
5396 " mov.u32 %r12, 31;\n"
5397 " setp.gt.u32 %p3, %r3, %r12;\n"
5398 " @%p3 bra $Lt_50_17410;\n"
5400 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5401 " add.f32 %f1, %f2, %f1;\n"
5402 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5404 " mov.u32 %r13, 0;\n"
5405 " setp.ne.u32 %p4, %r3, %r13;\n"
5406 " @%p4 bra $Lt_50_17922;\n"
5408 " ld.shared.f32 %f3, [__smem+0];\n"
5409 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_false_true_g_odata];\n"
5410 " cvt.u64.u32 %rd9, %r1;\n"
5411 " mul.wide.u32 %rd10, %r1, 4;\n"
5412 " add.u64 %rd11, %rd8, %rd10;\n"
5413 " st.global.f32 [%rd11+0], %f3;\n"
5417 "$LDWend_packed_float_reduce_2_false_true:\n"
5418 " } // packed_float_reduce_2_false_true\n"
5420 " .entry packed_float_reduce_2_true_false (\n"
5421 " .param .u64 __cudaparm_packed_float_reduce_2_true_false_g_idata,\n"
5422 " .param .u64 __cudaparm_packed_float_reduce_2_true_false_g_odata,\n"
5423 " .param .u32 __cudaparm_packed_float_reduce_2_true_false_n)\n"
5425 " .reg .u16 %rh<7>;\n"
5426 " .reg .u32 %r<15>;\n"
5427 " .reg .u64 %rd<13>;\n"
5428 " .reg .f32 %f<5>;\n"
5429 " .reg .pred %p<6>;\n"
5430 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5432 "$LDWbegin_packed_float_reduce_2_true_false:\n"
5434 " cvt.u32.u16 %r1, %ctaid.x;\n"
5435 " mul24.lo.u32 %r2, %r1, 4;\n"
5436 " cvt.u32.u16 %r3, %tid.x;\n"
5437 " add.u32 %r4, %r2, %r3;\n"
5438 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_false_n];\n"
5439 " setp.ge.u32 %p1, %r4, %r5;\n"
5440 " @%p1 bra $Lt_51_18434;\n"
5441 " mul.lo.u32 %r6, %r4, 4;\n"
5442 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_false_n];\n"
5443 " mul.lo.u32 %r7, %r5, 4;\n"
5444 " mov.u16 %rh1, %nctaid.x;\n"
5445 " mul.wide.u16 %r8, %rh1, 16;\n"
5446 " add.u32 %r9, %r6, 8;\n"
5447 " add.u32 %r10, %r7, 8;\n"
5448 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_false_g_idata];\n"
5450 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5452 " cvt.u8.u32 %r11, %r9;\n"
5453 " cvt.u64.u32 %rd2, %r11;\n"
5455 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_false_g_idata];\n"
5457 " add.u64 %rd3, %rd2, %rd1;\n"
5458 " ld.global.u8 %rh2, [%rd3+0];\n"
5459 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5461 " ld.global.u8 %rh3, [%rd3+1];\n"
5462 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5464 " ld.global.u8 %rh4, [%rd3+2];\n"
5465 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5467 " ld.global.u8 %rh5, [%rd3+3];\n"
5468 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5469 " add.u32 %r9, %r8, %r9;\n"
5470 " setp.lt.u32 %p2, %r9, %r10;\n"
5471 " @%p2 bra $Lt_51_16898;\n"
5472 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5473 " bra.uni $Lt_51_16386;\n"
5475 " mov.f32 %f1, 0f00000000; // 0\n"
5478 " mov.u64 %rd4, __smem;\n"
5479 " cvt.u64.u32 %rd5, %r3;\n"
5480 " mul.wide.u32 %rd6, %r3, 4;\n"
5481 " add.u64 %rd7, %rd4, %rd6;\n"
5482 " st.shared.f32 [%rd7+0], %f1;\n"
5485 " mov.u32 %r12, 31;\n"
5486 " setp.gt.u32 %p3, %r3, %r12;\n"
5487 " @%p3 bra $Lt_51_17410;\n"
5489 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5490 " add.f32 %f1, %f2, %f1;\n"
5491 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5493 " mov.u32 %r13, 0;\n"
5494 " setp.ne.u32 %p4, %r3, %r13;\n"
5495 " @%p4 bra $Lt_51_17922;\n"
5497 " ld.shared.f32 %f3, [__smem+0];\n"
5498 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_true_false_g_odata];\n"
5499 " cvt.u64.u32 %rd9, %r1;\n"
5500 " mul.wide.u32 %rd10, %r1, 4;\n"
5501 " add.u64 %rd11, %rd8, %rd10;\n"
5502 " st.global.f32 [%rd11+0], %f3;\n"
5506 "$LDWend_packed_float_reduce_2_true_false:\n"
5507 " } // packed_float_reduce_2_true_false\n"
5509 " .entry packed_float_reduce_2_true_true (\n"
5510 " .param .u64 __cudaparm_packed_float_reduce_2_true_true_g_idata,\n"
5511 " .param .u64 __cudaparm_packed_float_reduce_2_true_true_g_odata,\n"
5512 " .param .u32 __cudaparm_packed_float_reduce_2_true_true_n)\n"
5514 " .reg .u16 %rh<7>;\n"
5515 " .reg .u32 %r<15>;\n"
5516 " .reg .u64 %rd<13>;\n"
5517 " .reg .f32 %f<5>;\n"
5518 " .reg .pred %p<6>;\n"
5519 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5521 "$LDWbegin_packed_float_reduce_2_true_true:\n"
5523 " cvt.u32.u16 %r1, %ctaid.x;\n"
5524 " mul24.lo.u32 %r2, %r1, 4;\n"
5525 " cvt.u32.u16 %r3, %tid.x;\n"
5526 " add.u32 %r4, %r2, %r3;\n"
5527 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_true_n];\n"
5528 " setp.ge.u32 %p1, %r4, %r5;\n"
5529 " @%p1 bra $Lt_52_18434;\n"
5530 " mul.lo.u32 %r6, %r4, 4;\n"
5531 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_true_n];\n"
5532 " mul.lo.u32 %r7, %r5, 4;\n"
5533 " mov.u16 %rh1, %nctaid.x;\n"
5534 " mul.wide.u16 %r8, %rh1, 16;\n"
5535 " add.u32 %r9, %r6, 8;\n"
5536 " add.u32 %r10, %r7, 8;\n"
5537 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_true_g_idata];\n"
5539 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5541 " cvt.u8.u32 %r11, %r9;\n"
5542 " cvt.u64.u32 %rd2, %r11;\n"
5544 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_true_g_idata];\n"
5546 " add.u64 %rd3, %rd2, %rd1;\n"
5547 " ld.global.u8 %rh2, [%rd3+0];\n"
5548 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5550 " ld.global.u8 %rh3, [%rd3+1];\n"
5551 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5553 " ld.global.u8 %rh4, [%rd3+2];\n"
5554 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5556 " ld.global.u8 %rh5, [%rd3+3];\n"
5557 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5558 " add.u32 %r9, %r8, %r9;\n"
5559 " setp.lt.u32 %p2, %r9, %r10;\n"
5560 " @%p2 bra $Lt_52_16898;\n"
5561 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5562 " bra.uni $Lt_52_16386;\n"
5564 " mov.f32 %f1, 0f00000000; // 0\n"
5567 " mov.u64 %rd4, __smem;\n"
5568 " cvt.u64.u32 %rd5, %r3;\n"
5569 " mul.wide.u32 %rd6, %r3, 4;\n"
5570 " add.u64 %rd7, %rd4, %rd6;\n"
5571 " st.shared.f32 [%rd7+0], %f1;\n"
5574 " mov.u32 %r12, 31;\n"
5575 " setp.gt.u32 %p3, %r3, %r12;\n"
5576 " @%p3 bra $Lt_52_17410;\n"
5578 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5579 " add.f32 %f1, %f2, %f1;\n"
5580 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5582 " mov.u32 %r13, 0;\n"
5583 " setp.ne.u32 %p4, %r3, %r13;\n"
5584 " @%p4 bra $Lt_52_17922;\n"
5586 " ld.shared.f32 %f3, [__smem+0];\n"
5587 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_true_true_g_odata];\n"
5588 " cvt.u64.u32 %rd9, %r1;\n"
5589 " mul.wide.u32 %rd10, %r1, 4;\n"
5590 " add.u64 %rd11, %rd8, %rd10;\n"
5591 " st.global.f32 [%rd11+0], %f3;\n"
5595 "$LDWend_packed_float_reduce_2_true_true:\n"
5596 " } // packed_float_reduce_2_true_true\n"
5598 " .entry packed_float_reduce_4_false_false (\n"
5599 " .param .u64 __cudaparm_packed_float_reduce_4_false_false_g_idata,\n"
5600 " .param .u64 __cudaparm_packed_float_reduce_4_false_false_g_odata,\n"
5601 " .param .u32 __cudaparm_packed_float_reduce_4_false_false_n)\n"
5603 " .reg .u16 %rh<7>;\n"
5604 " .reg .u32 %r<15>;\n"
5605 " .reg .u64 %rd<13>;\n"
5606 " .reg .f32 %f<7>;\n"
5607 " .reg .pred %p<6>;\n"
5608 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5610 "$LDWbegin_packed_float_reduce_4_false_false:\n"
5612 " cvt.u32.u16 %r1, %ctaid.x;\n"
5613 " mul24.lo.u32 %r2, %r1, 8;\n"
5614 " cvt.u32.u16 %r3, %tid.x;\n"
5615 " add.u32 %r4, %r2, %r3;\n"
5616 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_false_n];\n"
5617 " setp.ge.u32 %p1, %r4, %r5;\n"
5618 " @%p1 bra $Lt_53_18178;\n"
5619 " mul.lo.u32 %r6, %r4, 4;\n"
5620 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_false_n];\n"
5621 " mul.lo.u32 %r7, %r5, 4;\n"
5622 " mov.u16 %rh1, %nctaid.x;\n"
5623 " mul.wide.u16 %r8, %rh1, 32;\n"
5624 " add.u32 %r9, %r6, 16;\n"
5625 " add.u32 %r10, %r7, 16;\n"
5626 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_false_g_idata];\n"
5628 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5630 " cvt.u8.u32 %r11, %r9;\n"
5631 " cvt.u64.u32 %rd2, %r11;\n"
5633 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_false_g_idata];\n"
5635 " add.u64 %rd3, %rd2, %rd1;\n"
5636 " ld.global.u8 %rh2, [%rd3+0];\n"
5637 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5639 " ld.global.u8 %rh3, [%rd3+1];\n"
5640 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5642 " ld.global.u8 %rh4, [%rd3+2];\n"
5643 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5645 " ld.global.u8 %rh5, [%rd3+3];\n"
5646 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5647 " add.u32 %r9, %r8, %r9;\n"
5648 " setp.lt.u32 %p2, %r9, %r10;\n"
5649 " @%p2 bra $Lt_53_16642;\n"
5650 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5651 " bra.uni $Lt_53_16130;\n"
5653 " mov.f32 %f1, 0f00000000; // 0\n"
5656 " mov.u64 %rd4, __smem;\n"
5657 " cvt.u64.u32 %rd5, %r3;\n"
5658 " mul.wide.u32 %rd6, %r3, 4;\n"
5659 " add.u64 %rd7, %rd4, %rd6;\n"
5660 " st.shared.f32 [%rd7+0], %f1;\n"
5663 " mov.u32 %r12, 31;\n"
5664 " setp.gt.u32 %p3, %r3, %r12;\n"
5665 " @%p3 bra $Lt_53_17154;\n"
5667 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5668 " add.f32 %f3, %f2, %f1;\n"
5669 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5671 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5672 " add.f32 %f1, %f4, %f3;\n"
5673 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5675 " mov.u32 %r13, 0;\n"
5676 " setp.ne.u32 %p4, %r3, %r13;\n"
5677 " @%p4 bra $Lt_53_17666;\n"
5679 " ld.shared.f32 %f5, [__smem+0];\n"
5680 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_false_false_g_odata];\n"
5681 " cvt.u64.u32 %rd9, %r1;\n"
5682 " mul.wide.u32 %rd10, %r1, 4;\n"
5683 " add.u64 %rd11, %rd8, %rd10;\n"
5684 " st.global.f32 [%rd11+0], %f5;\n"
5688 "$LDWend_packed_float_reduce_4_false_false:\n"
5689 " } // packed_float_reduce_4_false_false\n"
5691 " .entry packed_float_reduce_4_false_true (\n"
5692 " .param .u64 __cudaparm_packed_float_reduce_4_false_true_g_idata,\n"
5693 " .param .u64 __cudaparm_packed_float_reduce_4_false_true_g_odata,\n"
5694 " .param .u32 __cudaparm_packed_float_reduce_4_false_true_n)\n"
5696 " .reg .u16 %rh<7>;\n"
5697 " .reg .u32 %r<15>;\n"
5698 " .reg .u64 %rd<13>;\n"
5699 " .reg .f32 %f<7>;\n"
5700 " .reg .pred %p<6>;\n"
5701 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5703 "$LDWbegin_packed_float_reduce_4_false_true:\n"
5705 " cvt.u32.u16 %r1, %ctaid.x;\n"
5706 " mul24.lo.u32 %r2, %r1, 8;\n"
5707 " cvt.u32.u16 %r3, %tid.x;\n"
5708 " add.u32 %r4, %r2, %r3;\n"
5709 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_true_n];\n"
5710 " setp.ge.u32 %p1, %r4, %r5;\n"
5711 " @%p1 bra $Lt_54_18178;\n"
5712 " mul.lo.u32 %r6, %r4, 4;\n"
5713 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_true_n];\n"
5714 " mul.lo.u32 %r7, %r5, 4;\n"
5715 " mov.u16 %rh1, %nctaid.x;\n"
5716 " mul.wide.u16 %r8, %rh1, 32;\n"
5717 " add.u32 %r9, %r6, 16;\n"
5718 " add.u32 %r10, %r7, 16;\n"
5719 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_true_g_idata];\n"
5721 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5723 " cvt.u8.u32 %r11, %r9;\n"
5724 " cvt.u64.u32 %rd2, %r11;\n"
5726 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_true_g_idata];\n"
5728 " add.u64 %rd3, %rd2, %rd1;\n"
5729 " ld.global.u8 %rh2, [%rd3+0];\n"
5730 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5732 " ld.global.u8 %rh3, [%rd3+1];\n"
5733 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5735 " ld.global.u8 %rh4, [%rd3+2];\n"
5736 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5738 " ld.global.u8 %rh5, [%rd3+3];\n"
5739 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5740 " add.u32 %r9, %r8, %r9;\n"
5741 " setp.lt.u32 %p2, %r9, %r10;\n"
5742 " @%p2 bra $Lt_54_16642;\n"
5743 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5744 " bra.uni $Lt_54_16130;\n"
5746 " mov.f32 %f1, 0f00000000; // 0\n"
5749 " mov.u64 %rd4, __smem;\n"
5750 " cvt.u64.u32 %rd5, %r3;\n"
5751 " mul.wide.u32 %rd6, %r3, 4;\n"
5752 " add.u64 %rd7, %rd4, %rd6;\n"
5753 " st.shared.f32 [%rd7+0], %f1;\n"
5756 " mov.u32 %r12, 31;\n"
5757 " setp.gt.u32 %p3, %r3, %r12;\n"
5758 " @%p3 bra $Lt_54_17154;\n"
5760 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5761 " add.f32 %f3, %f2, %f1;\n"
5762 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5764 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5765 " add.f32 %f1, %f4, %f3;\n"
5766 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5768 " mov.u32 %r13, 0;\n"
5769 " setp.ne.u32 %p4, %r3, %r13;\n"
5770 " @%p4 bra $Lt_54_17666;\n"
5772 " ld.shared.f32 %f5, [__smem+0];\n"
5773 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_false_true_g_odata];\n"
5774 " cvt.u64.u32 %rd9, %r1;\n"
5775 " mul.wide.u32 %rd10, %r1, 4;\n"
5776 " add.u64 %rd11, %rd8, %rd10;\n"
5777 " st.global.f32 [%rd11+0], %f5;\n"
5781 "$LDWend_packed_float_reduce_4_false_true:\n"
5782 " } // packed_float_reduce_4_false_true\n"
5784 " .entry packed_float_reduce_4_true_false (\n"
5785 " .param .u64 __cudaparm_packed_float_reduce_4_true_false_g_idata,\n"
5786 " .param .u64 __cudaparm_packed_float_reduce_4_true_false_g_odata,\n"
5787 " .param .u32 __cudaparm_packed_float_reduce_4_true_false_n)\n"
5789 " .reg .u16 %rh<7>;\n"
5790 " .reg .u32 %r<15>;\n"
5791 " .reg .u64 %rd<13>;\n"
5792 " .reg .f32 %f<7>;\n"
5793 " .reg .pred %p<6>;\n"
5794 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5796 "$LDWbegin_packed_float_reduce_4_true_false:\n"
5798 " cvt.u32.u16 %r1, %ctaid.x;\n"
5799 " mul24.lo.u32 %r2, %r1, 8;\n"
5800 " cvt.u32.u16 %r3, %tid.x;\n"
5801 " add.u32 %r4, %r2, %r3;\n"
5802 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_false_n];\n"
5803 " setp.ge.u32 %p1, %r4, %r5;\n"
5804 " @%p1 bra $Lt_55_18178;\n"
5805 " mul.lo.u32 %r6, %r4, 4;\n"
5806 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_false_n];\n"
5807 " mul.lo.u32 %r7, %r5, 4;\n"
5808 " mov.u16 %rh1, %nctaid.x;\n"
5809 " mul.wide.u16 %r8, %rh1, 32;\n"
5810 " add.u32 %r9, %r6, 16;\n"
5811 " add.u32 %r10, %r7, 16;\n"
5812 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_false_g_idata];\n"
5814 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5816 " cvt.u8.u32 %r11, %r9;\n"
5817 " cvt.u64.u32 %rd2, %r11;\n"
5819 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_false_g_idata];\n"
5821 " add.u64 %rd3, %rd2, %rd1;\n"
5822 " ld.global.u8 %rh2, [%rd3+0];\n"
5823 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5825 " ld.global.u8 %rh3, [%rd3+1];\n"
5826 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5828 " ld.global.u8 %rh4, [%rd3+2];\n"
5829 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5831 " ld.global.u8 %rh5, [%rd3+3];\n"
5832 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5833 " add.u32 %r9, %r8, %r9;\n"
5834 " setp.lt.u32 %p2, %r9, %r10;\n"
5835 " @%p2 bra $Lt_55_16642;\n"
5836 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5837 " bra.uni $Lt_55_16130;\n"
5839 " mov.f32 %f1, 0f00000000; // 0\n"
5842 " mov.u64 %rd4, __smem;\n"
5843 " cvt.u64.u32 %rd5, %r3;\n"
5844 " mul.wide.u32 %rd6, %r3, 4;\n"
5845 " add.u64 %rd7, %rd4, %rd6;\n"
5846 " st.shared.f32 [%rd7+0], %f1;\n"
5849 " mov.u32 %r12, 31;\n"
5850 " setp.gt.u32 %p3, %r3, %r12;\n"
5851 " @%p3 bra $Lt_55_17154;\n"
5853 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5854 " add.f32 %f3, %f2, %f1;\n"
5855 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5857 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5858 " add.f32 %f1, %f4, %f3;\n"
5859 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5861 " mov.u32 %r13, 0;\n"
5862 " setp.ne.u32 %p4, %r3, %r13;\n"
5863 " @%p4 bra $Lt_55_17666;\n"
5865 " ld.shared.f32 %f5, [__smem+0];\n"
5866 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_true_false_g_odata];\n"
5867 " cvt.u64.u32 %rd9, %r1;\n"
5868 " mul.wide.u32 %rd10, %r1, 4;\n"
5869 " add.u64 %rd11, %rd8, %rd10;\n"
5870 " st.global.f32 [%rd11+0], %f5;\n"
5874 "$LDWend_packed_float_reduce_4_true_false:\n"
5875 " } // packed_float_reduce_4_true_false\n"
5877 " .entry packed_float_reduce_4_true_true (\n"
5878 " .param .u64 __cudaparm_packed_float_reduce_4_true_true_g_idata,\n"
5879 " .param .u64 __cudaparm_packed_float_reduce_4_true_true_g_odata,\n"
5880 " .param .u32 __cudaparm_packed_float_reduce_4_true_true_n)\n"
5882 " .reg .u16 %rh<7>;\n"
5883 " .reg .u32 %r<15>;\n"
5884 " .reg .u64 %rd<13>;\n"
5885 " .reg .f32 %f<7>;\n"
5886 " .reg .pred %p<6>;\n"
5887 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5889 "$LDWbegin_packed_float_reduce_4_true_true:\n"
5891 " cvt.u32.u16 %r1, %ctaid.x;\n"
5892 " mul24.lo.u32 %r2, %r1, 8;\n"
5893 " cvt.u32.u16 %r3, %tid.x;\n"
5894 " add.u32 %r4, %r2, %r3;\n"
5895 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_true_n];\n"
5896 " setp.ge.u32 %p1, %r4, %r5;\n"
5897 " @%p1 bra $Lt_56_18178;\n"
5898 " mul.lo.u32 %r6, %r4, 4;\n"
5899 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_true_n];\n"
5900 " mul.lo.u32 %r7, %r5, 4;\n"
5901 " mov.u16 %rh1, %nctaid.x;\n"
5902 " mul.wide.u16 %r8, %rh1, 32;\n"
5903 " add.u32 %r9, %r6, 16;\n"
5904 " add.u32 %r10, %r7, 16;\n"
5905 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_true_g_idata];\n"
5907 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5909 " cvt.u8.u32 %r11, %r9;\n"
5910 " cvt.u64.u32 %rd2, %r11;\n"
5912 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_true_g_idata];\n"
5914 " add.u64 %rd3, %rd2, %rd1;\n"
5915 " ld.global.u8 %rh2, [%rd3+0];\n"
5916 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5918 " ld.global.u8 %rh3, [%rd3+1];\n"
5919 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5921 " ld.global.u8 %rh4, [%rd3+2];\n"
5922 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5924 " ld.global.u8 %rh5, [%rd3+3];\n"
5925 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5926 " add.u32 %r9, %r8, %r9;\n"
5927 " setp.lt.u32 %p2, %r9, %r10;\n"
5928 " @%p2 bra $Lt_56_16642;\n"
5929 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5930 " bra.uni $Lt_56_16130;\n"
5932 " mov.f32 %f1, 0f00000000; // 0\n"
5935 " mov.u64 %rd4, __smem;\n"
5936 " cvt.u64.u32 %rd5, %r3;\n"
5937 " mul.wide.u32 %rd6, %r3, 4;\n"
5938 " add.u64 %rd7, %rd4, %rd6;\n"
5939 " st.shared.f32 [%rd7+0], %f1;\n"
5942 " mov.u32 %r12, 31;\n"
5943 " setp.gt.u32 %p3, %r3, %r12;\n"
5944 " @%p3 bra $Lt_56_17154;\n"
5946 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5947 " add.f32 %f3, %f2, %f1;\n"
5948 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5950 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5951 " add.f32 %f1, %f4, %f3;\n"
5952 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5954 " mov.u32 %r13, 0;\n"
5955 " setp.ne.u32 %p4, %r3, %r13;\n"
5956 " @%p4 bra $Lt_56_17666;\n"
5958 " ld.shared.f32 %f5, [__smem+0];\n"
5959 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_true_true_g_odata];\n"
5960 " cvt.u64.u32 %rd9, %r1;\n"
5961 " mul.wide.u32 %rd10, %r1, 4;\n"
5962 " add.u64 %rd11, %rd8, %rd10;\n"
5963 " st.global.f32 [%rd11+0], %f5;\n"
5967 "$LDWend_packed_float_reduce_4_true_true:\n"
5968 " } // packed_float_reduce_4_true_true\n"
5970 " .entry packed_float_reduce_8_false_false (\n"
5971 " .param .u64 __cudaparm_packed_float_reduce_8_false_false_g_idata,\n"
5972 " .param .u64 __cudaparm_packed_float_reduce_8_false_false_g_odata,\n"
5973 " .param .u32 __cudaparm_packed_float_reduce_8_false_false_n)\n"
5975 " .reg .u16 %rh<7>;\n"
5976 " .reg .u32 %r<15>;\n"
5977 " .reg .u64 %rd<13>;\n"
5978 " .reg .f32 %f<9>;\n"
5979 " .reg .pred %p<6>;\n"
5980 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5982 "$LDWbegin_packed_float_reduce_8_false_false:\n"
5984 " cvt.u32.u16 %r1, %ctaid.x;\n"
5985 " mul24.lo.u32 %r2, %r1, 16;\n"
5986 " cvt.u32.u16 %r3, %tid.x;\n"
5987 " add.u32 %r4, %r2, %r3;\n"
5988 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_false_false_n];\n"
5989 " setp.ge.u32 %p1, %r4, %r5;\n"
5990 " @%p1 bra $Lt_57_17922;\n"
5991 " mul.lo.u32 %r6, %r4, 4;\n"
5992 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_false_false_n];\n"
5993 " mul.lo.u32 %r7, %r5, 4;\n"
5994 " mov.u16 %rh1, %nctaid.x;\n"
5995 " mul.wide.u16 %r8, %rh1, 64;\n"
5996 " add.u32 %r9, %r6, 32;\n"
5997 " add.u32 %r10, %r7, 32;\n"
5998 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_false_false_g_idata];\n"
6000 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6002 " cvt.u8.u32 %r11, %r9;\n"
6003 " cvt.u64.u32 %rd2, %r11;\n"
6005 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_false_false_g_idata];\n"
6007 " add.u64 %rd3, %rd2, %rd1;\n"
6008 " ld.global.u8 %rh2, [%rd3+0];\n"
6009 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6011 " ld.global.u8 %rh3, [%rd3+1];\n"
6012 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6014 " ld.global.u8 %rh4, [%rd3+2];\n"
6015 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6017 " ld.global.u8 %rh5, [%rd3+3];\n"
6018 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6019 " add.u32 %r9, %r8, %r9;\n"
6020 " setp.lt.u32 %p2, %r9, %r10;\n"
6021 " @%p2 bra $Lt_57_16386;\n"
6022 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6023 " bra.uni $Lt_57_15874;\n"
6025 " mov.f32 %f1, 0f00000000; // 0\n"
6028 " mov.u64 %rd4, __smem;\n"
6029 " cvt.u64.u32 %rd5, %r3;\n"
6030 " mul.wide.u32 %rd6, %r3, 4;\n"
6031 " add.u64 %rd7, %rd4, %rd6;\n"
6032 " st.shared.f32 [%rd7+0], %f1;\n"
6035 " mov.u32 %r12, 31;\n"
6036 " setp.gt.u32 %p3, %r3, %r12;\n"
6037 " @%p3 bra $Lt_57_16898;\n"
6039 " ld.volatile.shared.f32 %f2, [%rd7+16];\n"
6040 " add.f32 %f3, %f2, %f1;\n"
6041 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6043 " ld.volatile.shared.f32 %f4, [%rd7+8];\n"
6044 " add.f32 %f5, %f4, %f3;\n"
6045 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6047 " ld.volatile.shared.f32 %f6, [%rd7+4];\n"
6048 " add.f32 %f1, %f6, %f5;\n"
6049 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6051 " mov.u32 %r13, 0;\n"
6052 " setp.ne.u32 %p4, %r3, %r13;\n"
6053 " @%p4 bra $Lt_57_17410;\n"
6055 " ld.shared.f32 %f7, [__smem+0];\n"
6056 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_8_false_false_g_odata];\n"