KJB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
reduce_ptx.h
Go to the documentation of this file.
1 static const char* reduce_ptx =
2 " .version 1.4\n"
3 " .target sm_10, map_f64_to_f32\n"
4 " // compiled with /usr/local/cuda/open64/lib//be\n"
5 " // nvopencc 3.1 built on 2010-06-07\n"
6 "\n"
7 " //-----------------------------------------------------------\n"
8 " // Compiling /tmp/tmpxft_00007884_00000000-7_reduce.cpp3.i (/tmp/ccBI#.LV1fMO)\n"
9 " //-----------------------------------------------------------\n"
10 "\n"
11 " //-----------------------------------------------------------\n"
12 " // Options:\n"
13 " //-----------------------------------------------------------\n"
14 " // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64\n"
15 " // -O3 (Optimization level)\n"
16 " // -g0 (Debug level)\n"
17 " // -m2 (Report advisories)\n"
18 " //-----------------------------------------------------------\n"
19 "\n"
20 " .file 1 \"<command-line>\"\n"
21 " .file 2 \"/tmp/tmpxft_00007884_00000000-6_reduce.cudafe2.gpu\"\n"
22 " .file 3 \"reduce.cu\"\n"
23 " .file 4 \"/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h\"\n"
24 " .file 5 \"/usr/local/cuda/bin/../include/crt/device_runtime.h\"\n"
25 " .file 6 \"/usr/local/cuda/bin/../include/host_defines.h\"\n"
26 " .file 7 \"/usr/local/cuda/bin/../include/builtin_types.h\"\n"
27 " .file 8 \"/usr/local/cuda/bin/../include/device_types.h\"\n"
28 " .file 9 \"/usr/local/cuda/bin/../include/driver_types.h\"\n"
29 " .file 10 \"/usr/local/cuda/bin/../include/surface_types.h\"\n"
30 " .file 11 \"/usr/local/cuda/bin/../include/texture_types.h\"\n"
31 " .file 12 \"/usr/local/cuda/bin/../include/vector_types.h\"\n"
32 " .file 13 \"/usr/local/cuda/bin/../include/device_launch_parameters.h\"\n"
33 " .file 14 \"/usr/local/cuda/bin/../include/crt/storage_class.h\"\n"
34 " .file 15 \"/usr/include/bits/types.h\"\n"
35 " .file 16 \"/usr/include/time.h\"\n"
36 " .file 17 \"/usr/local/cuda/bin/../include/common_functions.h\"\n"
37 " .file 18 \"/usr/local/cuda/bin/../include/math_functions.h\"\n"
38 " .file 19 \"/usr/local/cuda/bin/../include/math_constants.h\"\n"
39 " .file 20 \"/usr/local/cuda/bin/../include/device_functions.h\"\n"
40 " .file 21 \"/usr/local/cuda/bin/../include/sm_11_atomic_functions.h\"\n"
41 " .file 22 \"/usr/local/cuda/bin/../include/sm_12_atomic_functions.h\"\n"
42 " .file 23 \"/usr/local/cuda/bin/../include/sm_13_double_functions.h\"\n"
43 " .file 24 \"/usr/local/cuda/bin/../include/sm_20_atomic_functions.h\"\n"
44 " .file 25 \"/usr/local/cuda/bin/../include/sm_20_intrinsics.h\"\n"
45 " .file 26 \"/usr/local/cuda/bin/../include/surface_functions.h\"\n"
46 " .file 27 \"/usr/local/cuda/bin/../include/texture_fetch_functions.h\"\n"
47 " .file 28 \"/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h\"\n"
48 "\n"
49 " .extern .shared .align 4 .b8 __smem[];\n"
50 " .tex .u64 tex_ref_1;\n"
51 " .tex .u64 tex_ref_2;\n"
52 "\n"
53 " .entry chamfer_and_reduce (\n"
54 " .param .u64 __cudaparm_chamfer_and_reduce_g_idata_1,\n"
55 " .param .u64 __cudaparm_chamfer_and_reduce_g_idata_2,\n"
56 " .param .u64 __cudaparm_chamfer_and_reduce_g_odata,\n"
57 " .param .u32 __cudaparm_chamfer_and_reduce_n)\n"
58 " {\n"
59 " .reg .u16 %rh<3>;\n"
60 " .reg .u32 %r<14>;\n"
61 " .reg .u64 %rd<18>;\n"
62 " .reg .f32 %f<23>;\n"
63 " .reg .pred %p<9>;\n"
64 " .loc 3 148 0\n"
65 "$LDWbegin_chamfer_and_reduce:\n"
66 " .loc 3 105 0\n"
67 " cvt.u32.u16 %r1, %ctaid.x;\n"
68 " mul.lo.u32 %r2, %r1, 512;\n"
69 " cvt.u32.u16 %r3, %tid.x;\n"
70 " add.u32 %r4, %r2, %r3;\n"
71 " ld.param.u32 %r5, [__cudaparm_chamfer_and_reduce_n];\n"
72 " setp.ge.u32 %p1, %r4, %r5;\n"
73 " @%p1 bra $Lt_0_18178;\n"
74 " add.u32 %r6, %r4, 256;\n"
75 " ld.param.u32 %r5, [__cudaparm_chamfer_and_reduce_n];\n"
76 " add.u32 %r7, %r5, 256;\n"
77 " mov.u16 %rh1, %nctaid.x;\n"
78 " mul.wide.u16 %r8, %rh1, 512;\n"
79 " cvt.u64.u32 %rd1, %r4;\n"
80 " mul.wide.u32 %rd2, %r4, 4;\n"
81 " cvt.s64.u32 %rd3, %r8;\n"
82 " ld.param.u64 %rd4, [__cudaparm_chamfer_and_reduce_g_idata_1];\n"
83 " add.u64 %rd5, %rd4, %rd2;\n"
84 " mul.wide.u32 %rd6, %r8, 4;\n"
85 " ld.param.u64 %rd7, [__cudaparm_chamfer_and_reduce_g_idata_2];\n"
86 " add.u64 %rd8, %rd7, %rd2;\n"
87 " mov.f32 %f1, 0f00000000; // 0\n"
88 "$Lt_0_15106:\n"
89 " //<loop> Loop body line 105, nesting depth: 1, estimated iterations: unknown\n"
90 " .loc 3 119 0\n"
91 " ld.global.f32 %f2, [%rd5+0];\n"
92 " ld.global.f32 %f3, [%rd8+0];\n"
93 " mad.f32 %f1, %f2, %f3, %f1;\n"
94 " .loc 3 105 0\n"
95 " ld.param.u32 %r5, [__cudaparm_chamfer_and_reduce_n];\n"
96 " .loc 3 119 0\n"
97 " setp.ge.u32 %p2, %r6, %r5;\n"
98 " @%p2 bra $Lt_0_15362;\n"
99 " //<loop> Part of loop body line 105, head labeled $Lt_0_15106\n"
100 " .loc 3 133 0\n"
101 " ld.global.f32 %f4, [%rd5+1024];\n"
102 " ld.global.f32 %f5, [%rd8+1024];\n"
103 " mad.f32 %f1, %f4, %f5, %f1;\n"
104 "$Lt_0_15362:\n"
105 " //<loop> Part of loop body line 105, head labeled $Lt_0_15106\n"
106 " add.u32 %r6, %r6, %r8;\n"
107 " add.u64 %rd8, %rd8, %rd6;\n"
108 " add.u64 %rd5, %rd5, %rd6;\n"
109 " setp.lt.u32 %p3, %r6, %r7;\n"
110 " @%p3 bra $Lt_0_15106;\n"
111 " bra.uni $Lt_0_14594;\n"
112 "$Lt_0_18178:\n"
113 " mov.f32 %f1, 0f00000000; // 0\n"
114 "$Lt_0_14594:\n"
115 " .loc 3 139 0\n"
116 " mov.f32 %f6, %f1;\n"
117 " mov.f32 %f7, %f6;\n"
118 " .loc 3 71 0\n"
119 " mov.u64 %rd9, __smem;\n"
120 " cvt.u64.u32 %rd10, %r3;\n"
121 " mul.wide.u32 %rd11, %r3, 4;\n"
122 " add.u64 %rd12, %rd9, %rd11;\n"
123 " st.volatile.shared.f32 [%rd12+0], %f6;\n"
124 " .loc 3 72 0\n"
125 " bar.sync 0;\n"
126 " mov.u32 %r9, 127;\n"
127 " setp.gt.u32 %p4, %r3, %r9;\n"
128 " @%p4 bra $Lt_0_16130;\n"
129 " .loc 3 76 0\n"
130 " ld.volatile.shared.f32 %f8, [%rd12+512];\n"
131 " add.f32 %f7, %f8, %f6;\n"
132 " st.volatile.shared.f32 [%rd12+0], %f7;\n"
133 "$Lt_0_16130:\n"
134 " bar.sync 0;\n"
135 " mov.u32 %r10, 63;\n"
136 " setp.gt.u32 %p5, %r3, %r10;\n"
137 " @%p5 bra $Lt_0_16642;\n"
138 " .loc 3 77 0\n"
139 " ld.volatile.shared.f32 %f9, [%rd12+256];\n"
140 " add.f32 %f7, %f9, %f7;\n"
141 " st.volatile.shared.f32 [%rd12+0], %f7;\n"
142 "$Lt_0_16642:\n"
143 " bar.sync 0;\n"
144 " mov.u32 %r11, 31;\n"
145 " setp.gt.u32 %p6, %r3, %r11;\n"
146 " @%p6 bra $Lt_0_17154;\n"
147 " .loc 3 83 0\n"
148 " ld.volatile.shared.f32 %f10, [%rd12+128];\n"
149 " add.f32 %f11, %f10, %f7;\n"
150 " st.volatile.shared.f32 [%rd12+0], %f11;\n"
151 " .loc 3 84 0\n"
152 " ld.volatile.shared.f32 %f12, [%rd12+64];\n"
153 " add.f32 %f13, %f12, %f11;\n"
154 " st.volatile.shared.f32 [%rd12+0], %f13;\n"
155 " .loc 3 85 0\n"
156 " ld.volatile.shared.f32 %f14, [%rd12+32];\n"
157 " add.f32 %f15, %f14, %f13;\n"
158 " st.volatile.shared.f32 [%rd12+0], %f15;\n"
159 " .loc 3 86 0\n"
160 " ld.volatile.shared.f32 %f16, [%rd12+16];\n"
161 " add.f32 %f17, %f16, %f15;\n"
162 " st.volatile.shared.f32 [%rd12+0], %f17;\n"
163 " .loc 3 87 0\n"
164 " ld.volatile.shared.f32 %f18, [%rd12+8];\n"
165 " add.f32 %f19, %f18, %f17;\n"
166 " st.volatile.shared.f32 [%rd12+0], %f19;\n"
167 " .loc 3 88 0\n"
168 " ld.volatile.shared.f32 %f20, [%rd12+4];\n"
169 " add.f32 %f7, %f20, %f19;\n"
170 " st.volatile.shared.f32 [%rd12+0], %f7;\n"
171 "$Lt_0_17154:\n"
172 " .loc 3 139 0\n"
173 " mov.u32 %r12, 0;\n"
174 " setp.ne.u32 %p7, %r3, %r12;\n"
175 " @%p7 bra $Lt_0_17666;\n"
176 " .loc 3 143 0\n"
177 " ld.shared.f32 %f21, [__smem+0];\n"
178 " ld.param.u64 %rd13, [__cudaparm_chamfer_and_reduce_g_odata];\n"
179 " cvt.u64.u32 %rd14, %r1;\n"
180 " mul.wide.u32 %rd15, %r1, 4;\n"
181 " add.u64 %rd16, %rd13, %rd15;\n"
182 " st.global.f32 [%rd16+0], %f21;\n"
183 "$Lt_0_17666:\n"
184 " .loc 3 151 0\n"
185 " exit;\n"
186 "$LDWend_chamfer_and_reduce:\n"
187 " } // chamfer_and_reduce\n"
188 "\n"
189 " .entry squared_chamfer_and_reduce (\n"
190 " .param .u64 __cudaparm_squared_chamfer_and_reduce_g_idata_1,\n"
191 " .param .u64 __cudaparm_squared_chamfer_and_reduce_g_idata_2,\n"
192 " .param .u64 __cudaparm_squared_chamfer_and_reduce_g_odata,\n"
193 " .param .u32 __cudaparm_squared_chamfer_and_reduce_n)\n"
194 " {\n"
195 " .reg .u16 %rh<3>;\n"
196 " .reg .u32 %r<14>;\n"
197 " .reg .u64 %rd<18>;\n"
198 " .reg .f32 %f<25>;\n"
199 " .reg .pred %p<9>;\n"
200 " .loc 3 154 0\n"
201 "$LDWbegin_squared_chamfer_and_reduce:\n"
202 " .loc 3 105 0\n"
203 " cvt.u32.u16 %r1, %ctaid.x;\n"
204 " mul.lo.u32 %r2, %r1, 512;\n"
205 " cvt.u32.u16 %r3, %tid.x;\n"
206 " add.u32 %r4, %r2, %r3;\n"
207 " ld.param.u32 %r5, [__cudaparm_squared_chamfer_and_reduce_n];\n"
208 " setp.ge.u32 %p1, %r4, %r5;\n"
209 " @%p1 bra $Lt_1_18178;\n"
210 " add.u32 %r6, %r4, 256;\n"
211 " ld.param.u32 %r5, [__cudaparm_squared_chamfer_and_reduce_n];\n"
212 " add.u32 %r7, %r5, 256;\n"
213 " mov.u16 %rh1, %nctaid.x;\n"
214 " mul.wide.u16 %r8, %rh1, 512;\n"
215 " cvt.u64.u32 %rd1, %r4;\n"
216 " mul.wide.u32 %rd2, %r4, 4;\n"
217 " cvt.s64.u32 %rd3, %r8;\n"
218 " ld.param.u64 %rd4, [__cudaparm_squared_chamfer_and_reduce_g_idata_1];\n"
219 " add.u64 %rd5, %rd4, %rd2;\n"
220 " mul.wide.u32 %rd6, %r8, 4;\n"
221 " ld.param.u64 %rd7, [__cudaparm_squared_chamfer_and_reduce_g_idata_2];\n"
222 " add.u64 %rd8, %rd7, %rd2;\n"
223 " mov.f32 %f1, 0f00000000; // 0\n"
224 "$Lt_1_15106:\n"
225 " //<loop> Loop body line 105, nesting depth: 1, estimated iterations: unknown\n"
226 " .loc 3 114 0\n"
227 " ld.global.f32 %f2, [%rd5+0];\n"
228 " ld.global.f32 %f3, [%rd8+0];\n"
229 " mul.f32 %f4, %f2, %f3;\n"
230 " .loc 3 115 0\n"
231 " mad.f32 %f1, %f4, %f4, %f1;\n"
232 " .loc 3 105 0\n"
233 " ld.param.u32 %r5, [__cudaparm_squared_chamfer_and_reduce_n];\n"
234 " .loc 3 115 0\n"
235 " setp.ge.u32 %p2, %r6, %r5;\n"
236 " @%p2 bra $Lt_1_15362;\n"
237 " //<loop> Part of loop body line 105, head labeled $Lt_1_15106\n"
238 " .loc 3 127 0\n"
239 " ld.global.f32 %f5, [%rd5+1024];\n"
240 " ld.global.f32 %f6, [%rd8+1024];\n"
241 " mul.f32 %f7, %f5, %f6;\n"
242 " .loc 3 129 0\n"
243 " mad.f32 %f1, %f7, %f7, %f1;\n"
244 "$Lt_1_15362:\n"
245 " //<loop> Part of loop body line 105, head labeled $Lt_1_15106\n"
246 " add.u32 %r6, %r6, %r8;\n"
247 " add.u64 %rd8, %rd8, %rd6;\n"
248 " add.u64 %rd5, %rd5, %rd6;\n"
249 " setp.lt.u32 %p3, %r6, %r7;\n"
250 " @%p3 bra $Lt_1_15106;\n"
251 " bra.uni $Lt_1_14594;\n"
252 "$Lt_1_18178:\n"
253 " mov.f32 %f1, 0f00000000; // 0\n"
254 "$Lt_1_14594:\n"
255 " .loc 3 139 0\n"
256 " mov.f32 %f8, %f1;\n"
257 " mov.f32 %f9, %f8;\n"
258 " .loc 3 71 0\n"
259 " mov.u64 %rd9, __smem;\n"
260 " cvt.u64.u32 %rd10, %r3;\n"
261 " mul.wide.u32 %rd11, %r3, 4;\n"
262 " add.u64 %rd12, %rd9, %rd11;\n"
263 " st.volatile.shared.f32 [%rd12+0], %f8;\n"
264 " .loc 3 72 0\n"
265 " bar.sync 0;\n"
266 " mov.u32 %r9, 127;\n"
267 " setp.gt.u32 %p4, %r3, %r9;\n"
268 " @%p4 bra $Lt_1_16130;\n"
269 " .loc 3 76 0\n"
270 " ld.volatile.shared.f32 %f10, [%rd12+512];\n"
271 " add.f32 %f9, %f10, %f8;\n"
272 " st.volatile.shared.f32 [%rd12+0], %f9;\n"
273 "$Lt_1_16130:\n"
274 " bar.sync 0;\n"
275 " mov.u32 %r10, 63;\n"
276 " setp.gt.u32 %p5, %r3, %r10;\n"
277 " @%p5 bra $Lt_1_16642;\n"
278 " .loc 3 77 0\n"
279 " ld.volatile.shared.f32 %f11, [%rd12+256];\n"
280 " add.f32 %f9, %f11, %f9;\n"
281 " st.volatile.shared.f32 [%rd12+0], %f9;\n"
282 "$Lt_1_16642:\n"
283 " bar.sync 0;\n"
284 " mov.u32 %r11, 31;\n"
285 " setp.gt.u32 %p6, %r3, %r11;\n"
286 " @%p6 bra $Lt_1_17154;\n"
287 " .loc 3 83 0\n"
288 " ld.volatile.shared.f32 %f12, [%rd12+128];\n"
289 " add.f32 %f13, %f12, %f9;\n"
290 " st.volatile.shared.f32 [%rd12+0], %f13;\n"
291 " .loc 3 84 0\n"
292 " ld.volatile.shared.f32 %f14, [%rd12+64];\n"
293 " add.f32 %f15, %f14, %f13;\n"
294 " st.volatile.shared.f32 [%rd12+0], %f15;\n"
295 " .loc 3 85 0\n"
296 " ld.volatile.shared.f32 %f16, [%rd12+32];\n"
297 " add.f32 %f17, %f16, %f15;\n"
298 " st.volatile.shared.f32 [%rd12+0], %f17;\n"
299 " .loc 3 86 0\n"
300 " ld.volatile.shared.f32 %f18, [%rd12+16];\n"
301 " add.f32 %f19, %f18, %f17;\n"
302 " st.volatile.shared.f32 [%rd12+0], %f19;\n"
303 " .loc 3 87 0\n"
304 " ld.volatile.shared.f32 %f20, [%rd12+8];\n"
305 " add.f32 %f21, %f20, %f19;\n"
306 " st.volatile.shared.f32 [%rd12+0], %f21;\n"
307 " .loc 3 88 0\n"
308 " ld.volatile.shared.f32 %f22, [%rd12+4];\n"
309 " add.f32 %f9, %f22, %f21;\n"
310 " st.volatile.shared.f32 [%rd12+0], %f9;\n"
311 "$Lt_1_17154:\n"
312 " .loc 3 139 0\n"
313 " mov.u32 %r12, 0;\n"
314 " setp.ne.u32 %p7, %r3, %r12;\n"
315 " @%p7 bra $Lt_1_17666;\n"
316 " .loc 3 143 0\n"
317 " ld.shared.f32 %f23, [__smem+0];\n"
318 " ld.param.u64 %rd13, [__cudaparm_squared_chamfer_and_reduce_g_odata];\n"
319 " cvt.u64.u32 %rd14, %r1;\n"
320 " mul.wide.u32 %rd15, %r1, 4;\n"
321 " add.u64 %rd16, %rd13, %rd15;\n"
322 " st.global.f32 [%rd16+0], %f23;\n"
323 "$Lt_1_17666:\n"
324 " .loc 3 157 0\n"
325 " exit;\n"
326 "$LDWend_squared_chamfer_and_reduce:\n"
327 " } // squared_chamfer_and_reduce\n"
328 "\n"
329 " .entry reduce_float_1_true (\n"
330 " .param .u64 __cudaparm_reduce_float_1_true_g_idata,\n"
331 " .param .u64 __cudaparm_reduce_float_1_true_g_odata,\n"
332 " .param .u32 __cudaparm_reduce_float_1_true_n)\n"
333 " {\n"
334 " .reg .u16 %rh<3>;\n"
335 " .reg .u32 %r<10>;\n"
336 " .reg .u64 %rd<16>;\n"
337 " .reg .f32 %f<7>;\n"
338 " .reg .pred %p<5>;\n"
339 " .loc 3 372 0\n"
340 "$LDWbegin_reduce_float_1_true:\n"
341 " .loc 3 181 0\n"
342 " cvt.u32.u16 %r1, %ctaid.x;\n"
343 " mul24.lo.u32 %r2, %r1, 2;\n"
344 " cvt.u32.u16 %r3, %tid.x;\n"
345 " add.u32 %r4, %r2, %r3;\n"
346 " mov.s32 %r5, %r4;\n"
347 " ld.param.u32 %r6, [__cudaparm_reduce_float_1_true_n];\n"
348 " setp.ge.u32 %p1, %r4, %r6;\n"
349 " @%p1 bra $Lt_2_16642;\n"
350 " mov.u16 %rh1, %nctaid.x;\n"
351 " mul.wide.u16 %r7, %rh1, 2;\n"
352 " cvt.s64.u32 %rd1, %r7;\n"
353 " ld.param.u64 %rd2, [__cudaparm_reduce_float_1_true_g_idata];\n"
354 " cvt.u64.u32 %rd3, %r4;\n"
355 " mul.wide.u32 %rd4, %r4, 4;\n"
356 " add.u64 %rd5, %rd2, %rd4;\n"
357 " mul.wide.u32 %rd6, %r7, 4;\n"
358 " mov.f32 %f1, 0f00000000; // 0\n"
359 "$Lt_2_15618:\n"
360 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
361 " .loc 3 188 0\n"
362 " ld.global.f32 %f2, [%rd5+0];\n"
363 " add.f32 %f3, %f2, %f1;\n"
364 " .loc 3 191 0\n"
365 " ld.global.f32 %f4, [%rd5+4];\n"
366 " add.f32 %f1, %f4, %f3;\n"
367 " add.u32 %r5, %r7, %r5;\n"
368 " add.u64 %rd5, %rd5, %rd6;\n"
369 " .loc 3 181 0\n"
370 " ld.param.u32 %r6, [__cudaparm_reduce_float_1_true_n];\n"
371 " .loc 3 191 0\n"
372 " setp.lt.u32 %p2, %r5, %r6;\n"
373 " @%p2 bra $Lt_2_15618;\n"
374 " bra.uni $Lt_2_15106;\n"
375 "$Lt_2_16642:\n"
376 " mov.f32 %f1, 0f00000000; // 0\n"
377 "$Lt_2_15106:\n"
378 " .loc 3 71 0\n"
379 " mov.u64 %rd7, __smem;\n"
380 " cvt.u64.u32 %rd8, %r3;\n"
381 " mul.wide.u32 %rd9, %r3, 4;\n"
382 " add.u64 %rd10, %rd7, %rd9;\n"
383 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
384 " .loc 3 72 0\n"
385 " bar.sync 0;\n"
386 " .loc 3 195 0\n"
387 " mov.u32 %r8, 0;\n"
388 " setp.ne.u32 %p3, %r3, %r8;\n"
389 " @%p3 bra $Lt_2_16130;\n"
390 " .loc 3 199 0\n"
391 " ld.shared.f32 %f5, [__smem+0];\n"
392 " ld.param.u64 %rd11, [__cudaparm_reduce_float_1_true_g_odata];\n"
393 " cvt.u64.u32 %rd12, %r1;\n"
394 " mul.wide.u32 %rd13, %r1, 4;\n"
395 " add.u64 %rd14, %rd11, %rd13;\n"
396 " st.global.f32 [%rd14+0], %f5;\n"
397 "$Lt_2_16130:\n"
398 " .loc 3 375 0\n"
399 " exit;\n"
400 "$LDWend_reduce_float_1_true:\n"
401 " } // reduce_float_1_true\n"
402 "\n"
403 " .entry reduce_float_2_true (\n"
404 " .param .u64 __cudaparm_reduce_float_2_true_g_idata,\n"
405 " .param .u64 __cudaparm_reduce_float_2_true_g_odata,\n"
406 " .param .u32 __cudaparm_reduce_float_2_true_n)\n"
407 " {\n"
408 " .reg .u16 %rh<3>;\n"
409 " .reg .u32 %r<11>;\n"
410 " .reg .u64 %rd<16>;\n"
411 " .reg .f32 %f<9>;\n"
412 " .reg .pred %p<6>;\n"
413 " .loc 3 377 0\n"
414 "$LDWbegin_reduce_float_2_true:\n"
415 " .loc 3 181 0\n"
416 " cvt.u32.u16 %r1, %ctaid.x;\n"
417 " mul24.lo.u32 %r2, %r1, 4;\n"
418 " cvt.u32.u16 %r3, %tid.x;\n"
419 " add.u32 %r4, %r2, %r3;\n"
420 " mov.s32 %r5, %r4;\n"
421 " ld.param.u32 %r6, [__cudaparm_reduce_float_2_true_n];\n"
422 " setp.ge.u32 %p1, %r4, %r6;\n"
423 " @%p1 bra $Lt_3_16898;\n"
424 " mov.u16 %rh1, %nctaid.x;\n"
425 " mul.wide.u16 %r7, %rh1, 4;\n"
426 " cvt.s64.u32 %rd1, %r7;\n"
427 " ld.param.u64 %rd2, [__cudaparm_reduce_float_2_true_g_idata];\n"
428 " cvt.u64.u32 %rd3, %r4;\n"
429 " mul.wide.u32 %rd4, %r4, 4;\n"
430 " add.u64 %rd5, %rd2, %rd4;\n"
431 " mul.wide.u32 %rd6, %r7, 4;\n"
432 " mov.f32 %f1, 0f00000000; // 0\n"
433 "$Lt_3_15362:\n"
434 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
435 " .loc 3 188 0\n"
436 " ld.global.f32 %f2, [%rd5+0];\n"
437 " add.f32 %f3, %f2, %f1;\n"
438 " .loc 3 191 0\n"
439 " ld.global.f32 %f4, [%rd5+8];\n"
440 " add.f32 %f1, %f4, %f3;\n"
441 " add.u32 %r5, %r7, %r5;\n"
442 " add.u64 %rd5, %rd5, %rd6;\n"
443 " .loc 3 181 0\n"
444 " ld.param.u32 %r6, [__cudaparm_reduce_float_2_true_n];\n"
445 " .loc 3 191 0\n"
446 " setp.lt.u32 %p2, %r5, %r6;\n"
447 " @%p2 bra $Lt_3_15362;\n"
448 " bra.uni $Lt_3_14850;\n"
449 "$Lt_3_16898:\n"
450 " mov.f32 %f1, 0f00000000; // 0\n"
451 "$Lt_3_14850:\n"
452 " .loc 3 71 0\n"
453 " mov.u64 %rd7, __smem;\n"
454 " cvt.u64.u32 %rd8, %r3;\n"
455 " mul.wide.u32 %rd9, %r3, 4;\n"
456 " add.u64 %rd10, %rd7, %rd9;\n"
457 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
458 " .loc 3 72 0\n"
459 " bar.sync 0;\n"
460 " mov.u32 %r8, 31;\n"
461 " setp.gt.u32 %p3, %r3, %r8;\n"
462 " @%p3 bra $Lt_3_15874;\n"
463 " .loc 3 88 0\n"
464 " ld.volatile.shared.f32 %f5, [%rd10+4];\n"
465 " add.f32 %f6, %f5, %f1;\n"
466 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
467 "$Lt_3_15874:\n"
468 " .loc 3 195 0\n"
469 " mov.u32 %r9, 0;\n"
470 " setp.ne.u32 %p4, %r3, %r9;\n"
471 " @%p4 bra $Lt_3_16386;\n"
472 " .loc 3 199 0\n"
473 " ld.shared.f32 %f7, [__smem+0];\n"
474 " ld.param.u64 %rd11, [__cudaparm_reduce_float_2_true_g_odata];\n"
475 " cvt.u64.u32 %rd12, %r1;\n"
476 " mul.wide.u32 %rd13, %r1, 4;\n"
477 " add.u64 %rd14, %rd11, %rd13;\n"
478 " st.global.f32 [%rd14+0], %f7;\n"
479 "$Lt_3_16386:\n"
480 " .loc 3 380 0\n"
481 " exit;\n"
482 "$LDWend_reduce_float_2_true:\n"
483 " } // reduce_float_2_true\n"
484 "\n"
485 " .entry reduce_float_4_true (\n"
486 " .param .u64 __cudaparm_reduce_float_4_true_g_idata,\n"
487 " .param .u64 __cudaparm_reduce_float_4_true_g_odata,\n"
488 " .param .u32 __cudaparm_reduce_float_4_true_n)\n"
489 " {\n"
490 " .reg .u16 %rh<3>;\n"
491 " .reg .u32 %r<11>;\n"
492 " .reg .u64 %rd<16>;\n"
493 " .reg .f32 %f<11>;\n"
494 " .reg .pred %p<6>;\n"
495 " .loc 3 382 0\n"
496 "$LDWbegin_reduce_float_4_true:\n"
497 " .loc 3 181 0\n"
498 " cvt.u32.u16 %r1, %ctaid.x;\n"
499 " mul24.lo.u32 %r2, %r1, 8;\n"
500 " cvt.u32.u16 %r3, %tid.x;\n"
501 " add.u32 %r4, %r2, %r3;\n"
502 " mov.s32 %r5, %r4;\n"
503 " ld.param.u32 %r6, [__cudaparm_reduce_float_4_true_n];\n"
504 " setp.ge.u32 %p1, %r4, %r6;\n"
505 " @%p1 bra $Lt_4_16642;\n"
506 " mov.u16 %rh1, %nctaid.x;\n"
507 " mul.wide.u16 %r7, %rh1, 8;\n"
508 " cvt.s64.u32 %rd1, %r7;\n"
509 " ld.param.u64 %rd2, [__cudaparm_reduce_float_4_true_g_idata];\n"
510 " cvt.u64.u32 %rd3, %r4;\n"
511 " mul.wide.u32 %rd4, %r4, 4;\n"
512 " add.u64 %rd5, %rd2, %rd4;\n"
513 " mul.wide.u32 %rd6, %r7, 4;\n"
514 " mov.f32 %f1, 0f00000000; // 0\n"
515 "$Lt_4_15106:\n"
516 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
517 " .loc 3 188 0\n"
518 " ld.global.f32 %f2, [%rd5+0];\n"
519 " add.f32 %f3, %f2, %f1;\n"
520 " .loc 3 191 0\n"
521 " ld.global.f32 %f4, [%rd5+16];\n"
522 " add.f32 %f1, %f4, %f3;\n"
523 " add.u32 %r5, %r7, %r5;\n"
524 " add.u64 %rd5, %rd5, %rd6;\n"
525 " .loc 3 181 0\n"
526 " ld.param.u32 %r6, [__cudaparm_reduce_float_4_true_n];\n"
527 " .loc 3 191 0\n"
528 " setp.lt.u32 %p2, %r5, %r6;\n"
529 " @%p2 bra $Lt_4_15106;\n"
530 " bra.uni $Lt_4_14594;\n"
531 "$Lt_4_16642:\n"
532 " mov.f32 %f1, 0f00000000; // 0\n"
533 "$Lt_4_14594:\n"
534 " .loc 3 71 0\n"
535 " mov.u64 %rd7, __smem;\n"
536 " cvt.u64.u32 %rd8, %r3;\n"
537 " mul.wide.u32 %rd9, %r3, 4;\n"
538 " add.u64 %rd10, %rd7, %rd9;\n"
539 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
540 " .loc 3 72 0\n"
541 " bar.sync 0;\n"
542 " mov.u32 %r8, 31;\n"
543 " setp.gt.u32 %p3, %r3, %r8;\n"
544 " @%p3 bra $Lt_4_15618;\n"
545 " .loc 3 87 0\n"
546 " ld.volatile.shared.f32 %f5, [%rd10+8];\n"
547 " add.f32 %f6, %f5, %f1;\n"
548 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
549 " .loc 3 88 0\n"
550 " ld.volatile.shared.f32 %f7, [%rd10+4];\n"
551 " add.f32 %f8, %f7, %f6;\n"
552 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
553 "$Lt_4_15618:\n"
554 " .loc 3 195 0\n"
555 " mov.u32 %r9, 0;\n"
556 " setp.ne.u32 %p4, %r3, %r9;\n"
557 " @%p4 bra $Lt_4_16130;\n"
558 " .loc 3 199 0\n"
559 " ld.shared.f32 %f9, [__smem+0];\n"
560 " ld.param.u64 %rd11, [__cudaparm_reduce_float_4_true_g_odata];\n"
561 " cvt.u64.u32 %rd12, %r1;\n"
562 " mul.wide.u32 %rd13, %r1, 4;\n"
563 " add.u64 %rd14, %rd11, %rd13;\n"
564 " st.global.f32 [%rd14+0], %f9;\n"
565 "$Lt_4_16130:\n"
566 " .loc 3 385 0\n"
567 " exit;\n"
568 "$LDWend_reduce_float_4_true:\n"
569 " } // reduce_float_4_true\n"
570 "\n"
571 " .entry reduce_float_8_true (\n"
572 " .param .u64 __cudaparm_reduce_float_8_true_g_idata,\n"
573 " .param .u64 __cudaparm_reduce_float_8_true_g_odata,\n"
574 " .param .u32 __cudaparm_reduce_float_8_true_n)\n"
575 " {\n"
576 " .reg .u16 %rh<3>;\n"
577 " .reg .u32 %r<11>;\n"
578 " .reg .u64 %rd<16>;\n"
579 " .reg .f32 %f<13>;\n"
580 " .reg .pred %p<6>;\n"
581 " .loc 3 387 0\n"
582 "$LDWbegin_reduce_float_8_true:\n"
583 " .loc 3 181 0\n"
584 " cvt.u32.u16 %r1, %ctaid.x;\n"
585 " mul24.lo.u32 %r2, %r1, 16;\n"
586 " cvt.u32.u16 %r3, %tid.x;\n"
587 " add.u32 %r4, %r2, %r3;\n"
588 " mov.s32 %r5, %r4;\n"
589 " ld.param.u32 %r6, [__cudaparm_reduce_float_8_true_n];\n"
590 " setp.ge.u32 %p1, %r4, %r6;\n"
591 " @%p1 bra $Lt_5_16386;\n"
592 " mov.u16 %rh1, %nctaid.x;\n"
593 " mul.wide.u16 %r7, %rh1, 16;\n"
594 " cvt.s64.u32 %rd1, %r7;\n"
595 " ld.param.u64 %rd2, [__cudaparm_reduce_float_8_true_g_idata];\n"
596 " cvt.u64.u32 %rd3, %r4;\n"
597 " mul.wide.u32 %rd4, %r4, 4;\n"
598 " add.u64 %rd5, %rd2, %rd4;\n"
599 " mul.wide.u32 %rd6, %r7, 4;\n"
600 " mov.f32 %f1, 0f00000000; // 0\n"
601 "$Lt_5_14850:\n"
602 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
603 " .loc 3 188 0\n"
604 " ld.global.f32 %f2, [%rd5+0];\n"
605 " add.f32 %f3, %f2, %f1;\n"
606 " .loc 3 191 0\n"
607 " ld.global.f32 %f4, [%rd5+32];\n"
608 " add.f32 %f1, %f4, %f3;\n"
609 " add.u32 %r5, %r7, %r5;\n"
610 " add.u64 %rd5, %rd5, %rd6;\n"
611 " .loc 3 181 0\n"
612 " ld.param.u32 %r6, [__cudaparm_reduce_float_8_true_n];\n"
613 " .loc 3 191 0\n"
614 " setp.lt.u32 %p2, %r5, %r6;\n"
615 " @%p2 bra $Lt_5_14850;\n"
616 " bra.uni $Lt_5_14338;\n"
617 "$Lt_5_16386:\n"
618 " mov.f32 %f1, 0f00000000; // 0\n"
619 "$Lt_5_14338:\n"
620 " .loc 3 71 0\n"
621 " mov.u64 %rd7, __smem;\n"
622 " cvt.u64.u32 %rd8, %r3;\n"
623 " mul.wide.u32 %rd9, %r3, 4;\n"
624 " add.u64 %rd10, %rd7, %rd9;\n"
625 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
626 " .loc 3 72 0\n"
627 " bar.sync 0;\n"
628 " mov.u32 %r8, 31;\n"
629 " setp.gt.u32 %p3, %r3, %r8;\n"
630 " @%p3 bra $Lt_5_15362;\n"
631 " .loc 3 86 0\n"
632 " ld.volatile.shared.f32 %f5, [%rd10+16];\n"
633 " add.f32 %f6, %f5, %f1;\n"
634 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
635 " .loc 3 87 0\n"
636 " ld.volatile.shared.f32 %f7, [%rd10+8];\n"
637 " add.f32 %f8, %f7, %f6;\n"
638 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
639 " .loc 3 88 0\n"
640 " ld.volatile.shared.f32 %f9, [%rd10+4];\n"
641 " add.f32 %f10, %f9, %f8;\n"
642 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
643 "$Lt_5_15362:\n"
644 " .loc 3 195 0\n"
645 " mov.u32 %r9, 0;\n"
646 " setp.ne.u32 %p4, %r3, %r9;\n"
647 " @%p4 bra $Lt_5_15874;\n"
648 " .loc 3 199 0\n"
649 " ld.shared.f32 %f11, [__smem+0];\n"
650 " ld.param.u64 %rd11, [__cudaparm_reduce_float_8_true_g_odata];\n"
651 " cvt.u64.u32 %rd12, %r1;\n"
652 " mul.wide.u32 %rd13, %r1, 4;\n"
653 " add.u64 %rd14, %rd11, %rd13;\n"
654 " st.global.f32 [%rd14+0], %f11;\n"
655 "$Lt_5_15874:\n"
656 " .loc 3 390 0\n"
657 " exit;\n"
658 "$LDWend_reduce_float_8_true:\n"
659 " } // reduce_float_8_true\n"
660 "\n"
661 " .entry reduce_float_16_true (\n"
662 " .param .u64 __cudaparm_reduce_float_16_true_g_idata,\n"
663 " .param .u64 __cudaparm_reduce_float_16_true_g_odata,\n"
664 " .param .u32 __cudaparm_reduce_float_16_true_n)\n"
665 " {\n"
666 " .reg .u16 %rh<3>;\n"
667 " .reg .u32 %r<11>;\n"
668 " .reg .u64 %rd<16>;\n"
669 " .reg .f32 %f<15>;\n"
670 " .reg .pred %p<6>;\n"
671 " .loc 3 392 0\n"
672 "$LDWbegin_reduce_float_16_true:\n"
673 " .loc 3 181 0\n"
674 " cvt.u32.u16 %r1, %ctaid.x;\n"
675 " mul24.lo.u32 %r2, %r1, 32;\n"
676 " cvt.u32.u16 %r3, %tid.x;\n"
677 " add.u32 %r4, %r2, %r3;\n"
678 " mov.s32 %r5, %r4;\n"
679 " ld.param.u32 %r6, [__cudaparm_reduce_float_16_true_n];\n"
680 " setp.ge.u32 %p1, %r4, %r6;\n"
681 " @%p1 bra $Lt_6_16130;\n"
682 " mov.u16 %rh1, %nctaid.x;\n"
683 " mul.wide.u16 %r7, %rh1, 32;\n"
684 " cvt.s64.u32 %rd1, %r7;\n"
685 " ld.param.u64 %rd2, [__cudaparm_reduce_float_16_true_g_idata];\n"
686 " cvt.u64.u32 %rd3, %r4;\n"
687 " mul.wide.u32 %rd4, %r4, 4;\n"
688 " add.u64 %rd5, %rd2, %rd4;\n"
689 " mul.wide.u32 %rd6, %r7, 4;\n"
690 " mov.f32 %f1, 0f00000000; // 0\n"
691 "$Lt_6_14594:\n"
692 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
693 " .loc 3 188 0\n"
694 " ld.global.f32 %f2, [%rd5+0];\n"
695 " add.f32 %f3, %f2, %f1;\n"
696 " .loc 3 191 0\n"
697 " ld.global.f32 %f4, [%rd5+64];\n"
698 " add.f32 %f1, %f4, %f3;\n"
699 " add.u32 %r5, %r7, %r5;\n"
700 " add.u64 %rd5, %rd5, %rd6;\n"
701 " .loc 3 181 0\n"
702 " ld.param.u32 %r6, [__cudaparm_reduce_float_16_true_n];\n"
703 " .loc 3 191 0\n"
704 " setp.lt.u32 %p2, %r5, %r6;\n"
705 " @%p2 bra $Lt_6_14594;\n"
706 " bra.uni $Lt_6_14082;\n"
707 "$Lt_6_16130:\n"
708 " mov.f32 %f1, 0f00000000; // 0\n"
709 "$Lt_6_14082:\n"
710 " .loc 3 71 0\n"
711 " mov.u64 %rd7, __smem;\n"
712 " cvt.u64.u32 %rd8, %r3;\n"
713 " mul.wide.u32 %rd9, %r3, 4;\n"
714 " add.u64 %rd10, %rd7, %rd9;\n"
715 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
716 " .loc 3 72 0\n"
717 " bar.sync 0;\n"
718 " mov.u32 %r8, 31;\n"
719 " setp.gt.u32 %p3, %r3, %r8;\n"
720 " @%p3 bra $Lt_6_15106;\n"
721 " .loc 3 85 0\n"
722 " ld.volatile.shared.f32 %f5, [%rd10+32];\n"
723 " add.f32 %f6, %f5, %f1;\n"
724 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
725 " .loc 3 86 0\n"
726 " ld.volatile.shared.f32 %f7, [%rd10+16];\n"
727 " add.f32 %f8, %f7, %f6;\n"
728 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
729 " .loc 3 87 0\n"
730 " ld.volatile.shared.f32 %f9, [%rd10+8];\n"
731 " add.f32 %f10, %f9, %f8;\n"
732 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
733 " .loc 3 88 0\n"
734 " ld.volatile.shared.f32 %f11, [%rd10+4];\n"
735 " add.f32 %f12, %f11, %f10;\n"
736 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
737 "$Lt_6_15106:\n"
738 " .loc 3 195 0\n"
739 " mov.u32 %r9, 0;\n"
740 " setp.ne.u32 %p4, %r3, %r9;\n"
741 " @%p4 bra $Lt_6_15618;\n"
742 " .loc 3 199 0\n"
743 " ld.shared.f32 %f13, [__smem+0];\n"
744 " ld.param.u64 %rd11, [__cudaparm_reduce_float_16_true_g_odata];\n"
745 " cvt.u64.u32 %rd12, %r1;\n"
746 " mul.wide.u32 %rd13, %r1, 4;\n"
747 " add.u64 %rd14, %rd11, %rd13;\n"
748 " st.global.f32 [%rd14+0], %f13;\n"
749 "$Lt_6_15618:\n"
750 " .loc 3 395 0\n"
751 " exit;\n"
752 "$LDWend_reduce_float_16_true:\n"
753 " } // reduce_float_16_true\n"
754 "\n"
755 " .entry reduce_float_32_true (\n"
756 " .param .u64 __cudaparm_reduce_float_32_true_g_idata,\n"
757 " .param .u64 __cudaparm_reduce_float_32_true_g_odata,\n"
758 " .param .u32 __cudaparm_reduce_float_32_true_n)\n"
759 " {\n"
760 " .reg .u16 %rh<3>;\n"
761 " .reg .u32 %r<11>;\n"
762 " .reg .u64 %rd<16>;\n"
763 " .reg .f32 %f<17>;\n"
764 " .reg .pred %p<6>;\n"
765 " .loc 3 397 0\n"
766 "$LDWbegin_reduce_float_32_true:\n"
767 " .loc 3 181 0\n"
768 " cvt.u32.u16 %r1, %ctaid.x;\n"
769 " mul24.lo.u32 %r2, %r1, 64;\n"
770 " cvt.u32.u16 %r3, %tid.x;\n"
771 " add.u32 %r4, %r2, %r3;\n"
772 " mov.s32 %r5, %r4;\n"
773 " ld.param.u32 %r6, [__cudaparm_reduce_float_32_true_n];\n"
774 " setp.ge.u32 %p1, %r4, %r6;\n"
775 " @%p1 bra $Lt_7_15874;\n"
776 " mov.u16 %rh1, %nctaid.x;\n"
777 " mul.wide.u16 %r7, %rh1, 64;\n"
778 " cvt.s64.u32 %rd1, %r7;\n"
779 " ld.param.u64 %rd2, [__cudaparm_reduce_float_32_true_g_idata];\n"
780 " cvt.u64.u32 %rd3, %r4;\n"
781 " mul.wide.u32 %rd4, %r4, 4;\n"
782 " add.u64 %rd5, %rd2, %rd4;\n"
783 " mul.wide.u32 %rd6, %r7, 4;\n"
784 " mov.f32 %f1, 0f00000000; // 0\n"
785 "$Lt_7_14338:\n"
786 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
787 " .loc 3 188 0\n"
788 " ld.global.f32 %f2, [%rd5+0];\n"
789 " add.f32 %f3, %f2, %f1;\n"
790 " .loc 3 191 0\n"
791 " ld.global.f32 %f4, [%rd5+128];\n"
792 " add.f32 %f1, %f4, %f3;\n"
793 " add.u32 %r5, %r7, %r5;\n"
794 " add.u64 %rd5, %rd5, %rd6;\n"
795 " .loc 3 181 0\n"
796 " ld.param.u32 %r6, [__cudaparm_reduce_float_32_true_n];\n"
797 " .loc 3 191 0\n"
798 " setp.lt.u32 %p2, %r5, %r6;\n"
799 " @%p2 bra $Lt_7_14338;\n"
800 " bra.uni $Lt_7_13826;\n"
801 "$Lt_7_15874:\n"
802 " mov.f32 %f1, 0f00000000; // 0\n"
803 "$Lt_7_13826:\n"
804 " .loc 3 71 0\n"
805 " mov.u64 %rd7, __smem;\n"
806 " cvt.u64.u32 %rd8, %r3;\n"
807 " mul.wide.u32 %rd9, %r3, 4;\n"
808 " add.u64 %rd10, %rd7, %rd9;\n"
809 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
810 " .loc 3 72 0\n"
811 " bar.sync 0;\n"
812 " mov.u32 %r8, 31;\n"
813 " setp.gt.u32 %p3, %r3, %r8;\n"
814 " @%p3 bra $Lt_7_14850;\n"
815 " .loc 3 84 0\n"
816 " ld.volatile.shared.f32 %f5, [%rd10+64];\n"
817 " add.f32 %f6, %f5, %f1;\n"
818 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
819 " .loc 3 85 0\n"
820 " ld.volatile.shared.f32 %f7, [%rd10+32];\n"
821 " add.f32 %f8, %f7, %f6;\n"
822 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
823 " .loc 3 86 0\n"
824 " ld.volatile.shared.f32 %f9, [%rd10+16];\n"
825 " add.f32 %f10, %f9, %f8;\n"
826 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
827 " .loc 3 87 0\n"
828 " ld.volatile.shared.f32 %f11, [%rd10+8];\n"
829 " add.f32 %f12, %f11, %f10;\n"
830 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
831 " .loc 3 88 0\n"
832 " ld.volatile.shared.f32 %f13, [%rd10+4];\n"
833 " add.f32 %f14, %f13, %f12;\n"
834 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
835 "$Lt_7_14850:\n"
836 " .loc 3 195 0\n"
837 " mov.u32 %r9, 0;\n"
838 " setp.ne.u32 %p4, %r3, %r9;\n"
839 " @%p4 bra $Lt_7_15362;\n"
840 " .loc 3 199 0\n"
841 " ld.shared.f32 %f15, [__smem+0];\n"
842 " ld.param.u64 %rd11, [__cudaparm_reduce_float_32_true_g_odata];\n"
843 " cvt.u64.u32 %rd12, %r1;\n"
844 " mul.wide.u32 %rd13, %r1, 4;\n"
845 " add.u64 %rd14, %rd11, %rd13;\n"
846 " st.global.f32 [%rd14+0], %f15;\n"
847 "$Lt_7_15362:\n"
848 " .loc 3 400 0\n"
849 " exit;\n"
850 "$LDWend_reduce_float_32_true:\n"
851 " } // reduce_float_32_true\n"
852 "\n"
/*
 * PTX entry point reduce_float_64_true(g_idata, g_odata, n).
 *
 * Each thread sums 32-bit floats read from global memory at g_idata,
 * starting at element ctaid.x * 128 + tid.x and advancing by
 * nctaid.x * 128 elements per pass while that index is below n.  The
 * per-thread sums are combined through the shared array __smem, and
 * thread 0 stores __smem[0] to g_odata[ctaid.x].
 *
 * NOTE(review): the "64" and "true" suffixes presumably encode the block
 * size and a power-of-two flag from reduce.cu -- confirm against that file.
 */
853 " .entry reduce_float_64_true (\n"
854 " .param .u64 __cudaparm_reduce_float_64_true_g_idata,\n"
855 " .param .u64 __cudaparm_reduce_float_64_true_g_odata,\n"
856 " .param .u32 __cudaparm_reduce_float_64_true_n)\n"
857 " {\n"
858 " .reg .u16 %rh<3>;\n"
859 " .reg .u32 %r<11>;\n"
860 " .reg .u64 %rd<16>;\n"
861 " .reg .f32 %f<19>;\n"
862 " .reg .pred %p<6>;\n"
863 " .loc 3 402 0\n"
864 "$LDWbegin_reduce_float_64_true:\n"
/* r1 = ctaid.x, r3 = tid.x, r4 = r5 = r1 * 128 + r3 (first element index).
   If r4 >= n the load loop is skipped and the sum starts at 0. */
865 " .loc 3 181 0\n"
866 " cvt.u32.u16 %r1, %ctaid.x;\n"
867 " mul24.lo.u32 %r2, %r1, 128;\n"
868 " cvt.u32.u16 %r3, %tid.x;\n"
869 " add.u32 %r4, %r2, %r3;\n"
870 " mov.s32 %r5, %r4;\n"
871 " ld.param.u32 %r6, [__cudaparm_reduce_float_64_true_n];\n"
872 " setp.ge.u32 %p1, %r4, %r6;\n"
873 " @%p1 bra $Lt_8_15618;\n"
/* r7 = nctaid.x * 128 is the element stride per pass; rd5 is the byte
   address of g_idata[r4] and rd6 the stride in bytes. */
874 " mov.u16 %rh1, %nctaid.x;\n"
875 " mul.wide.u16 %r7, %rh1, 128;\n"
876 " cvt.s64.u32 %rd1, %r7;\n"
877 " ld.param.u64 %rd2, [__cudaparm_reduce_float_64_true_g_idata];\n"
878 " cvt.u64.u32 %rd3, %r4;\n"
879 " mul.wide.u32 %rd4, %r4, 4;\n"
880 " add.u64 %rd5, %rd2, %rd4;\n"
881 " mul.wide.u32 %rd6, %r7, 4;\n"
882 " mov.f32 %f1, 0f00000000; // 0\n"
/* Load loop: two loads per pass, at rd5 and 256 bytes (64 floats) after
   it.  Only the first index (r5) is compared against n. */
883 "$Lt_8_14082:\n"
884 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
885 " .loc 3 188 0\n"
886 " ld.global.f32 %f2, [%rd5+0];\n"
887 " add.f32 %f3, %f2, %f1;\n"
888 " .loc 3 191 0\n"
889 " ld.global.f32 %f4, [%rd5+256];\n"
890 " add.f32 %f1, %f4, %f3;\n"
891 " add.u32 %r5, %r7, %r5;\n"
892 " add.u64 %rd5, %rd5, %rd6;\n"
893 " .loc 3 181 0\n"
894 " ld.param.u32 %r6, [__cudaparm_reduce_float_64_true_n];\n"
895 " .loc 3 191 0\n"
896 " setp.lt.u32 %p2, %r5, %r6;\n"
897 " @%p2 bra $Lt_8_14082;\n"
898 " bra.uni $Lt_8_13570;\n"
899 "$Lt_8_15618:\n"
900 " mov.f32 %f1, 0f00000000; // 0\n"
/* Store this thread's sum to __smem[tid.x] (rd10 = __smem + tid.x * 4),
   then wait at the barrier. */
901 "$Lt_8_13570:\n"
902 " .loc 3 71 0\n"
903 " mov.u64 %rd7, __smem;\n"
904 " cvt.u64.u32 %rd8, %r3;\n"
905 " mul.wide.u32 %rd9, %r3, 4;\n"
906 " add.u64 %rd10, %rd7, %rd9;\n"
907 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
908 " .loc 3 72 0\n"
909 " bar.sync 0;\n"
/* Threads with tid.x <= 31 only: add the shared values at byte offsets
   +128, +64, +32, +16, +8 and +4 in turn, storing back after each add.
   No bar.sync separates these steps.
   NOTE(review): presumably relies on these 32 threads running in
   lockstep -- confirm for the hardware this PTX is loaded on. */
910 " mov.u32 %r8, 31;\n"
911 " setp.gt.u32 %p3, %r3, %r8;\n"
912 " @%p3 bra $Lt_8_14594;\n"
913 " .loc 3 83 0\n"
914 " ld.volatile.shared.f32 %f5, [%rd10+128];\n"
915 " add.f32 %f6, %f5, %f1;\n"
916 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
917 " .loc 3 84 0\n"
918 " ld.volatile.shared.f32 %f7, [%rd10+64];\n"
919 " add.f32 %f8, %f7, %f6;\n"
920 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
921 " .loc 3 85 0\n"
922 " ld.volatile.shared.f32 %f9, [%rd10+32];\n"
923 " add.f32 %f10, %f9, %f8;\n"
924 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
925 " .loc 3 86 0\n"
926 " ld.volatile.shared.f32 %f11, [%rd10+16];\n"
927 " add.f32 %f12, %f11, %f10;\n"
928 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
929 " .loc 3 87 0\n"
930 " ld.volatile.shared.f32 %f13, [%rd10+8];\n"
931 " add.f32 %f14, %f13, %f12;\n"
932 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
933 " .loc 3 88 0\n"
934 " ld.volatile.shared.f32 %f15, [%rd10+4];\n"
935 " add.f32 %f16, %f15, %f14;\n"
936 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
/* Only thread 0 (tid.x == 0) writes: g_odata[ctaid.x] = __smem[0]. */
937 "$Lt_8_14594:\n"
938 " .loc 3 195 0\n"
939 " mov.u32 %r9, 0;\n"
940 " setp.ne.u32 %p4, %r3, %r9;\n"
941 " @%p4 bra $Lt_8_15106;\n"
942 " .loc 3 199 0\n"
943 " ld.shared.f32 %f17, [__smem+0];\n"
944 " ld.param.u64 %rd11, [__cudaparm_reduce_float_64_true_g_odata];\n"
945 " cvt.u64.u32 %rd12, %r1;\n"
946 " mul.wide.u32 %rd13, %r1, 4;\n"
947 " add.u64 %rd14, %rd11, %rd13;\n"
948 " st.global.f32 [%rd14+0], %f17;\n"
949 "$Lt_8_15106:\n"
950 " .loc 3 405 0\n"
951 " exit;\n"
952 "$LDWend_reduce_float_64_true:\n"
953 " } // reduce_float_64_true\n"
954 "\n"
/*
 * PTX entry point reduce_float_128_true(g_idata, g_odata, n).
 *
 * Each thread sums 32-bit floats read from global memory at g_idata,
 * starting at element ctaid.x * 256 + tid.x and advancing by
 * nctaid.x * 256 elements per pass while that index is below n.  The
 * per-thread sums are combined through the shared array __smem in one
 * barrier-separated step followed by a barrier-free tail, and thread 0
 * stores __smem[0] to g_odata[ctaid.x].
 *
 * NOTE(review): the "128" and "true" suffixes presumably encode the block
 * size and a power-of-two flag from reduce.cu -- confirm against that file.
 */
955 " .entry reduce_float_128_true (\n"
956 " .param .u64 __cudaparm_reduce_float_128_true_g_idata,\n"
957 " .param .u64 __cudaparm_reduce_float_128_true_g_odata,\n"
958 " .param .u32 __cudaparm_reduce_float_128_true_n)\n"
959 " {\n"
960 " .reg .u16 %rh<3>;\n"
961 " .reg .u32 %r<12>;\n"
962 " .reg .u64 %rd<16>;\n"
963 " .reg .f32 %f<21>;\n"
964 " .reg .pred %p<7>;\n"
965 " .loc 3 407 0\n"
966 "$LDWbegin_reduce_float_128_true:\n"
/* r1 = ctaid.x, r3 = tid.x, r4 = r5 = r1 * 256 + r3 (first element index).
   If r4 >= n the load loop is skipped and the sum starts at 0. */
967 " .loc 3 181 0\n"
968 " cvt.u32.u16 %r1, %ctaid.x;\n"
969 " mul.lo.u32 %r2, %r1, 256;\n"
970 " cvt.u32.u16 %r3, %tid.x;\n"
971 " add.u32 %r4, %r2, %r3;\n"
972 " mov.s32 %r5, %r4;\n"
973 " ld.param.u32 %r6, [__cudaparm_reduce_float_128_true_n];\n"
974 " setp.ge.u32 %p1, %r4, %r6;\n"
975 " @%p1 bra $Lt_9_15874;\n"
/* r7 = nctaid.x * 256 is the element stride per pass; rd5 is the byte
   address of g_idata[r4] and rd6 the stride in bytes. */
976 " mov.u16 %rh1, %nctaid.x;\n"
977 " mul.wide.u16 %r7, %rh1, 256;\n"
978 " cvt.s64.u32 %rd1, %r7;\n"
979 " ld.param.u64 %rd2, [__cudaparm_reduce_float_128_true_g_idata];\n"
980 " cvt.u64.u32 %rd3, %r4;\n"
981 " mul.wide.u32 %rd4, %r4, 4;\n"
982 " add.u64 %rd5, %rd2, %rd4;\n"
983 " mul.wide.u32 %rd6, %r7, 4;\n"
984 " mov.f32 %f1, 0f00000000; // 0\n"
/* Load loop: two loads per pass, at rd5 and 512 bytes (128 floats) after
   it.  Only the first index (r5) is compared against n. */
985 "$Lt_9_13826:\n"
986 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
987 " .loc 3 188 0\n"
988 " ld.global.f32 %f2, [%rd5+0];\n"
989 " add.f32 %f3, %f2, %f1;\n"
990 " .loc 3 191 0\n"
991 " ld.global.f32 %f4, [%rd5+512];\n"
992 " add.f32 %f1, %f4, %f3;\n"
993 " add.u32 %r5, %r7, %r5;\n"
994 " add.u64 %rd5, %rd5, %rd6;\n"
995 " .loc 3 181 0\n"
996 " ld.param.u32 %r6, [__cudaparm_reduce_float_128_true_n];\n"
997 " .loc 3 191 0\n"
998 " setp.lt.u32 %p2, %r5, %r6;\n"
999 " @%p2 bra $Lt_9_13826;\n"
1000 " bra.uni $Lt_9_13314;\n"
1001 "$Lt_9_15874:\n"
1002 " mov.f32 %f1, 0f00000000; // 0\n"
/* f5/f6 hold this thread's running sum.  Store it to __smem[tid.x]
   (rd10 = __smem + tid.x * 4), then wait at the barrier. */
1003 "$Lt_9_13314:\n"
1004 " .loc 3 195 0\n"
1005 " mov.f32 %f5, %f1;\n"
1006 " mov.f32 %f6, %f5;\n"
1007 " .loc 3 71 0\n"
1008 " mov.u64 %rd7, __smem;\n"
1009 " cvt.u64.u32 %rd8, %r3;\n"
1010 " mul.wide.u32 %rd9, %r3, 4;\n"
1011 " add.u64 %rd10, %rd7, %rd9;\n"
1012 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1013 " .loc 3 72 0\n"
1014 " bar.sync 0;\n"
/* Threads with tid.x <= 63 add the shared value 256 bytes (64 floats)
   above their own slot; every thread then reaches the next bar.sync. */
1015 " mov.u32 %r8, 63;\n"
1016 " setp.gt.u32 %p3, %r3, %r8;\n"
1017 " @%p3 bra $Lt_9_14338;\n"
1018 " .loc 3 77 0\n"
1019 " ld.volatile.shared.f32 %f7, [%rd10+256];\n"
1020 " add.f32 %f6, %f7, %f5;\n"
1021 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1022 "$Lt_9_14338:\n"
1023 " bar.sync 0;\n"
/* Threads with tid.x <= 31 only: add the shared values at byte offsets
   +128, +64, +32, +16, +8 and +4 in turn, storing back after each add.
   No bar.sync separates these steps.
   NOTE(review): presumably relies on these 32 threads running in
   lockstep -- confirm for the hardware this PTX is loaded on. */
1024 " mov.u32 %r9, 31;\n"
1025 " setp.gt.u32 %p4, %r3, %r9;\n"
1026 " @%p4 bra $Lt_9_14850;\n"
1027 " .loc 3 83 0\n"
1028 " ld.volatile.shared.f32 %f8, [%rd10+128];\n"
1029 " add.f32 %f9, %f8, %f6;\n"
1030 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1031 " .loc 3 84 0\n"
1032 " ld.volatile.shared.f32 %f10, [%rd10+64];\n"
1033 " add.f32 %f11, %f10, %f9;\n"
1034 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1035 " .loc 3 85 0\n"
1036 " ld.volatile.shared.f32 %f12, [%rd10+32];\n"
1037 " add.f32 %f13, %f12, %f11;\n"
1038 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1039 " .loc 3 86 0\n"
1040 " ld.volatile.shared.f32 %f14, [%rd10+16];\n"
1041 " add.f32 %f15, %f14, %f13;\n"
1042 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
1043 " .loc 3 87 0\n"
1044 " ld.volatile.shared.f32 %f16, [%rd10+8];\n"
1045 " add.f32 %f17, %f16, %f15;\n"
1046 " st.volatile.shared.f32 [%rd10+0], %f17;\n"
1047 " .loc 3 88 0\n"
1048 " ld.volatile.shared.f32 %f18, [%rd10+4];\n"
1049 " add.f32 %f6, %f18, %f17;\n"
1050 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
/* Only thread 0 (tid.x == 0) writes: g_odata[ctaid.x] = __smem[0]. */
1051 "$Lt_9_14850:\n"
1052 " .loc 3 195 0\n"
1053 " mov.u32 %r10, 0;\n"
1054 " setp.ne.u32 %p5, %r3, %r10;\n"
1055 " @%p5 bra $Lt_9_15362;\n"
1056 " .loc 3 199 0\n"
1057 " ld.shared.f32 %f19, [__smem+0];\n"
1058 " ld.param.u64 %rd11, [__cudaparm_reduce_float_128_true_g_odata];\n"
1059 " cvt.u64.u32 %rd12, %r1;\n"
1060 " mul.wide.u32 %rd13, %r1, 4;\n"
1061 " add.u64 %rd14, %rd11, %rd13;\n"
1062 " st.global.f32 [%rd14+0], %f19;\n"
1063 "$Lt_9_15362:\n"
1064 " .loc 3 410 0\n"
1065 " exit;\n"
1066 "$LDWend_reduce_float_128_true:\n"
1067 " } // reduce_float_128_true\n"
1068 "\n"
/*
 * PTX entry point reduce_float_256_true(g_idata, g_odata, n).
 *
 * Each thread sums 32-bit floats read from global memory at g_idata,
 * starting at element ctaid.x * 512 + tid.x and advancing by
 * nctaid.x * 512 elements per pass while that index is below n.  The
 * per-thread sums are combined through the shared array __smem in two
 * barrier-separated steps followed by a barrier-free tail, and thread 0
 * stores __smem[0] to g_odata[ctaid.x].
 *
 * NOTE(review): the "256" and "true" suffixes presumably encode the block
 * size and a power-of-two flag from reduce.cu -- confirm against that file.
 */
1069 " .entry reduce_float_256_true (\n"
1070 " .param .u64 __cudaparm_reduce_float_256_true_g_idata,\n"
1071 " .param .u64 __cudaparm_reduce_float_256_true_g_odata,\n"
1072 " .param .u32 __cudaparm_reduce_float_256_true_n)\n"
1073 " {\n"
1074 " .reg .u16 %rh<3>;\n"
1075 " .reg .u32 %r<13>;\n"
1076 " .reg .u64 %rd<16>;\n"
1077 " .reg .f32 %f<22>;\n"
1078 " .reg .pred %p<8>;\n"
1079 " .loc 3 412 0\n"
1080 "$LDWbegin_reduce_float_256_true:\n"
/* r1 = ctaid.x, r3 = tid.x, r4 = r5 = r1 * 512 + r3 (first element index).
   If r4 >= n the load loop is skipped and the sum starts at 0. */
1081 " .loc 3 181 0\n"
1082 " cvt.u32.u16 %r1, %ctaid.x;\n"
1083 " mul.lo.u32 %r2, %r1, 512;\n"
1084 " cvt.u32.u16 %r3, %tid.x;\n"
1085 " add.u32 %r4, %r2, %r3;\n"
1086 " mov.s32 %r5, %r4;\n"
1087 " ld.param.u32 %r6, [__cudaparm_reduce_float_256_true_n];\n"
1088 " setp.ge.u32 %p1, %r4, %r6;\n"
1089 " @%p1 bra $Lt_10_16130;\n"
/* r7 = nctaid.x * 512 is the element stride per pass; rd5 is the byte
   address of g_idata[r4] and rd6 the stride in bytes. */
1090 " mov.u16 %rh1, %nctaid.x;\n"
1091 " mul.wide.u16 %r7, %rh1, 512;\n"
1092 " cvt.s64.u32 %rd1, %r7;\n"
1093 " ld.param.u64 %rd2, [__cudaparm_reduce_float_256_true_g_idata];\n"
1094 " cvt.u64.u32 %rd3, %r4;\n"
1095 " mul.wide.u32 %rd4, %r4, 4;\n"
1096 " add.u64 %rd5, %rd2, %rd4;\n"
1097 " mul.wide.u32 %rd6, %r7, 4;\n"
1098 " mov.f32 %f1, 0f00000000; // 0\n"
/* Load loop: two loads per pass, at rd5 and 1024 bytes (256 floats)
   after it.  Only the first index (r5) is compared against n. */
1099 "$Lt_10_13570:\n"
1100 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1101 " .loc 3 188 0\n"
1102 " ld.global.f32 %f2, [%rd5+0];\n"
1103 " add.f32 %f3, %f2, %f1;\n"
1104 " .loc 3 191 0\n"
1105 " ld.global.f32 %f4, [%rd5+1024];\n"
1106 " add.f32 %f1, %f4, %f3;\n"
1107 " add.u32 %r5, %r7, %r5;\n"
1108 " add.u64 %rd5, %rd5, %rd6;\n"
1109 " .loc 3 181 0\n"
1110 " ld.param.u32 %r6, [__cudaparm_reduce_float_256_true_n];\n"
1111 " .loc 3 191 0\n"
1112 " setp.lt.u32 %p2, %r5, %r6;\n"
1113 " @%p2 bra $Lt_10_13570;\n"
1114 " bra.uni $Lt_10_13058;\n"
1115 "$Lt_10_16130:\n"
1116 " mov.f32 %f1, 0f00000000; // 0\n"
/* f5/f6 hold this thread's running sum.  Store it to __smem[tid.x]
   (rd10 = __smem + tid.x * 4), then wait at the barrier. */
1117 "$Lt_10_13058:\n"
1118 " .loc 3 195 0\n"
1119 " mov.f32 %f5, %f1;\n"
1120 " mov.f32 %f6, %f5;\n"
1121 " .loc 3 71 0\n"
1122 " mov.u64 %rd7, __smem;\n"
1123 " cvt.u64.u32 %rd8, %r3;\n"
1124 " mul.wide.u32 %rd9, %r3, 4;\n"
1125 " add.u64 %rd10, %rd7, %rd9;\n"
1126 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1127 " .loc 3 72 0\n"
1128 " bar.sync 0;\n"
/* Threads with tid.x <= 127 add the shared value 512 bytes (128 floats)
   above their own slot; every thread then reaches the next bar.sync. */
1129 " mov.u32 %r8, 127;\n"
1130 " setp.gt.u32 %p3, %r3, %r8;\n"
1131 " @%p3 bra $Lt_10_14082;\n"
1132 " .loc 3 76 0\n"
1133 " ld.volatile.shared.f32 %f7, [%rd10+512];\n"
1134 " add.f32 %f6, %f7, %f5;\n"
1135 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1136 "$Lt_10_14082:\n"
1137 " bar.sync 0;\n"
/* Threads with tid.x <= 63 add the shared value 256 bytes (64 floats)
   above their own slot; every thread then reaches the next bar.sync. */
1138 " mov.u32 %r9, 63;\n"
1139 " setp.gt.u32 %p4, %r3, %r9;\n"
1140 " @%p4 bra $Lt_10_14594;\n"
1141 " .loc 3 77 0\n"
1142 " ld.volatile.shared.f32 %f8, [%rd10+256];\n"
1143 " add.f32 %f6, %f8, %f6;\n"
1144 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1145 "$Lt_10_14594:\n"
1146 " bar.sync 0;\n"
/* Threads with tid.x <= 31 only: add the shared values at byte offsets
   +128, +64, +32, +16, +8 and +4 in turn, storing back after each add.
   No bar.sync separates these steps.
   NOTE(review): presumably relies on these 32 threads running in
   lockstep -- confirm for the hardware this PTX is loaded on. */
1147 " mov.u32 %r10, 31;\n"
1148 " setp.gt.u32 %p5, %r3, %r10;\n"
1149 " @%p5 bra $Lt_10_15106;\n"
1150 " .loc 3 83 0\n"
1151 " ld.volatile.shared.f32 %f9, [%rd10+128];\n"
1152 " add.f32 %f10, %f9, %f6;\n"
1153 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
1154 " .loc 3 84 0\n"
1155 " ld.volatile.shared.f32 %f11, [%rd10+64];\n"
1156 " add.f32 %f12, %f11, %f10;\n"
1157 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
1158 " .loc 3 85 0\n"
1159 " ld.volatile.shared.f32 %f13, [%rd10+32];\n"
1160 " add.f32 %f14, %f13, %f12;\n"
1161 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
1162 " .loc 3 86 0\n"
1163 " ld.volatile.shared.f32 %f15, [%rd10+16];\n"
1164 " add.f32 %f16, %f15, %f14;\n"
1165 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
1166 " .loc 3 87 0\n"
1167 " ld.volatile.shared.f32 %f17, [%rd10+8];\n"
1168 " add.f32 %f18, %f17, %f16;\n"
1169 " st.volatile.shared.f32 [%rd10+0], %f18;\n"
1170 " .loc 3 88 0\n"
1171 " ld.volatile.shared.f32 %f19, [%rd10+4];\n"
1172 " add.f32 %f6, %f19, %f18;\n"
1173 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
/* Only thread 0 (tid.x == 0) writes: g_odata[ctaid.x] = __smem[0]. */
1174 "$Lt_10_15106:\n"
1175 " .loc 3 195 0\n"
1176 " mov.u32 %r11, 0;\n"
1177 " setp.ne.u32 %p6, %r3, %r11;\n"
1178 " @%p6 bra $Lt_10_15618;\n"
1179 " .loc 3 199 0\n"
1180 " ld.shared.f32 %f20, [__smem+0];\n"
1181 " ld.param.u64 %rd11, [__cudaparm_reduce_float_256_true_g_odata];\n"
1182 " cvt.u64.u32 %rd12, %r1;\n"
1183 " mul.wide.u32 %rd13, %r1, 4;\n"
1184 " add.u64 %rd14, %rd11, %rd13;\n"
1185 " st.global.f32 [%rd14+0], %f20;\n"
1186 "$Lt_10_15618:\n"
1187 " .loc 3 415 0\n"
1188 " exit;\n"
1189 "$LDWend_reduce_float_256_true:\n"
1190 " } // reduce_float_256_true\n"
1191 "\n"
/*
 * PTX entry point reduce_float_512_true(g_idata, g_odata, n).
 *
 * Each thread sums 32-bit floats read from global memory at g_idata,
 * starting at element ctaid.x * 1024 + tid.x and advancing by
 * nctaid.x * 1024 elements per pass while that index is below n.  The
 * per-thread sums are combined through the shared array __smem in three
 * barrier-separated steps followed by a barrier-free tail, and thread 0
 * stores __smem[0] to g_odata[ctaid.x].
 *
 * NOTE(review): the "512" and "true" suffixes presumably encode the block
 * size and a power-of-two flag from reduce.cu -- confirm against that file.
 */
1192 " .entry reduce_float_512_true (\n"
1193 " .param .u64 __cudaparm_reduce_float_512_true_g_idata,\n"
1194 " .param .u64 __cudaparm_reduce_float_512_true_g_odata,\n"
1195 " .param .u32 __cudaparm_reduce_float_512_true_n)\n"
1196 " {\n"
1197 " .reg .u16 %rh<3>;\n"
1198 " .reg .u32 %r<14>;\n"
1199 " .reg .u64 %rd<16>;\n"
1200 " .reg .f32 %f<23>;\n"
1201 " .reg .pred %p<9>;\n"
1202 " .loc 3 417 0\n"
1203 "$LDWbegin_reduce_float_512_true:\n"
/* r1 = ctaid.x, r3 = tid.x, r4 = r5 = r1 * 1024 + r3 (first element
   index).  If r4 >= n the load loop is skipped and the sum starts at 0. */
1204 " .loc 3 181 0\n"
1205 " cvt.u32.u16 %r1, %ctaid.x;\n"
1206 " mul.lo.u32 %r2, %r1, 1024;\n"
1207 " cvt.u32.u16 %r3, %tid.x;\n"
1208 " add.u32 %r4, %r2, %r3;\n"
1209 " mov.s32 %r5, %r4;\n"
1210 " ld.param.u32 %r6, [__cudaparm_reduce_float_512_true_n];\n"
1211 " setp.ge.u32 %p1, %r4, %r6;\n"
1212 " @%p1 bra $Lt_11_16386;\n"
/* r7 = nctaid.x * 1024 is the element stride per pass; rd5 is the byte
   address of g_idata[r4] and rd6 the stride in bytes. */
1213 " mov.u16 %rh1, %nctaid.x;\n"
1214 " mul.wide.u16 %r7, %rh1, 1024;\n"
1215 " cvt.s64.u32 %rd1, %r7;\n"
1216 " ld.param.u64 %rd2, [__cudaparm_reduce_float_512_true_g_idata];\n"
1217 " cvt.u64.u32 %rd3, %r4;\n"
1218 " mul.wide.u32 %rd4, %r4, 4;\n"
1219 " add.u64 %rd5, %rd2, %rd4;\n"
1220 " mul.wide.u32 %rd6, %r7, 4;\n"
1221 " mov.f32 %f1, 0f00000000; // 0\n"
/* Load loop: two loads per pass, at rd5 and 2048 bytes (512 floats)
   after it.  Only the first index (r5) is compared against n. */
1222 "$Lt_11_13314:\n"
1223 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1224 " .loc 3 188 0\n"
1225 " ld.global.f32 %f2, [%rd5+0];\n"
1226 " add.f32 %f3, %f2, %f1;\n"
1227 " .loc 3 191 0\n"
1228 " ld.global.f32 %f4, [%rd5+2048];\n"
1229 " add.f32 %f1, %f4, %f3;\n"
1230 " add.u32 %r5, %r7, %r5;\n"
1231 " add.u64 %rd5, %rd5, %rd6;\n"
1232 " .loc 3 181 0\n"
1233 " ld.param.u32 %r6, [__cudaparm_reduce_float_512_true_n];\n"
1234 " .loc 3 191 0\n"
1235 " setp.lt.u32 %p2, %r5, %r6;\n"
1236 " @%p2 bra $Lt_11_13314;\n"
1237 " bra.uni $Lt_11_12802;\n"
1238 "$Lt_11_16386:\n"
1239 " mov.f32 %f1, 0f00000000; // 0\n"
/* f5/f6 hold this thread's running sum.  Store it to __smem[tid.x]
   (rd10 = __smem + tid.x * 4), then wait at the barrier. */
1240 "$Lt_11_12802:\n"
1241 " .loc 3 195 0\n"
1242 " mov.f32 %f5, %f1;\n"
1243 " mov.f32 %f6, %f5;\n"
1244 " .loc 3 71 0\n"
1245 " mov.u64 %rd7, __smem;\n"
1246 " cvt.u64.u32 %rd8, %r3;\n"
1247 " mul.wide.u32 %rd9, %r3, 4;\n"
1248 " add.u64 %rd10, %rd7, %rd9;\n"
1249 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1250 " .loc 3 72 0\n"
1251 " bar.sync 0;\n"
/* Threads with tid.x <= 255 add the shared value 1024 bytes (256 floats)
   above their own slot; every thread then reaches the next bar.sync. */
1252 " mov.u32 %r8, 255;\n"
1253 " setp.gt.u32 %p3, %r3, %r8;\n"
1254 " @%p3 bra $Lt_11_13826;\n"
1255 " .loc 3 75 0\n"
1256 " ld.volatile.shared.f32 %f7, [%rd10+1024];\n"
1257 " add.f32 %f6, %f7, %f5;\n"
1258 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1259 "$Lt_11_13826:\n"
1260 " bar.sync 0;\n"
/* Threads with tid.x <= 127 add the shared value 512 bytes (128 floats)
   above their own slot; every thread then reaches the next bar.sync. */
1261 " mov.u32 %r9, 127;\n"
1262 " setp.gt.u32 %p4, %r3, %r9;\n"
1263 " @%p4 bra $Lt_11_14338;\n"
1264 " .loc 3 76 0\n"
1265 " ld.volatile.shared.f32 %f8, [%rd10+512];\n"
1266 " add.f32 %f6, %f8, %f6;\n"
1267 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1268 "$Lt_11_14338:\n"
1269 " bar.sync 0;\n"
/* Threads with tid.x <= 63 add the shared value 256 bytes (64 floats)
   above their own slot; every thread then reaches the next bar.sync. */
1270 " mov.u32 %r10, 63;\n"
1271 " setp.gt.u32 %p5, %r3, %r10;\n"
1272 " @%p5 bra $Lt_11_14850;\n"
1273 " .loc 3 77 0\n"
1274 " ld.volatile.shared.f32 %f9, [%rd10+256];\n"
1275 " add.f32 %f6, %f9, %f6;\n"
1276 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1277 "$Lt_11_14850:\n"
1278 " bar.sync 0;\n"
/* Threads with tid.x <= 31 only: add the shared values at byte offsets
   +128, +64, +32, +16, +8 and +4 in turn, storing back after each add.
   No bar.sync separates these steps.
   NOTE(review): presumably relies on these 32 threads running in
   lockstep -- confirm for the hardware this PTX is loaded on. */
1279 " mov.u32 %r11, 31;\n"
1280 " setp.gt.u32 %p6, %r3, %r11;\n"
1281 " @%p6 bra $Lt_11_15362;\n"
1282 " .loc 3 83 0\n"
1283 " ld.volatile.shared.f32 %f10, [%rd10+128];\n"
1284 " add.f32 %f11, %f10, %f6;\n"
1285 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1286 " .loc 3 84 0\n"
1287 " ld.volatile.shared.f32 %f12, [%rd10+64];\n"
1288 " add.f32 %f13, %f12, %f11;\n"
1289 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1290 " .loc 3 85 0\n"
1291 " ld.volatile.shared.f32 %f14, [%rd10+32];\n"
1292 " add.f32 %f15, %f14, %f13;\n"
1293 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
1294 " .loc 3 86 0\n"
1295 " ld.volatile.shared.f32 %f16, [%rd10+16];\n"
1296 " add.f32 %f17, %f16, %f15;\n"
1297 " st.volatile.shared.f32 [%rd10+0], %f17;\n"
1298 " .loc 3 87 0\n"
1299 " ld.volatile.shared.f32 %f18, [%rd10+8];\n"
1300 " add.f32 %f19, %f18, %f17;\n"
1301 " st.volatile.shared.f32 [%rd10+0], %f19;\n"
1302 " .loc 3 88 0\n"
1303 " ld.volatile.shared.f32 %f20, [%rd10+4];\n"
1304 " add.f32 %f6, %f20, %f19;\n"
1305 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
/* Only thread 0 (tid.x == 0) writes: g_odata[ctaid.x] = __smem[0]. */
1306 "$Lt_11_15362:\n"
1307 " .loc 3 195 0\n"
1308 " mov.u32 %r12, 0;\n"
1309 " setp.ne.u32 %p7, %r3, %r12;\n"
1310 " @%p7 bra $Lt_11_15874;\n"
1311 " .loc 3 199 0\n"
1312 " ld.shared.f32 %f21, [__smem+0];\n"
1313 " ld.param.u64 %rd11, [__cudaparm_reduce_float_512_true_g_odata];\n"
1314 " cvt.u64.u32 %rd12, %r1;\n"
1315 " mul.wide.u32 %rd13, %r1, 4;\n"
1316 " add.u64 %rd14, %rd11, %rd13;\n"
1317 " st.global.f32 [%rd14+0], %f21;\n"
1318 "$Lt_11_15874:\n"
1319 " .loc 3 420 0\n"
1320 " exit;\n"
1321 "$LDWend_reduce_float_512_true:\n"
1322 " } // reduce_float_512_true\n"
1323 "\n"
/*
 * PTX entry point reduce_float_1_false(g_idata, g_odata, n).
 *
 * Each thread sums 32-bit floats read from global memory at g_idata,
 * starting at element ctaid.x * 2 + tid.x and advancing by nctaid.x * 2
 * elements per pass.  Unlike the "_true" entry points in this string, the
 * second load of each pass is guarded by its own comparison against n.
 * The sum is stored to the shared array __smem, and after one barrier
 * thread 0 stores __smem[0] to g_odata[ctaid.x]; there is no shared-memory
 * combining step in this variant.
 *
 * NOTE(review): the "1" and "false" suffixes presumably encode the block
 * size and a power-of-two flag from reduce.cu -- confirm against that file.
 */
1324 " .entry reduce_float_1_false (\n"
1325 " .param .u64 __cudaparm_reduce_float_1_false_g_idata,\n"
1326 " .param .u64 __cudaparm_reduce_float_1_false_g_odata,\n"
1327 " .param .u32 __cudaparm_reduce_float_1_false_n)\n"
1328 " {\n"
1329 " .reg .u16 %rh<3>;\n"
1330 " .reg .u32 %r<11>;\n"
1331 " .reg .u64 %rd<16>;\n"
1332 " .reg .f32 %f<6>;\n"
1333 " .reg .pred %p<6>;\n"
1334 " .loc 3 423 0\n"
1335 "$LDWbegin_reduce_float_1_false:\n"
/* r1 = ctaid.x, r3 = tid.x, r4 = r1 * 2 + r3 (first element index).
   If r4 >= n the load loop is skipped and the sum starts at 0. */
1336 " .loc 3 181 0\n"
1337 " cvt.u32.u16 %r1, %ctaid.x;\n"
1338 " mul24.lo.u32 %r2, %r1, 2;\n"
1339 " cvt.u32.u16 %r3, %tid.x;\n"
1340 " add.u32 %r4, %r2, %r3;\n"
1341 " ld.param.u32 %r5, [__cudaparm_reduce_float_1_false_n];\n"
1342 " setp.ge.u32 %p1, %r4, %r5;\n"
1343 " @%p1 bra $Lt_12_17154;\n"
/* r6 = r4 + 1 is the index of the second element of each pass and
   r7 = n + 1 the loop bound for r6; r8 = nctaid.x * 2 is the element
   stride, rd5 the byte address of g_idata[r4], rd6 the stride in bytes. */
1344 " add.u32 %r6, %r4, 1;\n"
1345 " ld.param.u32 %r5, [__cudaparm_reduce_float_1_false_n];\n"
1346 " add.u32 %r7, %r5, 1;\n"
1347 " mov.u16 %rh1, %nctaid.x;\n"
1348 " mul.wide.u16 %r8, %rh1, 2;\n"
1349 " cvt.s64.u32 %rd1, %r8;\n"
1350 " ld.param.u64 %rd2, [__cudaparm_reduce_float_1_false_g_idata];\n"
1351 " cvt.u64.u32 %rd3, %r4;\n"
1352 " mul.wide.u32 %rd4, %r4, 4;\n"
1353 " add.u64 %rd5, %rd2, %rd4;\n"
1354 " mul.wide.u32 %rd6, %r8, 4;\n"
1355 " mov.f32 %f1, 0f00000000; // 0\n"
/* Load loop: always add the value at rd5; add the value 4 bytes after it
   only when r6 < n.  Repeat while the advanced r6 is below r7. */
1356 "$Lt_12_15618:\n"
1357 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1358 " .loc 3 188 0\n"
1359 " ld.global.f32 %f2, [%rd5+0];\n"
1360 " add.f32 %f1, %f2, %f1;\n"
1361 " .loc 3 181 0\n"
1362 " ld.param.u32 %r5, [__cudaparm_reduce_float_1_false_n];\n"
1363 " .loc 3 188 0\n"
1364 " setp.ge.u32 %p2, %r6, %r5;\n"
1365 " @%p2 bra $Lt_12_15874;\n"
1366 " //<loop> Part of loop body line 181, head labeled $Lt_12_15618\n"
1367 " .loc 3 191 0\n"
1368 " ld.global.f32 %f3, [%rd5+4];\n"
1369 " add.f32 %f1, %f3, %f1;\n"
1370 "$Lt_12_15874:\n"
1371 " //<loop> Part of loop body line 181, head labeled $Lt_12_15618\n"
1372 " add.u32 %r6, %r6, %r8;\n"
1373 " add.u64 %rd5, %rd5, %rd6;\n"
1374 " setp.lt.u32 %p3, %r6, %r7;\n"
1375 " @%p3 bra $Lt_12_15618;\n"
1376 " bra.uni $Lt_12_15106;\n"
1377 "$Lt_12_17154:\n"
1378 " mov.f32 %f1, 0f00000000; // 0\n"
/* Store this thread's sum to __smem[tid.x] (rd10 = __smem + tid.x * 4),
   then wait at the barrier. */
1379 "$Lt_12_15106:\n"
1380 " .loc 3 71 0\n"
1381 " mov.u64 %rd7, __smem;\n"
1382 " cvt.u64.u32 %rd8, %r3;\n"
1383 " mul.wide.u32 %rd9, %r3, 4;\n"
1384 " add.u64 %rd10, %rd7, %rd9;\n"
1385 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1386 " .loc 3 72 0\n"
1387 " bar.sync 0;\n"
/* Only thread 0 (tid.x == 0) writes: g_odata[ctaid.x] = __smem[0]. */
1388 " .loc 3 195 0\n"
1389 " mov.u32 %r9, 0;\n"
1390 " setp.ne.u32 %p4, %r3, %r9;\n"
1391 " @%p4 bra $Lt_12_16642;\n"
1392 " .loc 3 199 0\n"
1393 " ld.shared.f32 %f4, [__smem+0];\n"
1394 " ld.param.u64 %rd11, [__cudaparm_reduce_float_1_false_g_odata];\n"
1395 " cvt.u64.u32 %rd12, %r1;\n"
1396 " mul.wide.u32 %rd13, %r1, 4;\n"
1397 " add.u64 %rd14, %rd11, %rd13;\n"
1398 " st.global.f32 [%rd14+0], %f4;\n"
1399 "$Lt_12_16642:\n"
1400 " .loc 3 426 0\n"
1401 " exit;\n"
1402 "$LDWend_reduce_float_1_false:\n"
1403 " } // reduce_float_1_false\n"
1404 "\n"
/*
 * PTX entry point reduce_float_2_false(g_idata, g_odata, n).
 *
 * Each thread sums 32-bit floats read from global memory at g_idata,
 * starting at element ctaid.x * 4 + tid.x and advancing by nctaid.x * 4
 * elements per pass.  The second load of each pass is guarded by its own
 * comparison against n.  The per-thread sums are combined through the
 * shared array __smem, and thread 0 stores __smem[0] to g_odata[ctaid.x].
 *
 * NOTE(review): the "2" and "false" suffixes presumably encode the block
 * size and a power-of-two flag from reduce.cu -- confirm against that file.
 */
1405 " .entry reduce_float_2_false (\n"
1406 " .param .u64 __cudaparm_reduce_float_2_false_g_idata,\n"
1407 " .param .u64 __cudaparm_reduce_float_2_false_g_odata,\n"
1408 " .param .u32 __cudaparm_reduce_float_2_false_n)\n"
1409 " {\n"
1410 " .reg .u16 %rh<3>;\n"
1411 " .reg .u32 %r<12>;\n"
1412 " .reg .u64 %rd<16>;\n"
1413 " .reg .f32 %f<8>;\n"
1414 " .reg .pred %p<7>;\n"
1415 " .loc 3 428 0\n"
1416 "$LDWbegin_reduce_float_2_false:\n"
/* r1 = ctaid.x, r3 = tid.x, r4 = r1 * 4 + r3 (first element index).
   If r4 >= n the load loop is skipped and the sum starts at 0. */
1417 " .loc 3 181 0\n"
1418 " cvt.u32.u16 %r1, %ctaid.x;\n"
1419 " mul24.lo.u32 %r2, %r1, 4;\n"
1420 " cvt.u32.u16 %r3, %tid.x;\n"
1421 " add.u32 %r4, %r2, %r3;\n"
1422 " ld.param.u32 %r5, [__cudaparm_reduce_float_2_false_n];\n"
1423 " setp.ge.u32 %p1, %r4, %r5;\n"
1424 " @%p1 bra $Lt_13_17410;\n"
/* r6 = r4 + 2 is the index of the second element of each pass and
   r7 = n + 2 the loop bound for r6; r8 = nctaid.x * 4 is the element
   stride, rd5 the byte address of g_idata[r4], rd6 the stride in bytes. */
1425 " add.u32 %r6, %r4, 2;\n"
1426 " ld.param.u32 %r5, [__cudaparm_reduce_float_2_false_n];\n"
1427 " add.u32 %r7, %r5, 2;\n"
1428 " mov.u16 %rh1, %nctaid.x;\n"
1429 " mul.wide.u16 %r8, %rh1, 4;\n"
1430 " cvt.s64.u32 %rd1, %r8;\n"
1431 " ld.param.u64 %rd2, [__cudaparm_reduce_float_2_false_g_idata];\n"
1432 " cvt.u64.u32 %rd3, %r4;\n"
1433 " mul.wide.u32 %rd4, %r4, 4;\n"
1434 " add.u64 %rd5, %rd2, %rd4;\n"
1435 " mul.wide.u32 %rd6, %r8, 4;\n"
1436 " mov.f32 %f1, 0f00000000; // 0\n"
/* Load loop: always add the value at rd5; add the value 8 bytes
   (2 floats) after it only when r6 < n.  Repeat while the advanced r6 is
   below r7. */
1437 "$Lt_13_15362:\n"
1438 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1439 " .loc 3 188 0\n"
1440 " ld.global.f32 %f2, [%rd5+0];\n"
1441 " add.f32 %f1, %f2, %f1;\n"
1442 " .loc 3 181 0\n"
1443 " ld.param.u32 %r5, [__cudaparm_reduce_float_2_false_n];\n"
1444 " .loc 3 188 0\n"
1445 " setp.ge.u32 %p2, %r6, %r5;\n"
1446 " @%p2 bra $Lt_13_15618;\n"
1447 " //<loop> Part of loop body line 181, head labeled $Lt_13_15362\n"
1448 " .loc 3 191 0\n"
1449 " ld.global.f32 %f3, [%rd5+8];\n"
1450 " add.f32 %f1, %f3, %f1;\n"
1451 "$Lt_13_15618:\n"
1452 " //<loop> Part of loop body line 181, head labeled $Lt_13_15362\n"
1453 " add.u32 %r6, %r6, %r8;\n"
1454 " add.u64 %rd5, %rd5, %rd6;\n"
1455 " setp.lt.u32 %p3, %r6, %r7;\n"
1456 " @%p3 bra $Lt_13_15362;\n"
1457 " bra.uni $Lt_13_14850;\n"
1458 "$Lt_13_17410:\n"
1459 " mov.f32 %f1, 0f00000000; // 0\n"
/* Store this thread's sum to __smem[tid.x] (rd10 = __smem + tid.x * 4),
   then wait at the barrier. */
1460 "$Lt_13_14850:\n"
1461 " .loc 3 71 0\n"
1462 " mov.u64 %rd7, __smem;\n"
1463 " cvt.u64.u32 %rd8, %r3;\n"
1464 " mul.wide.u32 %rd9, %r3, 4;\n"
1465 " add.u64 %rd10, %rd7, %rd9;\n"
1466 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1467 " .loc 3 72 0\n"
1468 " bar.sync 0;\n"
/* Threads with tid.x <= 31: add the shared value 4 bytes above their own
   slot, i.e. __smem[tid.x + 1], and store the result back.
   NOTE(review): every thread that runs this reads one element past its
   own slot; presumably the launch sizes __smem to cover that -- confirm
   against the host code. */
1469 " mov.u32 %r9, 31;\n"
1470 " setp.gt.u32 %p4, %r3, %r9;\n"
1471 " @%p4 bra $Lt_13_16386;\n"
1472 " .loc 3 88 0\n"
1473 " ld.volatile.shared.f32 %f4, [%rd10+4];\n"
1474 " add.f32 %f5, %f4, %f1;\n"
1475 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
/* Only thread 0 (tid.x == 0) writes: g_odata[ctaid.x] = __smem[0]. */
1476 "$Lt_13_16386:\n"
1477 " .loc 3 195 0\n"
1478 " mov.u32 %r10, 0;\n"
1479 " setp.ne.u32 %p5, %r3, %r10;\n"
1480 " @%p5 bra $Lt_13_16898;\n"
1481 " .loc 3 199 0\n"
1482 " ld.shared.f32 %f6, [__smem+0];\n"
1483 " ld.param.u64 %rd11, [__cudaparm_reduce_float_2_false_g_odata];\n"
1484 " cvt.u64.u32 %rd12, %r1;\n"
1485 " mul.wide.u32 %rd13, %r1, 4;\n"
1486 " add.u64 %rd14, %rd11, %rd13;\n"
1487 " st.global.f32 [%rd14+0], %f6;\n"
1488 "$Lt_13_16898:\n"
1489 " .loc 3 431 0\n"
1490 " exit;\n"
1491 "$LDWend_reduce_float_2_false:\n"
1492 " } // reduce_float_2_false\n"
1493 "\n"
/*
 * PTX entry point reduce_float_4_false(g_idata, g_odata, n).
 *
 * Each thread sums 32-bit floats read from global memory at g_idata,
 * starting at element ctaid.x * 8 + tid.x and advancing by nctaid.x * 8
 * elements per pass.  The second load of each pass is guarded by its own
 * comparison against n.  The per-thread sums are combined through the
 * shared array __smem, and thread 0 stores __smem[0] to g_odata[ctaid.x].
 *
 * NOTE(review): the "4" and "false" suffixes presumably encode the block
 * size and a power-of-two flag from reduce.cu -- confirm against that file.
 */
1494 " .entry reduce_float_4_false (\n"
1495 " .param .u64 __cudaparm_reduce_float_4_false_g_idata,\n"
1496 " .param .u64 __cudaparm_reduce_float_4_false_g_odata,\n"
1497 " .param .u32 __cudaparm_reduce_float_4_false_n)\n"
1498 " {\n"
1499 " .reg .u16 %rh<3>;\n"
1500 " .reg .u32 %r<12>;\n"
1501 " .reg .u64 %rd<16>;\n"
1502 " .reg .f32 %f<10>;\n"
1503 " .reg .pred %p<7>;\n"
1504 " .loc 3 433 0\n"
1505 "$LDWbegin_reduce_float_4_false:\n"
/* r1 = ctaid.x, r3 = tid.x, r4 = r1 * 8 + r3 (first element index).
   If r4 >= n the load loop is skipped and the sum starts at 0. */
1506 " .loc 3 181 0\n"
1507 " cvt.u32.u16 %r1, %ctaid.x;\n"
1508 " mul24.lo.u32 %r2, %r1, 8;\n"
1509 " cvt.u32.u16 %r3, %tid.x;\n"
1510 " add.u32 %r4, %r2, %r3;\n"
1511 " ld.param.u32 %r5, [__cudaparm_reduce_float_4_false_n];\n"
1512 " setp.ge.u32 %p1, %r4, %r5;\n"
1513 " @%p1 bra $Lt_14_17154;\n"
/* r6 = r4 + 4 is the index of the second element of each pass and
   r7 = n + 4 the loop bound for r6; r8 = nctaid.x * 8 is the element
   stride, rd5 the byte address of g_idata[r4], rd6 the stride in bytes. */
1514 " add.u32 %r6, %r4, 4;\n"
1515 " ld.param.u32 %r5, [__cudaparm_reduce_float_4_false_n];\n"
1516 " add.u32 %r7, %r5, 4;\n"
1517 " mov.u16 %rh1, %nctaid.x;\n"
1518 " mul.wide.u16 %r8, %rh1, 8;\n"
1519 " cvt.s64.u32 %rd1, %r8;\n"
1520 " ld.param.u64 %rd2, [__cudaparm_reduce_float_4_false_g_idata];\n"
1521 " cvt.u64.u32 %rd3, %r4;\n"
1522 " mul.wide.u32 %rd4, %r4, 4;\n"
1523 " add.u64 %rd5, %rd2, %rd4;\n"
1524 " mul.wide.u32 %rd6, %r8, 4;\n"
1525 " mov.f32 %f1, 0f00000000; // 0\n"
/* Load loop: always add the value at rd5; add the value 16 bytes
   (4 floats) after it only when r6 < n.  Repeat while the advanced r6 is
   below r7. */
1526 "$Lt_14_15106:\n"
1527 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1528 " .loc 3 188 0\n"
1529 " ld.global.f32 %f2, [%rd5+0];\n"
1530 " add.f32 %f1, %f2, %f1;\n"
1531 " .loc 3 181 0\n"
1532 " ld.param.u32 %r5, [__cudaparm_reduce_float_4_false_n];\n"
1533 " .loc 3 188 0\n"
1534 " setp.ge.u32 %p2, %r6, %r5;\n"
1535 " @%p2 bra $Lt_14_15362;\n"
1536 " //<loop> Part of loop body line 181, head labeled $Lt_14_15106\n"
1537 " .loc 3 191 0\n"
1538 " ld.global.f32 %f3, [%rd5+16];\n"
1539 " add.f32 %f1, %f3, %f1;\n"
1540 "$Lt_14_15362:\n"
1541 " //<loop> Part of loop body line 181, head labeled $Lt_14_15106\n"
1542 " add.u32 %r6, %r6, %r8;\n"
1543 " add.u64 %rd5, %rd5, %rd6;\n"
1544 " setp.lt.u32 %p3, %r6, %r7;\n"
1545 " @%p3 bra $Lt_14_15106;\n"
1546 " bra.uni $Lt_14_14594;\n"
1547 "$Lt_14_17154:\n"
1548 " mov.f32 %f1, 0f00000000; // 0\n"
/* Store this thread's sum to __smem[tid.x] (rd10 = __smem + tid.x * 4),
   then wait at the barrier. */
1549 "$Lt_14_14594:\n"
1550 " .loc 3 71 0\n"
1551 " mov.u64 %rd7, __smem;\n"
1552 " cvt.u64.u32 %rd8, %r3;\n"
1553 " mul.wide.u32 %rd9, %r3, 4;\n"
1554 " add.u64 %rd10, %rd7, %rd9;\n"
1555 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1556 " .loc 3 72 0\n"
1557 " bar.sync 0;\n"
/* Threads with tid.x <= 31: add the shared values at byte offsets +8 and
   +4 in turn, storing back after each add.  No bar.sync separates the
   two steps.
   NOTE(review): every thread that runs this reads up to two elements
   past its own slot; presumably the launch sizes __smem to cover that,
   and the threads run in lockstep -- confirm against the host code. */
1558 " mov.u32 %r9, 31;\n"
1559 " setp.gt.u32 %p4, %r3, %r9;\n"
1560 " @%p4 bra $Lt_14_16130;\n"
1561 " .loc 3 87 0\n"
1562 " ld.volatile.shared.f32 %f4, [%rd10+8];\n"
1563 " add.f32 %f5, %f4, %f1;\n"
1564 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1565 " .loc 3 88 0\n"
1566 " ld.volatile.shared.f32 %f6, [%rd10+4];\n"
1567 " add.f32 %f7, %f6, %f5;\n"
1568 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
/* Only thread 0 (tid.x == 0) writes: g_odata[ctaid.x] = __smem[0]. */
1569 "$Lt_14_16130:\n"
1570 " .loc 3 195 0\n"
1571 " mov.u32 %r10, 0;\n"
1572 " setp.ne.u32 %p5, %r3, %r10;\n"
1573 " @%p5 bra $Lt_14_16642;\n"
1574 " .loc 3 199 0\n"
1575 " ld.shared.f32 %f8, [__smem+0];\n"
1576 " ld.param.u64 %rd11, [__cudaparm_reduce_float_4_false_g_odata];\n"
1577 " cvt.u64.u32 %rd12, %r1;\n"
1578 " mul.wide.u32 %rd13, %r1, 4;\n"
1579 " add.u64 %rd14, %rd11, %rd13;\n"
1580 " st.global.f32 [%rd14+0], %f8;\n"
1581 "$Lt_14_16642:\n"
1582 " .loc 3 436 0\n"
1583 " exit;\n"
1584 "$LDWend_reduce_float_4_false:\n"
1585 " } // reduce_float_4_false\n"
1586 "\n"
/*
 * PTX entry point reduce_float_8_false(g_idata, g_odata, n).
 *
 * Each thread sums 32-bit floats read from global memory at g_idata,
 * starting at element ctaid.x * 16 + tid.x and advancing by nctaid.x * 16
 * elements per pass.  The second load of each pass is guarded by its own
 * comparison against n.  The per-thread sums are combined through the
 * shared array __smem, and thread 0 stores __smem[0] to g_odata[ctaid.x].
 *
 * NOTE(review): the "8" and "false" suffixes presumably encode the block
 * size and a power-of-two flag from reduce.cu -- confirm against that file.
 */
1587 " .entry reduce_float_8_false (\n"
1588 " .param .u64 __cudaparm_reduce_float_8_false_g_idata,\n"
1589 " .param .u64 __cudaparm_reduce_float_8_false_g_odata,\n"
1590 " .param .u32 __cudaparm_reduce_float_8_false_n)\n"
1591 " {\n"
1592 " .reg .u16 %rh<3>;\n"
1593 " .reg .u32 %r<12>;\n"
1594 " .reg .u64 %rd<16>;\n"
1595 " .reg .f32 %f<12>;\n"
1596 " .reg .pred %p<7>;\n"
1597 " .loc 3 438 0\n"
1598 "$LDWbegin_reduce_float_8_false:\n"
/* r1 = ctaid.x, r3 = tid.x, r4 = r1 * 16 + r3 (first element index).
   If r4 >= n the load loop is skipped and the sum starts at 0. */
1599 " .loc 3 181 0\n"
1600 " cvt.u32.u16 %r1, %ctaid.x;\n"
1601 " mul24.lo.u32 %r2, %r1, 16;\n"
1602 " cvt.u32.u16 %r3, %tid.x;\n"
1603 " add.u32 %r4, %r2, %r3;\n"
1604 " ld.param.u32 %r5, [__cudaparm_reduce_float_8_false_n];\n"
1605 " setp.ge.u32 %p1, %r4, %r5;\n"
1606 " @%p1 bra $Lt_15_16898;\n"
/* r6 = r4 + 8 is the index of the second element of each pass and
   r7 = n + 8 the loop bound for r6; r8 = nctaid.x * 16 is the element
   stride, rd5 the byte address of g_idata[r4], rd6 the stride in bytes. */
1607 " add.u32 %r6, %r4, 8;\n"
1608 " ld.param.u32 %r5, [__cudaparm_reduce_float_8_false_n];\n"
1609 " add.u32 %r7, %r5, 8;\n"
1610 " mov.u16 %rh1, %nctaid.x;\n"
1611 " mul.wide.u16 %r8, %rh1, 16;\n"
1612 " cvt.s64.u32 %rd1, %r8;\n"
1613 " ld.param.u64 %rd2, [__cudaparm_reduce_float_8_false_g_idata];\n"
1614 " cvt.u64.u32 %rd3, %r4;\n"
1615 " mul.wide.u32 %rd4, %r4, 4;\n"
1616 " add.u64 %rd5, %rd2, %rd4;\n"
1617 " mul.wide.u32 %rd6, %r8, 4;\n"
1618 " mov.f32 %f1, 0f00000000; // 0\n"
/* Load loop: always add the value at rd5; add the value 32 bytes
   (8 floats) after it only when r6 < n.  Repeat while the advanced r6 is
   below r7. */
1619 "$Lt_15_14850:\n"
1620 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1621 " .loc 3 188 0\n"
1622 " ld.global.f32 %f2, [%rd5+0];\n"
1623 " add.f32 %f1, %f2, %f1;\n"
1624 " .loc 3 181 0\n"
1625 " ld.param.u32 %r5, [__cudaparm_reduce_float_8_false_n];\n"
1626 " .loc 3 188 0\n"
1627 " setp.ge.u32 %p2, %r6, %r5;\n"
1628 " @%p2 bra $Lt_15_15106;\n"
1629 " //<loop> Part of loop body line 181, head labeled $Lt_15_14850\n"
1630 " .loc 3 191 0\n"
1631 " ld.global.f32 %f3, [%rd5+32];\n"
1632 " add.f32 %f1, %f3, %f1;\n"
1633 "$Lt_15_15106:\n"
1634 " //<loop> Part of loop body line 181, head labeled $Lt_15_14850\n"
1635 " add.u32 %r6, %r6, %r8;\n"
1636 " add.u64 %rd5, %rd5, %rd6;\n"
1637 " setp.lt.u32 %p3, %r6, %r7;\n"
1638 " @%p3 bra $Lt_15_14850;\n"
1639 " bra.uni $Lt_15_14338;\n"
1640 "$Lt_15_16898:\n"
1641 " mov.f32 %f1, 0f00000000; // 0\n"
/* Store this thread's sum to __smem[tid.x] (rd10 = __smem + tid.x * 4),
   then wait at the barrier. */
1642 "$Lt_15_14338:\n"
1643 " .loc 3 71 0\n"
1644 " mov.u64 %rd7, __smem;\n"
1645 " cvt.u64.u32 %rd8, %r3;\n"
1646 " mul.wide.u32 %rd9, %r3, 4;\n"
1647 " add.u64 %rd10, %rd7, %rd9;\n"
1648 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1649 " .loc 3 72 0\n"
1650 " bar.sync 0;\n"
/* Threads with tid.x <= 31: add the shared values at byte offsets +16,
   +8 and +4 in turn, storing back after each add.  No bar.sync separates
   these steps.
   NOTE(review): every thread that runs this reads up to four elements
   past its own slot; presumably the launch sizes __smem to cover that,
   and the threads run in lockstep -- confirm against the host code. */
1651 " mov.u32 %r9, 31;\n"
1652 " setp.gt.u32 %p4, %r3, %r9;\n"
1653 " @%p4 bra $Lt_15_15874;\n"
1654 " .loc 3 86 0\n"
1655 " ld.volatile.shared.f32 %f4, [%rd10+16];\n"
1656 " add.f32 %f5, %f4, %f1;\n"
1657 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1658 " .loc 3 87 0\n"
1659 " ld.volatile.shared.f32 %f6, [%rd10+8];\n"
1660 " add.f32 %f7, %f6, %f5;\n"
1661 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1662 " .loc 3 88 0\n"
1663 " ld.volatile.shared.f32 %f8, [%rd10+4];\n"
1664 " add.f32 %f9, %f8, %f7;\n"
1665 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
/* Only thread 0 (tid.x == 0) writes: g_odata[ctaid.x] = __smem[0]. */
1666 "$Lt_15_15874:\n"
1667 " .loc 3 195 0\n"
1668 " mov.u32 %r10, 0;\n"
1669 " setp.ne.u32 %p5, %r3, %r10;\n"
1670 " @%p5 bra $Lt_15_16386;\n"
1671 " .loc 3 199 0\n"
1672 " ld.shared.f32 %f10, [__smem+0];\n"
1673 " ld.param.u64 %rd11, [__cudaparm_reduce_float_8_false_g_odata];\n"
1674 " cvt.u64.u32 %rd12, %r1;\n"
1675 " mul.wide.u32 %rd13, %r1, 4;\n"
1676 " add.u64 %rd14, %rd11, %rd13;\n"
1677 " st.global.f32 [%rd14+0], %f10;\n"
1678 "$Lt_15_16386:\n"
1679 " .loc 3 441 0\n"
1680 " exit;\n"
1681 "$LDWend_reduce_float_8_false:\n"
1682 " } // reduce_float_8_false\n"
1683 "\n"
/*
 * PTX entry point reduce_float_16_false(g_idata, g_odata, n).
 *
 * Each thread sums 32-bit floats read from global memory at g_idata,
 * starting at element ctaid.x * 32 + tid.x and advancing by nctaid.x * 32
 * elements per pass.  The second load of each pass is guarded by its own
 * comparison against n.  The per-thread sums are combined through the
 * shared array __smem, and thread 0 stores __smem[0] to g_odata[ctaid.x].
 *
 * NOTE(review): the "16" and "false" suffixes presumably encode the block
 * size and a power-of-two flag from reduce.cu -- confirm against that file.
 */
1684 " .entry reduce_float_16_false (\n"
1685 " .param .u64 __cudaparm_reduce_float_16_false_g_idata,\n"
1686 " .param .u64 __cudaparm_reduce_float_16_false_g_odata,\n"
1687 " .param .u32 __cudaparm_reduce_float_16_false_n)\n"
1688 " {\n"
1689 " .reg .u16 %rh<3>;\n"
1690 " .reg .u32 %r<12>;\n"
1691 " .reg .u64 %rd<16>;\n"
1692 " .reg .f32 %f<14>;\n"
1693 " .reg .pred %p<7>;\n"
1694 " .loc 3 443 0\n"
1695 "$LDWbegin_reduce_float_16_false:\n"
/* r1 = ctaid.x, r3 = tid.x, r4 = r1 * 32 + r3 (first element index).
   If r4 >= n the load loop is skipped and the sum starts at 0. */
1696 " .loc 3 181 0\n"
1697 " cvt.u32.u16 %r1, %ctaid.x;\n"
1698 " mul24.lo.u32 %r2, %r1, 32;\n"
1699 " cvt.u32.u16 %r3, %tid.x;\n"
1700 " add.u32 %r4, %r2, %r3;\n"
1701 " ld.param.u32 %r5, [__cudaparm_reduce_float_16_false_n];\n"
1702 " setp.ge.u32 %p1, %r4, %r5;\n"
1703 " @%p1 bra $Lt_16_16642;\n"
/* r6 = r4 + 16 is the index of the second element of each pass and
   r7 = n + 16 the loop bound for r6; r8 = nctaid.x * 32 is the element
   stride, rd5 the byte address of g_idata[r4], rd6 the stride in bytes. */
1704 " add.u32 %r6, %r4, 16;\n"
1705 " ld.param.u32 %r5, [__cudaparm_reduce_float_16_false_n];\n"
1706 " add.u32 %r7, %r5, 16;\n"
1707 " mov.u16 %rh1, %nctaid.x;\n"
1708 " mul.wide.u16 %r8, %rh1, 32;\n"
1709 " cvt.s64.u32 %rd1, %r8;\n"
1710 " ld.param.u64 %rd2, [__cudaparm_reduce_float_16_false_g_idata];\n"
1711 " cvt.u64.u32 %rd3, %r4;\n"
1712 " mul.wide.u32 %rd4, %r4, 4;\n"
1713 " add.u64 %rd5, %rd2, %rd4;\n"
1714 " mul.wide.u32 %rd6, %r8, 4;\n"
1715 " mov.f32 %f1, 0f00000000; // 0\n"
/* Load loop: always add the value at rd5; add the value 64 bytes
   (16 floats) after it only when r6 < n.  Repeat while the advanced r6
   is below r7. */
1716 "$Lt_16_14594:\n"
1717 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1718 " .loc 3 188 0\n"
1719 " ld.global.f32 %f2, [%rd5+0];\n"
1720 " add.f32 %f1, %f2, %f1;\n"
1721 " .loc 3 181 0\n"
1722 " ld.param.u32 %r5, [__cudaparm_reduce_float_16_false_n];\n"
1723 " .loc 3 188 0\n"
1724 " setp.ge.u32 %p2, %r6, %r5;\n"
1725 " @%p2 bra $Lt_16_14850;\n"
1726 " //<loop> Part of loop body line 181, head labeled $Lt_16_14594\n"
1727 " .loc 3 191 0\n"
1728 " ld.global.f32 %f3, [%rd5+64];\n"
1729 " add.f32 %f1, %f3, %f1;\n"
1730 "$Lt_16_14850:\n"
1731 " //<loop> Part of loop body line 181, head labeled $Lt_16_14594\n"
1732 " add.u32 %r6, %r6, %r8;\n"
1733 " add.u64 %rd5, %rd5, %rd6;\n"
1734 " setp.lt.u32 %p3, %r6, %r7;\n"
1735 " @%p3 bra $Lt_16_14594;\n"
1736 " bra.uni $Lt_16_14082;\n"
1737 "$Lt_16_16642:\n"
1738 " mov.f32 %f1, 0f00000000; // 0\n"
/* Store this thread's sum to __smem[tid.x] (rd10 = __smem + tid.x * 4),
   then wait at the barrier. */
1739 "$Lt_16_14082:\n"
1740 " .loc 3 71 0\n"
1741 " mov.u64 %rd7, __smem;\n"
1742 " cvt.u64.u32 %rd8, %r3;\n"
1743 " mul.wide.u32 %rd9, %r3, 4;\n"
1744 " add.u64 %rd10, %rd7, %rd9;\n"
1745 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1746 " .loc 3 72 0\n"
1747 " bar.sync 0;\n"
/* Threads with tid.x <= 31: add the shared values at byte offsets +32,
   +16, +8 and +4 in turn, storing back after each add.  No bar.sync
   separates these steps.
   NOTE(review): every thread that runs this reads up to eight elements
   past its own slot; presumably the launch sizes __smem to cover that,
   and the threads run in lockstep -- confirm against the host code. */
1748 " mov.u32 %r9, 31;\n"
1749 " setp.gt.u32 %p4, %r3, %r9;\n"
1750 " @%p4 bra $Lt_16_15618;\n"
1751 " .loc 3 85 0\n"
1752 " ld.volatile.shared.f32 %f4, [%rd10+32];\n"
1753 " add.f32 %f5, %f4, %f1;\n"
1754 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1755 " .loc 3 86 0\n"
1756 " ld.volatile.shared.f32 %f6, [%rd10+16];\n"
1757 " add.f32 %f7, %f6, %f5;\n"
1758 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1759 " .loc 3 87 0\n"
1760 " ld.volatile.shared.f32 %f8, [%rd10+8];\n"
1761 " add.f32 %f9, %f8, %f7;\n"
1762 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1763 " .loc 3 88 0\n"
1764 " ld.volatile.shared.f32 %f10, [%rd10+4];\n"
1765 " add.f32 %f11, %f10, %f9;\n"
1766 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
/* Only thread 0 (tid.x == 0) writes: g_odata[ctaid.x] = __smem[0]. */
1767 "$Lt_16_15618:\n"
1768 " .loc 3 195 0\n"
1769 " mov.u32 %r10, 0;\n"
1770 " setp.ne.u32 %p5, %r3, %r10;\n"
1771 " @%p5 bra $Lt_16_16130;\n"
1772 " .loc 3 199 0\n"
1773 " ld.shared.f32 %f12, [__smem+0];\n"
1774 " ld.param.u64 %rd11, [__cudaparm_reduce_float_16_false_g_odata];\n"
1775 " cvt.u64.u32 %rd12, %r1;\n"
1776 " mul.wide.u32 %rd13, %r1, 4;\n"
1777 " add.u64 %rd14, %rd11, %rd13;\n"
1778 " st.global.f32 [%rd14+0], %f12;\n"
1779 "$Lt_16_16130:\n"
1780 " .loc 3 446 0\n"
1781 " exit;\n"
1782 "$LDWend_reduce_float_16_false:\n"
1783 " } // reduce_float_16_false\n"
1784 "\n"
1785 " .entry reduce_float_32_false (\n"
1786 " .param .u64 __cudaparm_reduce_float_32_false_g_idata,\n"
1787 " .param .u64 __cudaparm_reduce_float_32_false_g_odata,\n"
1788 " .param .u32 __cudaparm_reduce_float_32_false_n)\n"
1789 " {\n"
1790 " .reg .u16 %rh<3>;\n"
1791 " .reg .u32 %r<12>;\n"
1792 " .reg .u64 %rd<16>;\n"
1793 " .reg .f32 %f<16>;\n"
1794 " .reg .pred %p<7>;\n"
1795 " .loc 3 448 0\n"
1796 "$LDWbegin_reduce_float_32_false:\n"
1797 " .loc 3 181 0\n"
1798 " cvt.u32.u16 %r1, %ctaid.x;\n"
1799 " mul24.lo.u32 %r2, %r1, 64;\n"
1800 " cvt.u32.u16 %r3, %tid.x;\n"
1801 " add.u32 %r4, %r2, %r3;\n"
1802 " ld.param.u32 %r5, [__cudaparm_reduce_float_32_false_n];\n"
1803 " setp.ge.u32 %p1, %r4, %r5;\n"
1804 " @%p1 bra $Lt_17_16386;\n"
1805 " add.u32 %r6, %r4, 32;\n"
1806 " ld.param.u32 %r5, [__cudaparm_reduce_float_32_false_n];\n"
1807 " add.u32 %r7, %r5, 32;\n"
1808 " mov.u16 %rh1, %nctaid.x;\n"
1809 " mul.wide.u16 %r8, %rh1, 64;\n"
1810 " cvt.s64.u32 %rd1, %r8;\n"
1811 " ld.param.u64 %rd2, [__cudaparm_reduce_float_32_false_g_idata];\n"
1812 " cvt.u64.u32 %rd3, %r4;\n"
1813 " mul.wide.u32 %rd4, %r4, 4;\n"
1814 " add.u64 %rd5, %rd2, %rd4;\n"
1815 " mul.wide.u32 %rd6, %r8, 4;\n"
1816 " mov.f32 %f1, 0f00000000; // 0\n"
1817 "$Lt_17_14338:\n"
1818 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1819 " .loc 3 188 0\n"
1820 " ld.global.f32 %f2, [%rd5+0];\n"
1821 " add.f32 %f1, %f2, %f1;\n"
1822 " .loc 3 181 0\n"
1823 " ld.param.u32 %r5, [__cudaparm_reduce_float_32_false_n];\n"
1824 " .loc 3 188 0\n"
1825 " setp.ge.u32 %p2, %r6, %r5;\n"
1826 " @%p2 bra $Lt_17_14594;\n"
1827 " //<loop> Part of loop body line 181, head labeled $Lt_17_14338\n"
1828 " .loc 3 191 0\n"
1829 " ld.global.f32 %f3, [%rd5+128];\n"
1830 " add.f32 %f1, %f3, %f1;\n"
1831 "$Lt_17_14594:\n"
1832 " //<loop> Part of loop body line 181, head labeled $Lt_17_14338\n"
1833 " add.u32 %r6, %r6, %r8;\n"
1834 " add.u64 %rd5, %rd5, %rd6;\n"
1835 " setp.lt.u32 %p3, %r6, %r7;\n"
1836 " @%p3 bra $Lt_17_14338;\n"
1837 " bra.uni $Lt_17_13826;\n"
1838 "$Lt_17_16386:\n"
1839 " mov.f32 %f1, 0f00000000; // 0\n"
1840 "$Lt_17_13826:\n"
1841 " .loc 3 71 0\n"
1842 " mov.u64 %rd7, __smem;\n"
1843 " cvt.u64.u32 %rd8, %r3;\n"
1844 " mul.wide.u32 %rd9, %r3, 4;\n"
1845 " add.u64 %rd10, %rd7, %rd9;\n"
1846 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1847 " .loc 3 72 0\n"
1848 " bar.sync 0;\n"
1849 " mov.u32 %r9, 31;\n"
1850 " setp.gt.u32 %p4, %r3, %r9;\n"
1851 " @%p4 bra $Lt_17_15362;\n"
1852 " .loc 3 84 0\n"
1853 " ld.volatile.shared.f32 %f4, [%rd10+64];\n"
1854 " add.f32 %f5, %f4, %f1;\n"
1855 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1856 " .loc 3 85 0\n"
1857 " ld.volatile.shared.f32 %f6, [%rd10+32];\n"
1858 " add.f32 %f7, %f6, %f5;\n"
1859 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1860 " .loc 3 86 0\n"
1861 " ld.volatile.shared.f32 %f8, [%rd10+16];\n"
1862 " add.f32 %f9, %f8, %f7;\n"
1863 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1864 " .loc 3 87 0\n"
1865 " ld.volatile.shared.f32 %f10, [%rd10+8];\n"
1866 " add.f32 %f11, %f10, %f9;\n"
1867 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1868 " .loc 3 88 0\n"
1869 " ld.volatile.shared.f32 %f12, [%rd10+4];\n"
1870 " add.f32 %f13, %f12, %f11;\n"
1871 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1872 "$Lt_17_15362:\n"
1873 " .loc 3 195 0\n"
1874 " mov.u32 %r10, 0;\n"
1875 " setp.ne.u32 %p5, %r3, %r10;\n"
1876 " @%p5 bra $Lt_17_15874;\n"
1877 " .loc 3 199 0\n"
1878 " ld.shared.f32 %f14, [__smem+0];\n"
1879 " ld.param.u64 %rd11, [__cudaparm_reduce_float_32_false_g_odata];\n"
1880 " cvt.u64.u32 %rd12, %r1;\n"
1881 " mul.wide.u32 %rd13, %r1, 4;\n"
1882 " add.u64 %rd14, %rd11, %rd13;\n"
1883 " st.global.f32 [%rd14+0], %f14;\n"
1884 "$Lt_17_15874:\n"
1885 " .loc 3 451 0\n"
1886 " exit;\n"
1887 "$LDWend_reduce_float_32_false:\n"
1888 " } // reduce_float_32_false\n"
1889 "\n"
1890 " .entry reduce_float_64_false (\n"
1891 " .param .u64 __cudaparm_reduce_float_64_false_g_idata,\n"
1892 " .param .u64 __cudaparm_reduce_float_64_false_g_odata,\n"
1893 " .param .u32 __cudaparm_reduce_float_64_false_n)\n"
1894 " {\n"
1895 " .reg .u16 %rh<3>;\n"
1896 " .reg .u32 %r<12>;\n"
1897 " .reg .u64 %rd<16>;\n"
1898 " .reg .f32 %f<18>;\n"
1899 " .reg .pred %p<7>;\n"
1900 " .loc 3 453 0\n"
1901 "$LDWbegin_reduce_float_64_false:\n"
1902 " .loc 3 181 0\n"
1903 " cvt.u32.u16 %r1, %ctaid.x;\n"
1904 " mul24.lo.u32 %r2, %r1, 128;\n"
1905 " cvt.u32.u16 %r3, %tid.x;\n"
1906 " add.u32 %r4, %r2, %r3;\n"
1907 " ld.param.u32 %r5, [__cudaparm_reduce_float_64_false_n];\n"
1908 " setp.ge.u32 %p1, %r4, %r5;\n"
1909 " @%p1 bra $Lt_18_16130;\n"
1910 " add.u32 %r6, %r4, 64;\n"
1911 " ld.param.u32 %r5, [__cudaparm_reduce_float_64_false_n];\n"
1912 " add.u32 %r7, %r5, 64;\n"
1913 " mov.u16 %rh1, %nctaid.x;\n"
1914 " mul.wide.u16 %r8, %rh1, 128;\n"
1915 " cvt.s64.u32 %rd1, %r8;\n"
1916 " ld.param.u64 %rd2, [__cudaparm_reduce_float_64_false_g_idata];\n"
1917 " cvt.u64.u32 %rd3, %r4;\n"
1918 " mul.wide.u32 %rd4, %r4, 4;\n"
1919 " add.u64 %rd5, %rd2, %rd4;\n"
1920 " mul.wide.u32 %rd6, %r8, 4;\n"
1921 " mov.f32 %f1, 0f00000000; // 0\n"
1922 "$Lt_18_14082:\n"
1923 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1924 " .loc 3 188 0\n"
1925 " ld.global.f32 %f2, [%rd5+0];\n"
1926 " add.f32 %f1, %f2, %f1;\n"
1927 " .loc 3 181 0\n"
1928 " ld.param.u32 %r5, [__cudaparm_reduce_float_64_false_n];\n"
1929 " .loc 3 188 0\n"
1930 " setp.ge.u32 %p2, %r6, %r5;\n"
1931 " @%p2 bra $Lt_18_14338;\n"
1932 " //<loop> Part of loop body line 181, head labeled $Lt_18_14082\n"
1933 " .loc 3 191 0\n"
1934 " ld.global.f32 %f3, [%rd5+256];\n"
1935 " add.f32 %f1, %f3, %f1;\n"
1936 "$Lt_18_14338:\n"
1937 " //<loop> Part of loop body line 181, head labeled $Lt_18_14082\n"
1938 " add.u32 %r6, %r6, %r8;\n"
1939 " add.u64 %rd5, %rd5, %rd6;\n"
1940 " setp.lt.u32 %p3, %r6, %r7;\n"
1941 " @%p3 bra $Lt_18_14082;\n"
1942 " bra.uni $Lt_18_13570;\n"
1943 "$Lt_18_16130:\n"
1944 " mov.f32 %f1, 0f00000000; // 0\n"
1945 "$Lt_18_13570:\n"
1946 " .loc 3 71 0\n"
1947 " mov.u64 %rd7, __smem;\n"
1948 " cvt.u64.u32 %rd8, %r3;\n"
1949 " mul.wide.u32 %rd9, %r3, 4;\n"
1950 " add.u64 %rd10, %rd7, %rd9;\n"
1951 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1952 " .loc 3 72 0\n"
1953 " bar.sync 0;\n"
1954 " mov.u32 %r9, 31;\n"
1955 " setp.gt.u32 %p4, %r3, %r9;\n"
1956 " @%p4 bra $Lt_18_15106;\n"
1957 " .loc 3 83 0\n"
1958 " ld.volatile.shared.f32 %f4, [%rd10+128];\n"
1959 " add.f32 %f5, %f4, %f1;\n"
1960 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1961 " .loc 3 84 0\n"
1962 " ld.volatile.shared.f32 %f6, [%rd10+64];\n"
1963 " add.f32 %f7, %f6, %f5;\n"
1964 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1965 " .loc 3 85 0\n"
1966 " ld.volatile.shared.f32 %f8, [%rd10+32];\n"
1967 " add.f32 %f9, %f8, %f7;\n"
1968 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1969 " .loc 3 86 0\n"
1970 " ld.volatile.shared.f32 %f10, [%rd10+16];\n"
1971 " add.f32 %f11, %f10, %f9;\n"
1972 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1973 " .loc 3 87 0\n"
1974 " ld.volatile.shared.f32 %f12, [%rd10+8];\n"
1975 " add.f32 %f13, %f12, %f11;\n"
1976 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1977 " .loc 3 88 0\n"
1978 " ld.volatile.shared.f32 %f14, [%rd10+4];\n"
1979 " add.f32 %f15, %f14, %f13;\n"
1980 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
1981 "$Lt_18_15106:\n"
1982 " .loc 3 195 0\n"
1983 " mov.u32 %r10, 0;\n"
1984 " setp.ne.u32 %p5, %r3, %r10;\n"
1985 " @%p5 bra $Lt_18_15618;\n"
1986 " .loc 3 199 0\n"
1987 " ld.shared.f32 %f16, [__smem+0];\n"
1988 " ld.param.u64 %rd11, [__cudaparm_reduce_float_64_false_g_odata];\n"
1989 " cvt.u64.u32 %rd12, %r1;\n"
1990 " mul.wide.u32 %rd13, %r1, 4;\n"
1991 " add.u64 %rd14, %rd11, %rd13;\n"
1992 " st.global.f32 [%rd14+0], %f16;\n"
1993 "$Lt_18_15618:\n"
1994 " .loc 3 456 0\n"
1995 " exit;\n"
1996 "$LDWend_reduce_float_64_false:\n"
1997 " } // reduce_float_64_false\n"
1998 "\n"
1999 " .entry reduce_float_128_false (\n"
2000 " .param .u64 __cudaparm_reduce_float_128_false_g_idata,\n"
2001 " .param .u64 __cudaparm_reduce_float_128_false_g_odata,\n"
2002 " .param .u32 __cudaparm_reduce_float_128_false_n)\n"
2003 " {\n"
2004 " .reg .u16 %rh<3>;\n"
2005 " .reg .u32 %r<13>;\n"
2006 " .reg .u64 %rd<16>;\n"
2007 " .reg .f32 %f<20>;\n"
2008 " .reg .pred %p<8>;\n"
2009 " .loc 3 458 0\n"
2010 "$LDWbegin_reduce_float_128_false:\n"
2011 " .loc 3 181 0\n"
2012 " cvt.u32.u16 %r1, %ctaid.x;\n"
2013 " mul.lo.u32 %r2, %r1, 256;\n"
2014 " cvt.u32.u16 %r3, %tid.x;\n"
2015 " add.u32 %r4, %r2, %r3;\n"
2016 " ld.param.u32 %r5, [__cudaparm_reduce_float_128_false_n];\n"
2017 " setp.ge.u32 %p1, %r4, %r5;\n"
2018 " @%p1 bra $Lt_19_16386;\n"
2019 " add.u32 %r6, %r4, 128;\n"
2020 " ld.param.u32 %r5, [__cudaparm_reduce_float_128_false_n];\n"
2021 " add.u32 %r7, %r5, 128;\n"
2022 " mov.u16 %rh1, %nctaid.x;\n"
2023 " mul.wide.u16 %r8, %rh1, 256;\n"
2024 " cvt.s64.u32 %rd1, %r8;\n"
2025 " ld.param.u64 %rd2, [__cudaparm_reduce_float_128_false_g_idata];\n"
2026 " cvt.u64.u32 %rd3, %r4;\n"
2027 " mul.wide.u32 %rd4, %r4, 4;\n"
2028 " add.u64 %rd5, %rd2, %rd4;\n"
2029 " mul.wide.u32 %rd6, %r8, 4;\n"
2030 " mov.f32 %f1, 0f00000000; // 0\n"
2031 "$Lt_19_13826:\n"
2032 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2033 " .loc 3 188 0\n"
2034 " ld.global.f32 %f2, [%rd5+0];\n"
2035 " add.f32 %f1, %f2, %f1;\n"
2036 " .loc 3 181 0\n"
2037 " ld.param.u32 %r5, [__cudaparm_reduce_float_128_false_n];\n"
2038 " .loc 3 188 0\n"
2039 " setp.ge.u32 %p2, %r6, %r5;\n"
2040 " @%p2 bra $Lt_19_14082;\n"
2041 " //<loop> Part of loop body line 181, head labeled $Lt_19_13826\n"
2042 " .loc 3 191 0\n"
2043 " ld.global.f32 %f3, [%rd5+512];\n"
2044 " add.f32 %f1, %f3, %f1;\n"
2045 "$Lt_19_14082:\n"
2046 " //<loop> Part of loop body line 181, head labeled $Lt_19_13826\n"
2047 " add.u32 %r6, %r6, %r8;\n"
2048 " add.u64 %rd5, %rd5, %rd6;\n"
2049 " setp.lt.u32 %p3, %r6, %r7;\n"
2050 " @%p3 bra $Lt_19_13826;\n"
2051 " bra.uni $Lt_19_13314;\n"
2052 "$Lt_19_16386:\n"
2053 " mov.f32 %f1, 0f00000000; // 0\n"
2054 "$Lt_19_13314:\n"
2055 " .loc 3 195 0\n"
2056 " mov.f32 %f4, %f1;\n"
2057 " mov.f32 %f5, %f4;\n"
2058 " .loc 3 71 0\n"
2059 " mov.u64 %rd7, __smem;\n"
2060 " cvt.u64.u32 %rd8, %r3;\n"
2061 " mul.wide.u32 %rd9, %r3, 4;\n"
2062 " add.u64 %rd10, %rd7, %rd9;\n"
2063 " st.volatile.shared.f32 [%rd10+0], %f4;\n"
2064 " .loc 3 72 0\n"
2065 " bar.sync 0;\n"
2066 " mov.u32 %r9, 63;\n"
2067 " setp.gt.u32 %p4, %r3, %r9;\n"
2068 " @%p4 bra $Lt_19_14850;\n"
2069 " .loc 3 77 0\n"
2070 " ld.volatile.shared.f32 %f6, [%rd10+256];\n"
2071 " add.f32 %f5, %f6, %f4;\n"
2072 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2073 "$Lt_19_14850:\n"
2074 " bar.sync 0;\n"
2075 " mov.u32 %r10, 31;\n"
2076 " setp.gt.u32 %p5, %r3, %r10;\n"
2077 " @%p5 bra $Lt_19_15362;\n"
2078 " .loc 3 83 0\n"
2079 " ld.volatile.shared.f32 %f7, [%rd10+128];\n"
2080 " add.f32 %f8, %f7, %f5;\n"
2081 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
2082 " .loc 3 84 0\n"
2083 " ld.volatile.shared.f32 %f9, [%rd10+64];\n"
2084 " add.f32 %f10, %f9, %f8;\n"
2085 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
2086 " .loc 3 85 0\n"
2087 " ld.volatile.shared.f32 %f11, [%rd10+32];\n"
2088 " add.f32 %f12, %f11, %f10;\n"
2089 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
2090 " .loc 3 86 0\n"
2091 " ld.volatile.shared.f32 %f13, [%rd10+16];\n"
2092 " add.f32 %f14, %f13, %f12;\n"
2093 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
2094 " .loc 3 87 0\n"
2095 " ld.volatile.shared.f32 %f15, [%rd10+8];\n"
2096 " add.f32 %f16, %f15, %f14;\n"
2097 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
2098 " .loc 3 88 0\n"
2099 " ld.volatile.shared.f32 %f17, [%rd10+4];\n"
2100 " add.f32 %f5, %f17, %f16;\n"
2101 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2102 "$Lt_19_15362:\n"
2103 " .loc 3 195 0\n"
2104 " mov.u32 %r11, 0;\n"
2105 " setp.ne.u32 %p6, %r3, %r11;\n"
2106 " @%p6 bra $Lt_19_15874;\n"
2107 " .loc 3 199 0\n"
2108 " ld.shared.f32 %f18, [__smem+0];\n"
2109 " ld.param.u64 %rd11, [__cudaparm_reduce_float_128_false_g_odata];\n"
2110 " cvt.u64.u32 %rd12, %r1;\n"
2111 " mul.wide.u32 %rd13, %r1, 4;\n"
2112 " add.u64 %rd14, %rd11, %rd13;\n"
2113 " st.global.f32 [%rd14+0], %f18;\n"
2114 "$Lt_19_15874:\n"
2115 " .loc 3 461 0\n"
2116 " exit;\n"
2117 "$LDWend_reduce_float_128_false:\n"
2118 " } // reduce_float_128_false\n"
2119 "\n"
2120 " .entry reduce_float_256_false (\n"
2121 " .param .u64 __cudaparm_reduce_float_256_false_g_idata,\n"
2122 " .param .u64 __cudaparm_reduce_float_256_false_g_odata,\n"
2123 " .param .u32 __cudaparm_reduce_float_256_false_n)\n"
2124 " {\n"
2125 " .reg .u16 %rh<3>;\n"
2126 " .reg .u32 %r<14>;\n"
2127 " .reg .u64 %rd<16>;\n"
2128 " .reg .f32 %f<21>;\n"
2129 " .reg .pred %p<9>;\n"
2130 " .loc 3 463 0\n"
2131 "$LDWbegin_reduce_float_256_false:\n"
2132 " .loc 3 181 0\n"
2133 " cvt.u32.u16 %r1, %ctaid.x;\n"
2134 " mul.lo.u32 %r2, %r1, 512;\n"
2135 " cvt.u32.u16 %r3, %tid.x;\n"
2136 " add.u32 %r4, %r2, %r3;\n"
2137 " ld.param.u32 %r5, [__cudaparm_reduce_float_256_false_n];\n"
2138 " setp.ge.u32 %p1, %r4, %r5;\n"
2139 " @%p1 bra $Lt_20_16642;\n"
2140 " add.u32 %r6, %r4, 256;\n"
2141 " ld.param.u32 %r5, [__cudaparm_reduce_float_256_false_n];\n"
2142 " add.u32 %r7, %r5, 256;\n"
2143 " mov.u16 %rh1, %nctaid.x;\n"
2144 " mul.wide.u16 %r8, %rh1, 512;\n"
2145 " cvt.s64.u32 %rd1, %r8;\n"
2146 " ld.param.u64 %rd2, [__cudaparm_reduce_float_256_false_g_idata];\n"
2147 " cvt.u64.u32 %rd3, %r4;\n"
2148 " mul.wide.u32 %rd4, %r4, 4;\n"
2149 " add.u64 %rd5, %rd2, %rd4;\n"
2150 " mul.wide.u32 %rd6, %r8, 4;\n"
2151 " mov.f32 %f1, 0f00000000; // 0\n"
2152 "$Lt_20_13570:\n"
2153 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2154 " .loc 3 188 0\n"
2155 " ld.global.f32 %f2, [%rd5+0];\n"
2156 " add.f32 %f1, %f2, %f1;\n"
2157 " .loc 3 181 0\n"
2158 " ld.param.u32 %r5, [__cudaparm_reduce_float_256_false_n];\n"
2159 " .loc 3 188 0\n"
2160 " setp.ge.u32 %p2, %r6, %r5;\n"
2161 " @%p2 bra $Lt_20_13826;\n"
2162 " //<loop> Part of loop body line 181, head labeled $Lt_20_13570\n"
2163 " .loc 3 191 0\n"
2164 " ld.global.f32 %f3, [%rd5+1024];\n"
2165 " add.f32 %f1, %f3, %f1;\n"
2166 "$Lt_20_13826:\n"
2167 " //<loop> Part of loop body line 181, head labeled $Lt_20_13570\n"
2168 " add.u32 %r6, %r6, %r8;\n"
2169 " add.u64 %rd5, %rd5, %rd6;\n"
2170 " setp.lt.u32 %p3, %r6, %r7;\n"
2171 " @%p3 bra $Lt_20_13570;\n"
2172 " bra.uni $Lt_20_13058;\n"
2173 "$Lt_20_16642:\n"
2174 " mov.f32 %f1, 0f00000000; // 0\n"
2175 "$Lt_20_13058:\n"
2176 " .loc 3 195 0\n"
2177 " mov.f32 %f4, %f1;\n"
2178 " mov.f32 %f5, %f4;\n"
2179 " .loc 3 71 0\n"
2180 " mov.u64 %rd7, __smem;\n"
2181 " cvt.u64.u32 %rd8, %r3;\n"
2182 " mul.wide.u32 %rd9, %r3, 4;\n"
2183 " add.u64 %rd10, %rd7, %rd9;\n"
2184 " st.volatile.shared.f32 [%rd10+0], %f4;\n"
2185 " .loc 3 72 0\n"
2186 " bar.sync 0;\n"
2187 " mov.u32 %r9, 127;\n"
2188 " setp.gt.u32 %p4, %r3, %r9;\n"
2189 " @%p4 bra $Lt_20_14594;\n"
2190 " .loc 3 76 0\n"
2191 " ld.volatile.shared.f32 %f6, [%rd10+512];\n"
2192 " add.f32 %f5, %f6, %f4;\n"
2193 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2194 "$Lt_20_14594:\n"
2195 " bar.sync 0;\n"
2196 " mov.u32 %r10, 63;\n"
2197 " setp.gt.u32 %p5, %r3, %r10;\n"
2198 " @%p5 bra $Lt_20_15106;\n"
2199 " .loc 3 77 0\n"
2200 " ld.volatile.shared.f32 %f7, [%rd10+256];\n"
2201 " add.f32 %f5, %f7, %f5;\n"
2202 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2203 "$Lt_20_15106:\n"
2204 " bar.sync 0;\n"
2205 " mov.u32 %r11, 31;\n"
2206 " setp.gt.u32 %p6, %r3, %r11;\n"
2207 " @%p6 bra $Lt_20_15618;\n"
2208 " .loc 3 83 0\n"
2209 " ld.volatile.shared.f32 %f8, [%rd10+128];\n"
2210 " add.f32 %f9, %f8, %f5;\n"
2211 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
2212 " .loc 3 84 0\n"
2213 " ld.volatile.shared.f32 %f10, [%rd10+64];\n"
2214 " add.f32 %f11, %f10, %f9;\n"
2215 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
2216 " .loc 3 85 0\n"
2217 " ld.volatile.shared.f32 %f12, [%rd10+32];\n"
2218 " add.f32 %f13, %f12, %f11;\n"
2219 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
2220 " .loc 3 86 0\n"
2221 " ld.volatile.shared.f32 %f14, [%rd10+16];\n"
2222 " add.f32 %f15, %f14, %f13;\n"
2223 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
2224 " .loc 3 87 0\n"
2225 " ld.volatile.shared.f32 %f16, [%rd10+8];\n"
2226 " add.f32 %f17, %f16, %f15;\n"
2227 " st.volatile.shared.f32 [%rd10+0], %f17;\n"
2228 " .loc 3 88 0\n"
2229 " ld.volatile.shared.f32 %f18, [%rd10+4];\n"
2230 " add.f32 %f5, %f18, %f17;\n"
2231 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2232 "$Lt_20_15618:\n"
2233 " .loc 3 195 0\n"
2234 " mov.u32 %r12, 0;\n"
2235 " setp.ne.u32 %p7, %r3, %r12;\n"
2236 " @%p7 bra $Lt_20_16130;\n"
2237 " .loc 3 199 0\n"
2238 " ld.shared.f32 %f19, [__smem+0];\n"
2239 " ld.param.u64 %rd11, [__cudaparm_reduce_float_256_false_g_odata];\n"
2240 " cvt.u64.u32 %rd12, %r1;\n"
2241 " mul.wide.u32 %rd13, %r1, 4;\n"
2242 " add.u64 %rd14, %rd11, %rd13;\n"
2243 " st.global.f32 [%rd14+0], %f19;\n"
2244 "$Lt_20_16130:\n"
2245 " .loc 3 466 0\n"
2246 " exit;\n"
2247 "$LDWend_reduce_float_256_false:\n"
2248 " } // reduce_float_256_false\n"
2249 "\n"
2250 " .entry reduce_float_512_false (\n"
2251 " .param .u64 __cudaparm_reduce_float_512_false_g_idata,\n"
2252 " .param .u64 __cudaparm_reduce_float_512_false_g_odata,\n"
2253 " .param .u32 __cudaparm_reduce_float_512_false_n)\n"
2254 " {\n"
2255 " .reg .u16 %rh<3>;\n"
2256 " .reg .u32 %r<15>;\n"
2257 " .reg .u64 %rd<16>;\n"
2258 " .reg .f32 %f<22>;\n"
2259 " .reg .pred %p<10>;\n"
2260 " .loc 3 468 0\n"
2261 "$LDWbegin_reduce_float_512_false:\n"
2262 " .loc 3 181 0\n"
2263 " cvt.u32.u16 %r1, %ctaid.x;\n"
2264 " mul.lo.u32 %r2, %r1, 1024;\n"
2265 " cvt.u32.u16 %r3, %tid.x;\n"
2266 " add.u32 %r4, %r2, %r3;\n"
2267 " ld.param.u32 %r5, [__cudaparm_reduce_float_512_false_n];\n"
2268 " setp.ge.u32 %p1, %r4, %r5;\n"
2269 " @%p1 bra $Lt_21_16898;\n"
2270 " add.u32 %r6, %r4, 512;\n"
2271 " ld.param.u32 %r5, [__cudaparm_reduce_float_512_false_n];\n"
2272 " add.u32 %r7, %r5, 512;\n"
2273 " mov.u16 %rh1, %nctaid.x;\n"
2274 " mul.wide.u16 %r8, %rh1, 1024;\n"
2275 " cvt.s64.u32 %rd1, %r8;\n"
2276 " ld.param.u64 %rd2, [__cudaparm_reduce_float_512_false_g_idata];\n"
2277 " cvt.u64.u32 %rd3, %r4;\n"
2278 " mul.wide.u32 %rd4, %r4, 4;\n"
2279 " add.u64 %rd5, %rd2, %rd4;\n"
2280 " mul.wide.u32 %rd6, %r8, 4;\n"
2281 " mov.f32 %f1, 0f00000000; // 0\n"
2282 "$Lt_21_13314:\n"
2283 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2284 " .loc 3 188 0\n"
2285 " ld.global.f32 %f2, [%rd5+0];\n"
2286 " add.f32 %f1, %f2, %f1;\n"
2287 " .loc 3 181 0\n"
2288 " ld.param.u32 %r5, [__cudaparm_reduce_float_512_false_n];\n"
2289 " .loc 3 188 0\n"
2290 " setp.ge.u32 %p2, %r6, %r5;\n"
2291 " @%p2 bra $Lt_21_13570;\n"
2292 " //<loop> Part of loop body line 181, head labeled $Lt_21_13314\n"
2293 " .loc 3 191 0\n"
2294 " ld.global.f32 %f3, [%rd5+2048];\n"
2295 " add.f32 %f1, %f3, %f1;\n"
2296 "$Lt_21_13570:\n"
2297 " //<loop> Part of loop body line 181, head labeled $Lt_21_13314\n"
2298 " add.u32 %r6, %r6, %r8;\n"
2299 " add.u64 %rd5, %rd5, %rd6;\n"
2300 " setp.lt.u32 %p3, %r6, %r7;\n"
2301 " @%p3 bra $Lt_21_13314;\n"
2302 " bra.uni $Lt_21_12802;\n"
2303 "$Lt_21_16898:\n"
2304 " mov.f32 %f1, 0f00000000; // 0\n"
2305 "$Lt_21_12802:\n"
2306 " .loc 3 195 0\n"
2307 " mov.f32 %f4, %f1;\n"
2308 " mov.f32 %f5, %f4;\n"
2309 " .loc 3 71 0\n"
2310 " mov.u64 %rd7, __smem;\n"
2311 " cvt.u64.u32 %rd8, %r3;\n"
2312 " mul.wide.u32 %rd9, %r3, 4;\n"
2313 " add.u64 %rd10, %rd7, %rd9;\n"
2314 " st.volatile.shared.f32 [%rd10+0], %f4;\n"
2315 " .loc 3 72 0\n"
2316 " bar.sync 0;\n"
2317 " mov.u32 %r9, 255;\n"
2318 " setp.gt.u32 %p4, %r3, %r9;\n"
2319 " @%p4 bra $Lt_21_14338;\n"
2320 " .loc 3 75 0\n"
2321 " ld.volatile.shared.f32 %f6, [%rd10+1024];\n"
2322 " add.f32 %f5, %f6, %f4;\n"
2323 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2324 "$Lt_21_14338:\n"
2325 " bar.sync 0;\n"
2326 " mov.u32 %r10, 127;\n"
2327 " setp.gt.u32 %p5, %r3, %r10;\n"
2328 " @%p5 bra $Lt_21_14850;\n"
2329 " .loc 3 76 0\n"
2330 " ld.volatile.shared.f32 %f7, [%rd10+512];\n"
2331 " add.f32 %f5, %f7, %f5;\n"
2332 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2333 "$Lt_21_14850:\n"
2334 " bar.sync 0;\n"
2335 " mov.u32 %r11, 63;\n"
2336 " setp.gt.u32 %p6, %r3, %r11;\n"
2337 " @%p6 bra $Lt_21_15362;\n"
2338 " .loc 3 77 0\n"
2339 " ld.volatile.shared.f32 %f8, [%rd10+256];\n"
2340 " add.f32 %f5, %f8, %f5;\n"
2341 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2342 "$Lt_21_15362:\n"
2343 " bar.sync 0;\n"
2344 " mov.u32 %r12, 31;\n"
2345 " setp.gt.u32 %p7, %r3, %r12;\n"
2346 " @%p7 bra $Lt_21_15874;\n"
2347 " .loc 3 83 0\n"
2348 " ld.volatile.shared.f32 %f9, [%rd10+128];\n"
2349 " add.f32 %f10, %f9, %f5;\n"
2350 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
2351 " .loc 3 84 0\n"
2352 " ld.volatile.shared.f32 %f11, [%rd10+64];\n"
2353 " add.f32 %f12, %f11, %f10;\n"
2354 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
2355 " .loc 3 85 0\n"
2356 " ld.volatile.shared.f32 %f13, [%rd10+32];\n"
2357 " add.f32 %f14, %f13, %f12;\n"
2358 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
2359 " .loc 3 86 0\n"
2360 " ld.volatile.shared.f32 %f15, [%rd10+16];\n"
2361 " add.f32 %f16, %f15, %f14;\n"
2362 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
2363 " .loc 3 87 0\n"
2364 " ld.volatile.shared.f32 %f17, [%rd10+8];\n"
2365 " add.f32 %f18, %f17, %f16;\n"
2366 " st.volatile.shared.f32 [%rd10+0], %f18;\n"
2367 " .loc 3 88 0\n"
2368 " ld.volatile.shared.f32 %f19, [%rd10+4];\n"
2369 " add.f32 %f5, %f19, %f18;\n"
2370 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2371 "$Lt_21_15874:\n"
2372 " .loc 3 195 0\n"
2373 " mov.u32 %r13, 0;\n"
2374 " setp.ne.u32 %p8, %r3, %r13;\n"
2375 " @%p8 bra $Lt_21_16386;\n"
2376 " .loc 3 199 0\n"
2377 " ld.shared.f32 %f20, [__smem+0];\n"
2378 " ld.param.u64 %rd11, [__cudaparm_reduce_float_512_false_g_odata];\n"
2379 " cvt.u64.u32 %rd12, %r1;\n"
2380 " mul.wide.u32 %rd13, %r1, 4;\n"
2381 " add.u64 %rd14, %rd11, %rd13;\n"
2382 " st.global.f32 [%rd14+0], %f20;\n"
2383 "$Lt_21_16386:\n"
2384 " .loc 3 471 0\n"
2385 " exit;\n"
2386 "$LDWend_reduce_float_512_false:\n"
2387 " } // reduce_float_512_false\n"
2388 "\n"
2389 " .entry tex_reduce_256_false (\n"
2390 " .param .u64 __cudaparm_tex_reduce_256_false_g_odata,\n"
2391 " .param .u32 __cudaparm_tex_reduce_256_false_n,\n"
2392 " .param .u32 __cudaparm_tex_reduce_256_false_stride)\n"
2393 " {\n"
2394 " .reg .u16 %rh<3>;\n"
2395 " .reg .u32 %r<19>;\n"
2396 " .reg .u64 %rd<10>;\n"
2397 " .reg .f32 %f<37>;\n"
2398 " .reg .pred %p<9>;\n"
2399 " .loc 3 477 0\n"
2400 "$LDWbegin_tex_reduce_256_false:\n"
2401 " .loc 3 240 0\n"
2402 " cvt.u32.u16 %r1, %ctaid.x;\n"
2403 " mul.lo.u32 %r2, %r1, 512;\n"
2404 " cvt.u32.u16 %r3, %tid.x;\n"
2405 " add.u32 %r4, %r2, %r3;\n"
2406 " mov.s32 %r5, %r4;\n"
2407 " ld.param.u32 %r6, [__cudaparm_tex_reduce_256_false_n];\n"
2408 " setp.ge.u32 %p1, %r4, %r6;\n"
2409 " @%p1 bra $Lt_22_16642;\n"
2410 " mov.u16 %rh1, %nctaid.x;\n"
2411 " mul.wide.u16 %r7, %rh1, 512;\n"
2412 " ld.param.u32 %r8, [__cudaparm_tex_reduce_256_false_stride];\n"
2413 " mov.f32 %f1, 0f00000000; // 0\n"
2414 "$Lt_22_13570:\n"
2415 " //<loop> Loop body line 240, nesting depth: 1, estimated iterations: unknown\n"
2416 " ld.param.u32 %r8, [__cudaparm_tex_reduce_256_false_stride];\n"
2417 " rem.u32 %r9, %r5, %r8;\n"
2418 " cvt.rn.f32.u32 %f2, %r9;\n"
2419 " div.u32 %r10, %r5, %r8;\n"
2420 " cvt.rn.f32.u32 %f3, %r10;\n"
2421 " mov.f32 %f4, 0f00000000; // 0\n"
2422 " mov.f32 %f5, 0f00000000; // 0\n"
2423 " tex.2d.v4.f32.f32 {%f6,%f7,%f8,%f9},[tex_ref_1,{%f2,%f3,%f4,%f5}];\n"
2424 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2425 " .loc 3 248 0\n"
2426 " mov.f32 %f10, %f6;\n"
2427 " add.f32 %f1, %f10, %f1;\n"
2428 " add.u32 %r11, %r5, 256;\n"
2429 " .loc 3 240 0\n"
2430 " ld.param.u32 %r6, [__cudaparm_tex_reduce_256_false_n];\n"
2431 " .loc 3 248 0\n"
2432 " setp.ge.u32 %p2, %r11, %r6;\n"
2433 " @%p2 bra $Lt_22_13826;\n"
2434 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2435 " .loc 3 240 0\n"
2436 " ld.param.u32 %r8, [__cudaparm_tex_reduce_256_false_stride];\n"
2437 " .loc 3 248 0\n"
2438 " rem.u32 %r12, %r11, %r8;\n"
2439 " cvt.rn.f32.u32 %f11, %r12;\n"
2440 " div.u32 %r13, %r11, %r8;\n"
2441 " cvt.rn.f32.u32 %f12, %r13;\n"
2442 " mov.f32 %f13, 0f00000000; // 0\n"
2443 " mov.f32 %f14, 0f00000000; // 0\n"
2444 " tex.2d.v4.f32.f32 {%f15,%f16,%f17,%f18},[tex_ref_1,{%f11,%f12,%f13,%f14}];\n"
2445 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2446 " .loc 3 253 0\n"
2447 " mov.f32 %f19, %f15;\n"
2448 " add.f32 %f1, %f19, %f1;\n"
2449 "$Lt_22_13826:\n"
2450 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2451 " add.u32 %r5, %r7, %r5;\n"
2452 " .loc 3 240 0\n"
2453 " ld.param.u32 %r6, [__cudaparm_tex_reduce_256_false_n];\n"
2454 " .loc 3 253 0\n"
2455 " setp.lt.u32 %p3, %r5, %r6;\n"
2456 " @%p3 bra $Lt_22_13570;\n"
2457 " bra.uni $Lt_22_13058;\n"
2458 "$Lt_22_16642:\n"
2459 " mov.f32 %f1, 0f00000000; // 0\n"
2460 "$Lt_22_13058:\n"
2461 " .loc 3 258 0\n"
2462 " mov.f32 %f20, %f1;\n"
2463 " mov.f32 %f21, %f20;\n"
2464 " .loc 3 71 0\n"
2465 " mov.u64 %rd1, __smem;\n"
2466 " cvt.u64.u32 %rd2, %r3;\n"
2467 " mul.wide.u32 %rd3, %r3, 4;\n"
2468 " add.u64 %rd4, %rd1, %rd3;\n"
2469 " st.volatile.shared.f32 [%rd4+0], %f20;\n"
2470 " .loc 3 72 0\n"
2471 " bar.sync 0;\n"
2472 " mov.u32 %r14, 127;\n"
2473 " setp.gt.u32 %p4, %r3, %r14;\n"
2474 " @%p4 bra $Lt_22_14594;\n"
2475 " .loc 3 76 0\n"
2476 " ld.volatile.shared.f32 %f22, [%rd4+512];\n"
2477 " add.f32 %f21, %f22, %f20;\n"
2478 " st.volatile.shared.f32 [%rd4+0], %f21;\n"
2479 "$Lt_22_14594:\n"
2480 " bar.sync 0;\n"
2481 " mov.u32 %r15, 63;\n"
2482 " setp.gt.u32 %p5, %r3, %r15;\n"
2483 " @%p5 bra $Lt_22_15106;\n"
2484 " .loc 3 77 0\n"
2485 " ld.volatile.shared.f32 %f23, [%rd4+256];\n"
2486 " add.f32 %f21, %f23, %f21;\n"
2487 " st.volatile.shared.f32 [%rd4+0], %f21;\n"
2488 "$Lt_22_15106:\n"
2489 " bar.sync 0;\n"
2490 " mov.u32 %r16, 31;\n"
2491 " setp.gt.u32 %p6, %r3, %r16;\n"
2492 " @%p6 bra $Lt_22_15618;\n"
2493 " .loc 3 83 0\n"
2494 " ld.volatile.shared.f32 %f24, [%rd4+128];\n"
2495 " add.f32 %f25, %f24, %f21;\n"
2496 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2497 " .loc 3 84 0\n"
2498 " ld.volatile.shared.f32 %f26, [%rd4+64];\n"
2499 " add.f32 %f27, %f26, %f25;\n"
2500 " st.volatile.shared.f32 [%rd4+0], %f27;\n"
2501 " .loc 3 85 0\n"
2502 " ld.volatile.shared.f32 %f28, [%rd4+32];\n"
2503 " add.f32 %f29, %f28, %f27;\n"
2504 " st.volatile.shared.f32 [%rd4+0], %f29;\n"
2505 " .loc 3 86 0\n"
2506 " ld.volatile.shared.f32 %f30, [%rd4+16];\n"
2507 " add.f32 %f31, %f30, %f29;\n"
2508 " st.volatile.shared.f32 [%rd4+0], %f31;\n"
2509 " .loc 3 87 0\n"
2510 " ld.volatile.shared.f32 %f32, [%rd4+8];\n"
2511 " add.f32 %f33, %f32, %f31;\n"
2512 " st.volatile.shared.f32 [%rd4+0], %f33;\n"
2513 " .loc 3 88 0\n"
2514 " ld.volatile.shared.f32 %f34, [%rd4+4];\n"
2515 " add.f32 %f21, %f34, %f33;\n"
2516 " st.volatile.shared.f32 [%rd4+0], %f21;\n"
2517 "$Lt_22_15618:\n"
2518 " .loc 3 258 0\n"
2519 " mov.u32 %r17, 0;\n"
2520 " setp.ne.u32 %p7, %r3, %r17;\n"
2521 " @%p7 bra $Lt_22_16130;\n"
2522 " .loc 3 262 0\n"
2523 " ld.shared.f32 %f35, [__smem+0];\n"
2524 " ld.param.u64 %rd5, [__cudaparm_tex_reduce_256_false_g_odata];\n"
2525 " cvt.u64.u32 %rd6, %r1;\n"
2526 " mul.wide.u32 %rd7, %r1, 4;\n"
2527 " add.u64 %rd8, %rd5, %rd7;\n"
2528 " st.global.f32 [%rd8+0], %f35;\n"
2529 "$Lt_22_16130:\n"
2530 " .loc 3 480 0\n"
2531 " exit;\n"
2532 "$LDWend_tex_reduce_256_false:\n"
2533 " } // tex_reduce_256_false\n"
2534 "\n"
2535 " .entry tex_count_256_false (\n"
2536 " .param .u64 __cudaparm_tex_count_256_false_g_odata,\n"
2537 " .param .u32 __cudaparm_tex_count_256_false_n,\n"
2538 " .param .u32 __cudaparm_tex_count_256_false_stride)\n"
2539 " {\n"
2540 " .reg .u16 %rh<3>;\n"
2541 " .reg .u32 %r<19>;\n"
2542 " .reg .u64 %rd<10>;\n"
2543 " .reg .f32 %f<41>;\n"
2544 " .reg .f64 %fd<6>;\n"
2545 " .reg .pred %p<11>;\n"
2546 " .loc 3 482 0\n"
2547 "$LDWbegin_tex_count_256_false:\n"
2548 " .loc 3 333 0\n"
2549 " cvt.u32.u16 %r1, %ctaid.x;\n"
2550 " mul.lo.u32 %r2, %r1, 512;\n"
2551 " cvt.u32.u16 %r3, %tid.x;\n"
2552 " add.u32 %r4, %r2, %r3;\n"
2553 " mov.s32 %r5, %r4;\n"
2554 " ld.param.u32 %r6, [__cudaparm_tex_count_256_false_n];\n"
2555 " setp.ge.u32 %p1, %r4, %r6;\n"
2556 " @%p1 bra $Lt_23_18178;\n"
2557 " mov.u16 %rh1, %nctaid.x;\n"
2558 " mul.wide.u16 %r7, %rh1, 512;\n"
2559 " ld.param.u32 %r8, [__cudaparm_tex_count_256_false_stride];\n"
2560 " mov.f32 %f1, 0f00000000; // 0\n"
2561 "$Lt_23_15106:\n"
2562 " //<loop> Loop body line 333, nesting depth: 1, estimated iterations: unknown\n"
2563 " ld.param.u32 %r8, [__cudaparm_tex_count_256_false_stride];\n"
2564 " rem.u32 %r9, %r5, %r8;\n"
2565 " cvt.rn.f32.u32 %f2, %r9;\n"
2566 " div.u32 %r10, %r5, %r8;\n"
2567 " cvt.rn.f32.u32 %f3, %r10;\n"
2568 " mov.f32 %f4, 0f00000000; // 0\n"
2569 " mov.f32 %f5, 0f00000000; // 0\n"
2570 " tex.2d.v4.f32.f32 {%f6,%f7,%f8,%f9},[tex_ref_1,{%f2,%f3,%f4,%f5}];\n"
2571 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2572 " .loc 3 341 0\n"
2573 " mov.f32 %f10, %f6;\n"
2574 " .loc 3 340 0\n"
2575 " mov.f32 %f11, 0f3f800000; // 1\n"
2576 " add.f32 %f12, %f1, %f11;\n"
2577 " cvt.f64.f32 %fd1, %f10;\n"
2578 " mov.f64 %fd2, 0d0000000000000000; // 0\n"
2579 " setp.ne.f64 %p2, %fd1, %fd2;\n"
2580 " selp.f32 %f1, %f12, %f1, %p2;\n"
2581 " add.u32 %r11, %r5, 256;\n"
2582 " .loc 3 333 0\n"
2583 " ld.param.u32 %r6, [__cudaparm_tex_count_256_false_n];\n"
2584 " .loc 3 340 0\n"
2585 " setp.ge.u32 %p3, %r11, %r6;\n"
2586 " @%p3 bra $Lt_23_15362;\n"
2587 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2588 " .loc 3 333 0\n"
2589 " ld.param.u32 %r8, [__cudaparm_tex_count_256_false_stride];\n"
2590 " .loc 3 340 0\n"
2591 " rem.u32 %r12, %r11, %r8;\n"
2592 " cvt.rn.f32.u32 %f13, %r12;\n"
2593 " div.u32 %r13, %r11, %r8;\n"
2594 " cvt.rn.f32.u32 %f14, %r13;\n"
2595 " mov.f32 %f15, 0f00000000; // 0\n"
2596 " mov.f32 %f16, 0f00000000; // 0\n"
2597 " tex.2d.v4.f32.f32 {%f17,%f18,%f19,%f20},[tex_ref_1,{%f13,%f14,%f15,%f16}];\n"
2598 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2599 " .loc 3 348 0\n"
2600 " mov.f32 %f21, %f17;\n"
2601 " .loc 3 347 0\n"
2602 " mov.f32 %f22, 0f3f800000; // 1\n"
2603 " add.f32 %f23, %f1, %f22;\n"
2604 " cvt.f64.f32 %fd3, %f21;\n"
2605 " mov.f64 %fd4, 0d0000000000000000; // 0\n"
2606 " setp.ne.f64 %p4, %fd3, %fd4;\n"
2607 " selp.f32 %f1, %f23, %f1, %p4;\n"
2608 "$Lt_23_15362:\n"
2609 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2610 " add.u32 %r5, %r7, %r5;\n"
2611 " .loc 3 333 0\n"
2612 " ld.param.u32 %r6, [__cudaparm_tex_count_256_false_n];\n"
2613 " .loc 3 347 0\n"
2614 " setp.lt.u32 %p5, %r5, %r6;\n"
2615 " @%p5 bra $Lt_23_15106;\n"
2616 " bra.uni $Lt_23_14594;\n"
2617 "$Lt_23_18178:\n"
2618 " mov.f32 %f1, 0f00000000; // 0\n"
2619 "$Lt_23_14594:\n"
2620 " .loc 3 355 0\n"
2621 " mov.f32 %f24, %f1;\n"
2622 " mov.f32 %f25, %f24;\n"
2623 " .loc 3 71 0\n"
2624 " mov.u64 %rd1, __smem;\n"
2625 " cvt.u64.u32 %rd2, %r3;\n"
2626 " mul.wide.u32 %rd3, %r3, 4;\n"
2627 " add.u64 %rd4, %rd1, %rd3;\n"
2628 " st.volatile.shared.f32 [%rd4+0], %f24;\n"
2629 " .loc 3 72 0\n"
2630 " bar.sync 0;\n"
2631 " mov.u32 %r14, 127;\n"
2632 " setp.gt.u32 %p6, %r3, %r14;\n"
2633 " @%p6 bra $Lt_23_16130;\n"
2634 " .loc 3 76 0\n"
2635 " ld.volatile.shared.f32 %f26, [%rd4+512];\n"
2636 " add.f32 %f25, %f26, %f24;\n"
2637 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2638 "$Lt_23_16130:\n"
2639 " bar.sync 0;\n"
2640 " mov.u32 %r15, 63;\n"
2641 " setp.gt.u32 %p7, %r3, %r15;\n"
2642 " @%p7 bra $Lt_23_16642;\n"
2643 " .loc 3 77 0\n"
2644 " ld.volatile.shared.f32 %f27, [%rd4+256];\n"
2645 " add.f32 %f25, %f27, %f25;\n"
2646 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2647 "$Lt_23_16642:\n"
2648 " bar.sync 0;\n"
2649 " mov.u32 %r16, 31;\n"
2650 " setp.gt.u32 %p8, %r3, %r16;\n"
2651 " @%p8 bra $Lt_23_17154;\n"
2652 " .loc 3 83 0\n"
2653 " ld.volatile.shared.f32 %f28, [%rd4+128];\n"
2654 " add.f32 %f29, %f28, %f25;\n"
2655 " st.volatile.shared.f32 [%rd4+0], %f29;\n"
2656 " .loc 3 84 0\n"
2657 " ld.volatile.shared.f32 %f30, [%rd4+64];\n"
2658 " add.f32 %f31, %f30, %f29;\n"
2659 " st.volatile.shared.f32 [%rd4+0], %f31;\n"
2660 " .loc 3 85 0\n"
2661 " ld.volatile.shared.f32 %f32, [%rd4+32];\n"
2662 " add.f32 %f33, %f32, %f31;\n"
2663 " st.volatile.shared.f32 [%rd4+0], %f33;\n"
2664 " .loc 3 86 0\n"
2665 " ld.volatile.shared.f32 %f34, [%rd4+16];\n"
2666 " add.f32 %f35, %f34, %f33;\n"
2667 " st.volatile.shared.f32 [%rd4+0], %f35;\n"
2668 " .loc 3 87 0\n"
2669 " ld.volatile.shared.f32 %f36, [%rd4+8];\n"
2670 " add.f32 %f37, %f36, %f35;\n"
2671 " st.volatile.shared.f32 [%rd4+0], %f37;\n"
2672 " .loc 3 88 0\n"
2673 " ld.volatile.shared.f32 %f38, [%rd4+4];\n"
2674 " add.f32 %f25, %f38, %f37;\n"
2675 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2676 "$Lt_23_17154:\n"
2677 " .loc 3 355 0\n"
2678 " mov.u32 %r17, 0;\n"
2679 " setp.ne.u32 %p9, %r3, %r17;\n"
2680 " @%p9 bra $Lt_23_17666;\n"
2681 " .loc 3 359 0\n"
2682 " ld.shared.f32 %f39, [__smem+0];\n"
2683 " ld.param.u64 %rd5, [__cudaparm_tex_count_256_false_g_odata];\n"
2684 " cvt.u64.u32 %rd6, %r1;\n"
2685 " mul.wide.u32 %rd7, %r1, 4;\n"
2686 " add.u64 %rd8, %rd5, %rd7;\n"
2687 " st.global.f32 [%rd8+0], %f39;\n"
2688 "$Lt_23_17666:\n"
2689 " .loc 3 485 0\n"
2690 " exit;\n"
2691 "$LDWend_tex_count_256_false:\n"
2692 " } // tex_count_256_false\n"
2693 "\n"
2694 " .entry chamfer_reduce_256_false (\n"
2695 " .param .u64 __cudaparm_chamfer_reduce_256_false_g_odata,\n"
2696 " .param .u32 __cudaparm_chamfer_reduce_256_false_n,\n"
2697 " .param .u32 __cudaparm_chamfer_reduce_256_false_stride)\n"
2698 " {\n"
2699 " .reg .u16 %rh<3>;\n"
2700 " .reg .u32 %r<19>;\n"
2701 " .reg .u64 %rd<10>;\n"
2702 " .reg .f32 %f<59>;\n"
2703 " .reg .pred %p<9>;\n"
2704 " .loc 3 487 0\n"
2705 "$LDWbegin_chamfer_reduce_256_false:\n"
2706 " .loc 3 287 0\n"
2707 " cvt.u32.u16 %r1, %ctaid.x;\n"
2708 " mul.lo.u32 %r2, %r1, 512;\n"
2709 " cvt.u32.u16 %r3, %tid.x;\n"
2710 " add.u32 %r4, %r2, %r3;\n"
2711 " mov.s32 %r5, %r4;\n"
2712 " ld.param.u32 %r6, [__cudaparm_chamfer_reduce_256_false_n];\n"
2713 " setp.ge.u32 %p1, %r4, %r6;\n"
2714 " @%p1 bra $Lt_24_16642;\n"
2715 " mov.u16 %rh1, %nctaid.x;\n"
2716 " mul.wide.u16 %r7, %rh1, 512;\n"
2717 " ld.param.u32 %r8, [__cudaparm_chamfer_reduce_256_false_stride];\n"
2718 " mov.f32 %f1, 0f00000000; // 0\n"
2719 "$Lt_24_13570:\n"
2720 " //<loop> Loop body line 287, nesting depth: 1, estimated iterations: unknown\n"
2721 " ld.param.u32 %r8, [__cudaparm_chamfer_reduce_256_false_stride];\n"
2722 " rem.u32 %r9, %r5, %r8;\n"
2723 " cvt.rn.f32.u32 %f2, %r9;\n"
2724 " div.u32 %r10, %r5, %r8;\n"
2725 " cvt.rn.f32.u32 %f3, %r10;\n"
2726 " mov.f32 %f4, %f2;\n"
2727 " mov.f32 %f5, %f3;\n"
2728 " mov.f32 %f6, 0f00000000; // 0\n"
2729 " mov.f32 %f7, 0f00000000; // 0\n"
2730 " tex.2d.v4.f32.f32 {%f8,%f9,%f10,%f11},[tex_ref_1,{%f4,%f5,%f6,%f7}];\n"
2731 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2732 " .loc 3 295 0\n"
2733 " mov.f32 %f12, %f8;\n"
2734 " mov.f32 %f13, %f2;\n"
2735 " mov.f32 %f14, %f3;\n"
2736 " mov.f32 %f15, 0f00000000; // 0\n"
2737 " mov.f32 %f16, 0f00000000; // 0\n"
2738 " tex.2d.v4.f32.f32 {%f17,%f18,%f19,%f20},[tex_ref_2,{%f13,%f14,%f15,%f16}];\n"
2739 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2740 " mov.f32 %f21, %f17;\n"
2741 " mad.f32 %f1, %f12, %f21, %f1;\n"
2742 " add.u32 %r11, %r5, 256;\n"
2743 " .loc 3 287 0\n"
2744 " ld.param.u32 %r6, [__cudaparm_chamfer_reduce_256_false_n];\n"
2745 " .loc 3 295 0\n"
2746 " setp.ge.u32 %p2, %r11, %r6;\n"
2747 " @%p2 bra $Lt_24_13826;\n"
2748 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2749 " .loc 3 287 0\n"
2750 " ld.param.u32 %r8, [__cudaparm_chamfer_reduce_256_false_stride];\n"
2751 " .loc 3 295 0\n"
2752 " rem.u32 %r12, %r11, %r8;\n"
2753 " cvt.rn.f32.u32 %f22, %r12;\n"
2754 " div.u32 %r13, %r11, %r8;\n"
2755 " cvt.rn.f32.u32 %f23, %r13;\n"
2756 " mov.f32 %f24, %f22;\n"
2757 " mov.f32 %f25, %f23;\n"
2758 " mov.f32 %f26, 0f00000000; // 0\n"
2759 " mov.f32 %f27, 0f00000000; // 0\n"
2760 " tex.2d.v4.f32.f32 {%f28,%f29,%f30,%f31},[tex_ref_1,{%f24,%f25,%f26,%f27}];\n"
2761 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2762 " .loc 3 300 0\n"
2763 " mov.f32 %f32, %f28;\n"
2764 " mov.f32 %f33, %f22;\n"
2765 " mov.f32 %f34, %f23;\n"
2766 " mov.f32 %f35, 0f00000000; // 0\n"
2767 " mov.f32 %f36, 0f00000000; // 0\n"
2768 " tex.2d.v4.f32.f32 {%f37,%f38,%f39,%f40},[tex_ref_2,{%f33,%f34,%f35,%f36}];\n"
2769 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2770 " mov.f32 %f41, %f37;\n"
2771 " mad.f32 %f1, %f32, %f41, %f1;\n"
2772 "$Lt_24_13826:\n"
2773 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2774 " add.u32 %r5, %r7, %r5;\n"
2775 " .loc 3 287 0\n"
2776 " ld.param.u32 %r6, [__cudaparm_chamfer_reduce_256_false_n];\n"
2777 " .loc 3 300 0\n"
2778 " setp.lt.u32 %p3, %r5, %r6;\n"
2779 " @%p3 bra $Lt_24_13570;\n"
2780 " bra.uni $Lt_24_13058;\n"
2781 "$Lt_24_16642:\n"
2782 " mov.f32 %f1, 0f00000000; // 0\n"
2783 "$Lt_24_13058:\n"
2784 " .loc 3 305 0\n"
2785 " mov.f32 %f42, %f1;\n"
2786 " mov.f32 %f43, %f42;\n"
2787 " .loc 3 71 0\n"
2788 " mov.u64 %rd1, __smem;\n"
2789 " cvt.u64.u32 %rd2, %r3;\n"
2790 " mul.wide.u32 %rd3, %r3, 4;\n"
2791 " add.u64 %rd4, %rd1, %rd3;\n"
2792 " st.volatile.shared.f32 [%rd4+0], %f42;\n"
2793 " .loc 3 72 0\n"
2794 " bar.sync 0;\n"
2795 " mov.u32 %r14, 127;\n"
2796 " setp.gt.u32 %p4, %r3, %r14;\n"
2797 " @%p4 bra $Lt_24_14594;\n"
2798 " .loc 3 76 0\n"
2799 " ld.volatile.shared.f32 %f44, [%rd4+512];\n"
2800 " add.f32 %f43, %f44, %f42;\n"
2801 " st.volatile.shared.f32 [%rd4+0], %f43;\n"
2802 "$Lt_24_14594:\n"
2803 " bar.sync 0;\n"
2804 " mov.u32 %r15, 63;\n"
2805 " setp.gt.u32 %p5, %r3, %r15;\n"
2806 " @%p5 bra $Lt_24_15106;\n"
2807 " .loc 3 77 0\n"
2808 " ld.volatile.shared.f32 %f45, [%rd4+256];\n"
2809 " add.f32 %f43, %f45, %f43;\n"
2810 " st.volatile.shared.f32 [%rd4+0], %f43;\n"
2811 "$Lt_24_15106:\n"
2812 " bar.sync 0;\n"
2813 " mov.u32 %r16, 31;\n"
2814 " setp.gt.u32 %p6, %r3, %r16;\n"
2815 " @%p6 bra $Lt_24_15618;\n"
2816 " .loc 3 83 0\n"
2817 " ld.volatile.shared.f32 %f46, [%rd4+128];\n"
2818 " add.f32 %f47, %f46, %f43;\n"
2819 " st.volatile.shared.f32 [%rd4+0], %f47;\n"
2820 " .loc 3 84 0\n"
2821 " ld.volatile.shared.f32 %f48, [%rd4+64];\n"
2822 " add.f32 %f49, %f48, %f47;\n"
2823 " st.volatile.shared.f32 [%rd4+0], %f49;\n"
2824 " .loc 3 85 0\n"
2825 " ld.volatile.shared.f32 %f50, [%rd4+32];\n"
2826 " add.f32 %f51, %f50, %f49;\n"
2827 " st.volatile.shared.f32 [%rd4+0], %f51;\n"
2828 " .loc 3 86 0\n"
2829 " ld.volatile.shared.f32 %f52, [%rd4+16];\n"
2830 " add.f32 %f53, %f52, %f51;\n"
2831 " st.volatile.shared.f32 [%rd4+0], %f53;\n"
2832 " .loc 3 87 0\n"
2833 " ld.volatile.shared.f32 %f54, [%rd4+8];\n"
2834 " add.f32 %f55, %f54, %f53;\n"
2835 " st.volatile.shared.f32 [%rd4+0], %f55;\n"
2836 " .loc 3 88 0\n"
2837 " ld.volatile.shared.f32 %f56, [%rd4+4];\n"
2838 " add.f32 %f43, %f56, %f55;\n"
2839 " st.volatile.shared.f32 [%rd4+0], %f43;\n"
2840 "$Lt_24_15618:\n"
2841 " .loc 3 305 0\n"
2842 " mov.u32 %r17, 0;\n"
2843 " setp.ne.u32 %p7, %r3, %r17;\n"
2844 " @%p7 bra $Lt_24_16130;\n"
2845 " .loc 3 309 0\n"
2846 " ld.shared.f32 %f57, [__smem+0];\n"
2847 " ld.param.u64 %rd5, [__cudaparm_chamfer_reduce_256_false_g_odata];\n"
2848 " cvt.u64.u32 %rd6, %r1;\n"
2849 " mul.wide.u32 %rd7, %r1, 4;\n"
2850 " add.u64 %rd8, %rd5, %rd7;\n"
2851 " st.global.f32 [%rd8+0], %f57;\n"
2852 "$Lt_24_16130:\n"
2853 " .loc 3 490 0\n"
2854 " exit;\n"
2855 "$LDWend_chamfer_reduce_256_false:\n"
2856 " } // chamfer_reduce_256_false\n"
2857 "\n"
2858 " .entry reduce_uchar_1_true (\n"
2859 " .param .u64 __cudaparm_reduce_uchar_1_true_g_idata,\n"
2860 " .param .u64 __cudaparm_reduce_uchar_1_true_g_odata,\n"
2861 " .param .u32 __cudaparm_reduce_uchar_1_true_n)\n"
2862 " {\n"
2863 " .reg .u16 %rh<3>;\n"
2864 " .reg .u32 %r<12>;\n"
2865 " .reg .u64 %rd<14>;\n"
2866 " .reg .f32 %f<7>;\n"
2867 " .reg .pred %p<5>;\n"
2868 " .loc 3 500 0\n"
2869 "$LDWbegin_reduce_uchar_1_true:\n"
2870 " .loc 3 181 0\n"
2871 " cvt.u32.u16 %r1, %ctaid.x;\n"
2872 " mul24.lo.u32 %r2, %r1, 2;\n"
2873 " cvt.u32.u16 %r3, %tid.x;\n"
2874 " add.u32 %r4, %r2, %r3;\n"
2875 " mov.s32 %r5, %r4;\n"
2876 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_1_true_n];\n"
2877 " setp.ge.u32 %p1, %r4, %r6;\n"
2878 " @%p1 bra $Lt_25_16642;\n"
2879 " mov.u16 %rh1, %nctaid.x;\n"
2880 " mul.wide.u16 %r7, %rh1, 2;\n"
2881 " cvt.u64.u32 %rd1, %r4;\n"
2882 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_1_true_g_idata];\n"
2883 " add.u64 %rd3, %rd1, %rd2;\n"
2884 " cvt.s64.u32 %rd4, %r7;\n"
2885 " mov.f32 %f1, 0f00000000; // 0\n"
2886 "$Lt_25_15618:\n"
2887 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2888 " .loc 3 188 0\n"
2889 " ld.global.u8 %r8, [%rd3+0];\n"
2890 " cvt.rn.f32.u32 %f2, %r8;\n"
2891 " add.f32 %f3, %f2, %f1;\n"
2892 " .loc 3 191 0\n"
2893 " ld.global.u8 %r9, [%rd3+1];\n"
2894 " cvt.rn.f32.u32 %f4, %r9;\n"
2895 " add.f32 %f1, %f4, %f3;\n"
2896 " add.u32 %r5, %r7, %r5;\n"
2897 " add.u64 %rd3, %rd4, %rd3;\n"
2898 " .loc 3 181 0\n"
2899 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_1_true_n];\n"
2900 " .loc 3 191 0\n"
2901 " setp.lt.u32 %p2, %r5, %r6;\n"
2902 " @%p2 bra $Lt_25_15618;\n"
2903 " bra.uni $Lt_25_15106;\n"
2904 "$Lt_25_16642:\n"
2905 " mov.f32 %f1, 0f00000000; // 0\n"
2906 "$Lt_25_15106:\n"
2907 " .loc 3 71 0\n"
2908 " mov.u64 %rd5, __smem;\n"
2909 " cvt.u64.u32 %rd6, %r3;\n"
2910 " mul.wide.u32 %rd7, %r3, 4;\n"
2911 " add.u64 %rd8, %rd5, %rd7;\n"
2912 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
2913 " .loc 3 72 0\n"
2914 " bar.sync 0;\n"
2915 " .loc 3 195 0\n"
2916 " mov.u32 %r10, 0;\n"
2917 " setp.ne.u32 %p3, %r3, %r10;\n"
2918 " @%p3 bra $Lt_25_16130;\n"
2919 " .loc 3 199 0\n"
2920 " ld.shared.f32 %f5, [__smem+0];\n"
2921 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_1_true_g_odata];\n"
2922 " cvt.u64.u32 %rd10, %r1;\n"
2923 " mul.wide.u32 %rd11, %r1, 4;\n"
2924 " add.u64 %rd12, %rd9, %rd11;\n"
2925 " st.global.f32 [%rd12+0], %f5;\n"
2926 "$Lt_25_16130:\n"
2927 " .loc 3 503 0\n"
2928 " exit;\n"
2929 "$LDWend_reduce_uchar_1_true:\n"
2930 " } // reduce_uchar_1_true\n"
2931 "\n"
2932 " .entry reduce_uchar_2_true (\n"
2933 " .param .u64 __cudaparm_reduce_uchar_2_true_g_idata,\n"
2934 " .param .u64 __cudaparm_reduce_uchar_2_true_g_odata,\n"
2935 " .param .u32 __cudaparm_reduce_uchar_2_true_n)\n"
2936 " {\n"
2937 " .reg .u16 %rh<3>;\n"
2938 " .reg .u32 %r<13>;\n"
2939 " .reg .u64 %rd<14>;\n"
2940 " .reg .f32 %f<9>;\n"
2941 " .reg .pred %p<6>;\n"
2942 " .loc 3 505 0\n"
2943 "$LDWbegin_reduce_uchar_2_true:\n"
2944 " .loc 3 181 0\n"
2945 " cvt.u32.u16 %r1, %ctaid.x;\n"
2946 " mul24.lo.u32 %r2, %r1, 4;\n"
2947 " cvt.u32.u16 %r3, %tid.x;\n"
2948 " add.u32 %r4, %r2, %r3;\n"
2949 " mov.s32 %r5, %r4;\n"
2950 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_2_true_n];\n"
2951 " setp.ge.u32 %p1, %r4, %r6;\n"
2952 " @%p1 bra $Lt_26_16898;\n"
2953 " mov.u16 %rh1, %nctaid.x;\n"
2954 " mul.wide.u16 %r7, %rh1, 4;\n"
2955 " cvt.u64.u32 %rd1, %r4;\n"
2956 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_2_true_g_idata];\n"
2957 " add.u64 %rd3, %rd1, %rd2;\n"
2958 " cvt.s64.u32 %rd4, %r7;\n"
2959 " mov.f32 %f1, 0f00000000; // 0\n"
2960 "$Lt_26_15362:\n"
2961 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2962 " .loc 3 188 0\n"
2963 " ld.global.u8 %r8, [%rd3+0];\n"
2964 " cvt.rn.f32.u32 %f2, %r8;\n"
2965 " add.f32 %f3, %f2, %f1;\n"
2966 " .loc 3 191 0\n"
2967 " ld.global.u8 %r9, [%rd3+2];\n"
2968 " cvt.rn.f32.u32 %f4, %r9;\n"
2969 " add.f32 %f1, %f4, %f3;\n"
2970 " add.u32 %r5, %r7, %r5;\n"
2971 " add.u64 %rd3, %rd4, %rd3;\n"
2972 " .loc 3 181 0\n"
2973 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_2_true_n];\n"
2974 " .loc 3 191 0\n"
2975 " setp.lt.u32 %p2, %r5, %r6;\n"
2976 " @%p2 bra $Lt_26_15362;\n"
2977 " bra.uni $Lt_26_14850;\n"
2978 "$Lt_26_16898:\n"
2979 " mov.f32 %f1, 0f00000000; // 0\n"
2980 "$Lt_26_14850:\n"
2981 " .loc 3 71 0\n"
2982 " mov.u64 %rd5, __smem;\n"
2983 " cvt.u64.u32 %rd6, %r3;\n"
2984 " mul.wide.u32 %rd7, %r3, 4;\n"
2985 " add.u64 %rd8, %rd5, %rd7;\n"
2986 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
2987 " .loc 3 72 0\n"
2988 " bar.sync 0;\n"
2989 " mov.u32 %r10, 31;\n"
2990 " setp.gt.u32 %p3, %r3, %r10;\n"
2991 " @%p3 bra $Lt_26_15874;\n"
2992 " .loc 3 88 0\n"
2993 " ld.volatile.shared.f32 %f5, [%rd8+4];\n"
2994 " add.f32 %f6, %f5, %f1;\n"
2995 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
2996 "$Lt_26_15874:\n"
2997 " .loc 3 195 0\n"
2998 " mov.u32 %r11, 0;\n"
2999 " setp.ne.u32 %p4, %r3, %r11;\n"
3000 " @%p4 bra $Lt_26_16386;\n"
3001 " .loc 3 199 0\n"
3002 " ld.shared.f32 %f7, [__smem+0];\n"
3003 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_2_true_g_odata];\n"
3004 " cvt.u64.u32 %rd10, %r1;\n"
3005 " mul.wide.u32 %rd11, %r1, 4;\n"
3006 " add.u64 %rd12, %rd9, %rd11;\n"
3007 " st.global.f32 [%rd12+0], %f7;\n"
3008 "$Lt_26_16386:\n"
3009 " .loc 3 508 0\n"
3010 " exit;\n"
3011 "$LDWend_reduce_uchar_2_true:\n"
3012 " } // reduce_uchar_2_true\n"
3013 "\n"
3014 " .entry reduce_uchar_4_true (\n"
3015 " .param .u64 __cudaparm_reduce_uchar_4_true_g_idata,\n"
3016 " .param .u64 __cudaparm_reduce_uchar_4_true_g_odata,\n"
3017 " .param .u32 __cudaparm_reduce_uchar_4_true_n)\n"
3018 " {\n"
3019 " .reg .u16 %rh<3>;\n"
3020 " .reg .u32 %r<13>;\n"
3021 " .reg .u64 %rd<14>;\n"
3022 " .reg .f32 %f<11>;\n"
3023 " .reg .pred %p<6>;\n"
3024 " .loc 3 510 0\n"
3025 "$LDWbegin_reduce_uchar_4_true:\n"
3026 " .loc 3 181 0\n"
3027 " cvt.u32.u16 %r1, %ctaid.x;\n"
3028 " mul24.lo.u32 %r2, %r1, 8;\n"
3029 " cvt.u32.u16 %r3, %tid.x;\n"
3030 " add.u32 %r4, %r2, %r3;\n"
3031 " mov.s32 %r5, %r4;\n"
3032 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_4_true_n];\n"
3033 " setp.ge.u32 %p1, %r4, %r6;\n"
3034 " @%p1 bra $Lt_27_16642;\n"
3035 " mov.u16 %rh1, %nctaid.x;\n"
3036 " mul.wide.u16 %r7, %rh1, 8;\n"
3037 " cvt.u64.u32 %rd1, %r4;\n"
3038 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_4_true_g_idata];\n"
3039 " add.u64 %rd3, %rd1, %rd2;\n"
3040 " cvt.s64.u32 %rd4, %r7;\n"
3041 " mov.f32 %f1, 0f00000000; // 0\n"
3042 "$Lt_27_15106:\n"
3043 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3044 " .loc 3 188 0\n"
3045 " ld.global.u8 %r8, [%rd3+0];\n"
3046 " cvt.rn.f32.u32 %f2, %r8;\n"
3047 " add.f32 %f3, %f2, %f1;\n"
3048 " .loc 3 191 0\n"
3049 " ld.global.u8 %r9, [%rd3+4];\n"
3050 " cvt.rn.f32.u32 %f4, %r9;\n"
3051 " add.f32 %f1, %f4, %f3;\n"
3052 " add.u32 %r5, %r7, %r5;\n"
3053 " add.u64 %rd3, %rd4, %rd3;\n"
3054 " .loc 3 181 0\n"
3055 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_4_true_n];\n"
3056 " .loc 3 191 0\n"
3057 " setp.lt.u32 %p2, %r5, %r6;\n"
3058 " @%p2 bra $Lt_27_15106;\n"
3059 " bra.uni $Lt_27_14594;\n"
3060 "$Lt_27_16642:\n"
3061 " mov.f32 %f1, 0f00000000; // 0\n"
3062 "$Lt_27_14594:\n"
3063 " .loc 3 71 0\n"
3064 " mov.u64 %rd5, __smem;\n"
3065 " cvt.u64.u32 %rd6, %r3;\n"
3066 " mul.wide.u32 %rd7, %r3, 4;\n"
3067 " add.u64 %rd8, %rd5, %rd7;\n"
3068 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3069 " .loc 3 72 0\n"
3070 " bar.sync 0;\n"
3071 " mov.u32 %r10, 31;\n"
3072 " setp.gt.u32 %p3, %r3, %r10;\n"
3073 " @%p3 bra $Lt_27_15618;\n"
3074 " .loc 3 87 0\n"
3075 " ld.volatile.shared.f32 %f5, [%rd8+8];\n"
3076 " add.f32 %f6, %f5, %f1;\n"
3077 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3078 " .loc 3 88 0\n"
3079 " ld.volatile.shared.f32 %f7, [%rd8+4];\n"
3080 " add.f32 %f8, %f7, %f6;\n"
3081 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3082 "$Lt_27_15618:\n"
3083 " .loc 3 195 0\n"
3084 " mov.u32 %r11, 0;\n"
3085 " setp.ne.u32 %p4, %r3, %r11;\n"
3086 " @%p4 bra $Lt_27_16130;\n"
3087 " .loc 3 199 0\n"
3088 " ld.shared.f32 %f9, [__smem+0];\n"
3089 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_4_true_g_odata];\n"
3090 " cvt.u64.u32 %rd10, %r1;\n"
3091 " mul.wide.u32 %rd11, %r1, 4;\n"
3092 " add.u64 %rd12, %rd9, %rd11;\n"
3093 " st.global.f32 [%rd12+0], %f9;\n"
3094 "$Lt_27_16130:\n"
3095 " .loc 3 513 0\n"
3096 " exit;\n"
3097 "$LDWend_reduce_uchar_4_true:\n"
3098 " } // reduce_uchar_4_true\n"
3099 "\n"
3100 " .entry reduce_uchar_8_true (\n"
3101 " .param .u64 __cudaparm_reduce_uchar_8_true_g_idata,\n"
3102 " .param .u64 __cudaparm_reduce_uchar_8_true_g_odata,\n"
3103 " .param .u32 __cudaparm_reduce_uchar_8_true_n)\n"
3104 " {\n"
3105 " .reg .u16 %rh<3>;\n"
3106 " .reg .u32 %r<13>;\n"
3107 " .reg .u64 %rd<14>;\n"
3108 " .reg .f32 %f<13>;\n"
3109 " .reg .pred %p<6>;\n"
3110 " .loc 3 515 0\n"
3111 "$LDWbegin_reduce_uchar_8_true:\n"
3112 " .loc 3 181 0\n"
3113 " cvt.u32.u16 %r1, %ctaid.x;\n"
3114 " mul24.lo.u32 %r2, %r1, 16;\n"
3115 " cvt.u32.u16 %r3, %tid.x;\n"
3116 " add.u32 %r4, %r2, %r3;\n"
3117 " mov.s32 %r5, %r4;\n"
3118 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_8_true_n];\n"
3119 " setp.ge.u32 %p1, %r4, %r6;\n"
3120 " @%p1 bra $Lt_28_16386;\n"
3121 " mov.u16 %rh1, %nctaid.x;\n"
3122 " mul.wide.u16 %r7, %rh1, 16;\n"
3123 " cvt.u64.u32 %rd1, %r4;\n"
3124 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_8_true_g_idata];\n"
3125 " add.u64 %rd3, %rd1, %rd2;\n"
3126 " cvt.s64.u32 %rd4, %r7;\n"
3127 " mov.f32 %f1, 0f00000000; // 0\n"
3128 "$Lt_28_14850:\n"
3129 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3130 " .loc 3 188 0\n"
3131 " ld.global.u8 %r8, [%rd3+0];\n"
3132 " cvt.rn.f32.u32 %f2, %r8;\n"
3133 " add.f32 %f3, %f2, %f1;\n"
3134 " .loc 3 191 0\n"
3135 " ld.global.u8 %r9, [%rd3+8];\n"
3136 " cvt.rn.f32.u32 %f4, %r9;\n"
3137 " add.f32 %f1, %f4, %f3;\n"
3138 " add.u32 %r5, %r7, %r5;\n"
3139 " add.u64 %rd3, %rd4, %rd3;\n"
3140 " .loc 3 181 0\n"
3141 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_8_true_n];\n"
3142 " .loc 3 191 0\n"
3143 " setp.lt.u32 %p2, %r5, %r6;\n"
3144 " @%p2 bra $Lt_28_14850;\n"
3145 " bra.uni $Lt_28_14338;\n"
3146 "$Lt_28_16386:\n"
3147 " mov.f32 %f1, 0f00000000; // 0\n"
3148 "$Lt_28_14338:\n"
3149 " .loc 3 71 0\n"
3150 " mov.u64 %rd5, __smem;\n"
3151 " cvt.u64.u32 %rd6, %r3;\n"
3152 " mul.wide.u32 %rd7, %r3, 4;\n"
3153 " add.u64 %rd8, %rd5, %rd7;\n"
3154 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3155 " .loc 3 72 0\n"
3156 " bar.sync 0;\n"
3157 " mov.u32 %r10, 31;\n"
3158 " setp.gt.u32 %p3, %r3, %r10;\n"
3159 " @%p3 bra $Lt_28_15362;\n"
3160 " .loc 3 86 0\n"
3161 " ld.volatile.shared.f32 %f5, [%rd8+16];\n"
3162 " add.f32 %f6, %f5, %f1;\n"
3163 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3164 " .loc 3 87 0\n"
3165 " ld.volatile.shared.f32 %f7, [%rd8+8];\n"
3166 " add.f32 %f8, %f7, %f6;\n"
3167 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3168 " .loc 3 88 0\n"
3169 " ld.volatile.shared.f32 %f9, [%rd8+4];\n"
3170 " add.f32 %f10, %f9, %f8;\n"
3171 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3172 "$Lt_28_15362:\n"
3173 " .loc 3 195 0\n"
3174 " mov.u32 %r11, 0;\n"
3175 " setp.ne.u32 %p4, %r3, %r11;\n"
3176 " @%p4 bra $Lt_28_15874;\n"
3177 " .loc 3 199 0\n"
3178 " ld.shared.f32 %f11, [__smem+0];\n"
3179 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_8_true_g_odata];\n"
3180 " cvt.u64.u32 %rd10, %r1;\n"
3181 " mul.wide.u32 %rd11, %r1, 4;\n"
3182 " add.u64 %rd12, %rd9, %rd11;\n"
3183 " st.global.f32 [%rd12+0], %f11;\n"
3184 "$Lt_28_15874:\n"
3185 " .loc 3 518 0\n"
3186 " exit;\n"
3187 "$LDWend_reduce_uchar_8_true:\n"
3188 " } // reduce_uchar_8_true\n"
3189 "\n"
3190 " .entry reduce_uchar_16_true (\n"
3191 " .param .u64 __cudaparm_reduce_uchar_16_true_g_idata,\n"
3192 " .param .u64 __cudaparm_reduce_uchar_16_true_g_odata,\n"
3193 " .param .u32 __cudaparm_reduce_uchar_16_true_n)\n"
3194 " {\n"
3195 " .reg .u16 %rh<3>;\n"
3196 " .reg .u32 %r<13>;\n"
3197 " .reg .u64 %rd<14>;\n"
3198 " .reg .f32 %f<15>;\n"
3199 " .reg .pred %p<6>;\n"
3200 " .loc 3 520 0\n"
3201 "$LDWbegin_reduce_uchar_16_true:\n"
3202 " .loc 3 181 0\n"
3203 " cvt.u32.u16 %r1, %ctaid.x;\n"
3204 " mul24.lo.u32 %r2, %r1, 32;\n"
3205 " cvt.u32.u16 %r3, %tid.x;\n"
3206 " add.u32 %r4, %r2, %r3;\n"
3207 " mov.s32 %r5, %r4;\n"
3208 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_16_true_n];\n"
3209 " setp.ge.u32 %p1, %r4, %r6;\n"
3210 " @%p1 bra $Lt_29_16130;\n"
3211 " mov.u16 %rh1, %nctaid.x;\n"
3212 " mul.wide.u16 %r7, %rh1, 32;\n"
3213 " cvt.u64.u32 %rd1, %r4;\n"
3214 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_16_true_g_idata];\n"
3215 " add.u64 %rd3, %rd1, %rd2;\n"
3216 " cvt.s64.u32 %rd4, %r7;\n"
3217 " mov.f32 %f1, 0f00000000; // 0\n"
3218 "$Lt_29_14594:\n"
3219 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3220 " .loc 3 188 0\n"
3221 " ld.global.u8 %r8, [%rd3+0];\n"
3222 " cvt.rn.f32.u32 %f2, %r8;\n"
3223 " add.f32 %f3, %f2, %f1;\n"
3224 " .loc 3 191 0\n"
3225 " ld.global.u8 %r9, [%rd3+16];\n"
3226 " cvt.rn.f32.u32 %f4, %r9;\n"
3227 " add.f32 %f1, %f4, %f3;\n"
3228 " add.u32 %r5, %r7, %r5;\n"
3229 " add.u64 %rd3, %rd4, %rd3;\n"
3230 " .loc 3 181 0\n"
3231 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_16_true_n];\n"
3232 " .loc 3 191 0\n"
3233 " setp.lt.u32 %p2, %r5, %r6;\n"
3234 " @%p2 bra $Lt_29_14594;\n"
3235 " bra.uni $Lt_29_14082;\n"
3236 "$Lt_29_16130:\n"
3237 " mov.f32 %f1, 0f00000000; // 0\n"
3238 "$Lt_29_14082:\n"
3239 " .loc 3 71 0\n"
3240 " mov.u64 %rd5, __smem;\n"
3241 " cvt.u64.u32 %rd6, %r3;\n"
3242 " mul.wide.u32 %rd7, %r3, 4;\n"
3243 " add.u64 %rd8, %rd5, %rd7;\n"
3244 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3245 " .loc 3 72 0\n"
3246 " bar.sync 0;\n"
3247 " mov.u32 %r10, 31;\n"
3248 " setp.gt.u32 %p3, %r3, %r10;\n"
3249 " @%p3 bra $Lt_29_15106;\n"
3250 " .loc 3 85 0\n"
3251 " ld.volatile.shared.f32 %f5, [%rd8+32];\n"
3252 " add.f32 %f6, %f5, %f1;\n"
3253 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3254 " .loc 3 86 0\n"
3255 " ld.volatile.shared.f32 %f7, [%rd8+16];\n"
3256 " add.f32 %f8, %f7, %f6;\n"
3257 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3258 " .loc 3 87 0\n"
3259 " ld.volatile.shared.f32 %f9, [%rd8+8];\n"
3260 " add.f32 %f10, %f9, %f8;\n"
3261 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3262 " .loc 3 88 0\n"
3263 " ld.volatile.shared.f32 %f11, [%rd8+4];\n"
3264 " add.f32 %f12, %f11, %f10;\n"
3265 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3266 "$Lt_29_15106:\n"
3267 " .loc 3 195 0\n"
3268 " mov.u32 %r11, 0;\n"
3269 " setp.ne.u32 %p4, %r3, %r11;\n"
3270 " @%p4 bra $Lt_29_15618;\n"
3271 " .loc 3 199 0\n"
3272 " ld.shared.f32 %f13, [__smem+0];\n"
3273 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_16_true_g_odata];\n"
3274 " cvt.u64.u32 %rd10, %r1;\n"
3275 " mul.wide.u32 %rd11, %r1, 4;\n"
3276 " add.u64 %rd12, %rd9, %rd11;\n"
3277 " st.global.f32 [%rd12+0], %f13;\n"
3278 "$Lt_29_15618:\n"
3279 " .loc 3 523 0\n"
3280 " exit;\n"
3281 "$LDWend_reduce_uchar_16_true:\n"
3282 " } // reduce_uchar_16_true\n"
3283 "\n"
3284 " .entry reduce_uchar_32_true (\n"
3285 " .param .u64 __cudaparm_reduce_uchar_32_true_g_idata,\n"
3286 " .param .u64 __cudaparm_reduce_uchar_32_true_g_odata,\n"
3287 " .param .u32 __cudaparm_reduce_uchar_32_true_n)\n"
3288 " {\n"
3289 " .reg .u16 %rh<3>;\n"
3290 " .reg .u32 %r<13>;\n"
3291 " .reg .u64 %rd<14>;\n"
3292 " .reg .f32 %f<17>;\n"
3293 " .reg .pred %p<6>;\n"
3294 " .loc 3 525 0\n"
3295 "$LDWbegin_reduce_uchar_32_true:\n"
3296 " .loc 3 181 0\n"
3297 " cvt.u32.u16 %r1, %ctaid.x;\n"
3298 " mul24.lo.u32 %r2, %r1, 64;\n"
3299 " cvt.u32.u16 %r3, %tid.x;\n"
3300 " add.u32 %r4, %r2, %r3;\n"
3301 " mov.s32 %r5, %r4;\n"
3302 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_32_true_n];\n"
3303 " setp.ge.u32 %p1, %r4, %r6;\n"
3304 " @%p1 bra $Lt_30_15874;\n"
3305 " mov.u16 %rh1, %nctaid.x;\n"
3306 " mul.wide.u16 %r7, %rh1, 64;\n"
3307 " cvt.u64.u32 %rd1, %r4;\n"
3308 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_32_true_g_idata];\n"
3309 " add.u64 %rd3, %rd1, %rd2;\n"
3310 " cvt.s64.u32 %rd4, %r7;\n"
3311 " mov.f32 %f1, 0f00000000; // 0\n"
3312 "$Lt_30_14338:\n"
3313 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3314 " .loc 3 188 0\n"
3315 " ld.global.u8 %r8, [%rd3+0];\n"
3316 " cvt.rn.f32.u32 %f2, %r8;\n"
3317 " add.f32 %f3, %f2, %f1;\n"
3318 " .loc 3 191 0\n"
3319 " ld.global.u8 %r9, [%rd3+32];\n"
3320 " cvt.rn.f32.u32 %f4, %r9;\n"
3321 " add.f32 %f1, %f4, %f3;\n"
3322 " add.u32 %r5, %r7, %r5;\n"
3323 " add.u64 %rd3, %rd4, %rd3;\n"
3324 " .loc 3 181 0\n"
3325 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_32_true_n];\n"
3326 " .loc 3 191 0\n"
3327 " setp.lt.u32 %p2, %r5, %r6;\n"
3328 " @%p2 bra $Lt_30_14338;\n"
3329 " bra.uni $Lt_30_13826;\n"
3330 "$Lt_30_15874:\n"
3331 " mov.f32 %f1, 0f00000000; // 0\n"
3332 "$Lt_30_13826:\n"
3333 " .loc 3 71 0\n"
3334 " mov.u64 %rd5, __smem;\n"
3335 " cvt.u64.u32 %rd6, %r3;\n"
3336 " mul.wide.u32 %rd7, %r3, 4;\n"
3337 " add.u64 %rd8, %rd5, %rd7;\n"
3338 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3339 " .loc 3 72 0\n"
3340 " bar.sync 0;\n"
3341 " mov.u32 %r10, 31;\n"
3342 " setp.gt.u32 %p3, %r3, %r10;\n"
3343 " @%p3 bra $Lt_30_14850;\n"
3344 " .loc 3 84 0\n"
3345 " ld.volatile.shared.f32 %f5, [%rd8+64];\n"
3346 " add.f32 %f6, %f5, %f1;\n"
3347 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3348 " .loc 3 85 0\n"
3349 " ld.volatile.shared.f32 %f7, [%rd8+32];\n"
3350 " add.f32 %f8, %f7, %f6;\n"
3351 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3352 " .loc 3 86 0\n"
3353 " ld.volatile.shared.f32 %f9, [%rd8+16];\n"
3354 " add.f32 %f10, %f9, %f8;\n"
3355 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3356 " .loc 3 87 0\n"
3357 " ld.volatile.shared.f32 %f11, [%rd8+8];\n"
3358 " add.f32 %f12, %f11, %f10;\n"
3359 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3360 " .loc 3 88 0\n"
3361 " ld.volatile.shared.f32 %f13, [%rd8+4];\n"
3362 " add.f32 %f14, %f13, %f12;\n"
3363 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
3364 "$Lt_30_14850:\n"
3365 " .loc 3 195 0\n"
3366 " mov.u32 %r11, 0;\n"
3367 " setp.ne.u32 %p4, %r3, %r11;\n"
3368 " @%p4 bra $Lt_30_15362;\n"
3369 " .loc 3 199 0\n"
3370 " ld.shared.f32 %f15, [__smem+0];\n"
3371 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_32_true_g_odata];\n"
3372 " cvt.u64.u32 %rd10, %r1;\n"
3373 " mul.wide.u32 %rd11, %r1, 4;\n"
3374 " add.u64 %rd12, %rd9, %rd11;\n"
3375 " st.global.f32 [%rd12+0], %f15;\n"
3376 "$Lt_30_15362:\n"
3377 " .loc 3 528 0\n"
3378 " exit;\n"
3379 "$LDWend_reduce_uchar_32_true:\n"
3380 " } // reduce_uchar_32_true\n"
3381 "\n"
3382 " .entry reduce_uchar_64_true (\n"
3383 " .param .u64 __cudaparm_reduce_uchar_64_true_g_idata,\n"
3384 " .param .u64 __cudaparm_reduce_uchar_64_true_g_odata,\n"
3385 " .param .u32 __cudaparm_reduce_uchar_64_true_n)\n"
3386 " {\n"
3387 " .reg .u16 %rh<3>;\n"
3388 " .reg .u32 %r<13>;\n"
3389 " .reg .u64 %rd<14>;\n"
3390 " .reg .f32 %f<19>;\n"
3391 " .reg .pred %p<6>;\n"
3392 " .loc 3 530 0\n"
3393 "$LDWbegin_reduce_uchar_64_true:\n"
3394 " .loc 3 181 0\n"
3395 " cvt.u32.u16 %r1, %ctaid.x;\n"
3396 " mul24.lo.u32 %r2, %r1, 128;\n"
3397 " cvt.u32.u16 %r3, %tid.x;\n"
3398 " add.u32 %r4, %r2, %r3;\n"
3399 " mov.s32 %r5, %r4;\n"
3400 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_64_true_n];\n"
3401 " setp.ge.u32 %p1, %r4, %r6;\n"
3402 " @%p1 bra $Lt_31_15618;\n"
3403 " mov.u16 %rh1, %nctaid.x;\n"
3404 " mul.wide.u16 %r7, %rh1, 128;\n"
3405 " cvt.u64.u32 %rd1, %r4;\n"
3406 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_64_true_g_idata];\n"
3407 " add.u64 %rd3, %rd1, %rd2;\n"
3408 " cvt.s64.u32 %rd4, %r7;\n"
3409 " mov.f32 %f1, 0f00000000; // 0\n"
3410 "$Lt_31_14082:\n"
3411 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3412 " .loc 3 188 0\n"
3413 " ld.global.u8 %r8, [%rd3+0];\n"
3414 " cvt.rn.f32.u32 %f2, %r8;\n"
3415 " add.f32 %f3, %f2, %f1;\n"
3416 " .loc 3 191 0\n"
3417 " ld.global.u8 %r9, [%rd3+64];\n"
3418 " cvt.rn.f32.u32 %f4, %r9;\n"
3419 " add.f32 %f1, %f4, %f3;\n"
3420 " add.u32 %r5, %r7, %r5;\n"
3421 " add.u64 %rd3, %rd4, %rd3;\n"
3422 " .loc 3 181 0\n"
3423 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_64_true_n];\n"
3424 " .loc 3 191 0\n"
3425 " setp.lt.u32 %p2, %r5, %r6;\n"
3426 " @%p2 bra $Lt_31_14082;\n"
3427 " bra.uni $Lt_31_13570;\n"
3428 "$Lt_31_15618:\n"
3429 " mov.f32 %f1, 0f00000000; // 0\n"
3430 "$Lt_31_13570:\n"
3431 " .loc 3 71 0\n"
3432 " mov.u64 %rd5, __smem;\n"
3433 " cvt.u64.u32 %rd6, %r3;\n"
3434 " mul.wide.u32 %rd7, %r3, 4;\n"
3435 " add.u64 %rd8, %rd5, %rd7;\n"
3436 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3437 " .loc 3 72 0\n"
3438 " bar.sync 0;\n"
3439 " mov.u32 %r10, 31;\n"
3440 " setp.gt.u32 %p3, %r3, %r10;\n"
3441 " @%p3 bra $Lt_31_14594;\n"
3442 " .loc 3 83 0\n"
3443 " ld.volatile.shared.f32 %f5, [%rd8+128];\n"
3444 " add.f32 %f6, %f5, %f1;\n"
3445 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3446 " .loc 3 84 0\n"
3447 " ld.volatile.shared.f32 %f7, [%rd8+64];\n"
3448 " add.f32 %f8, %f7, %f6;\n"
3449 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3450 " .loc 3 85 0\n"
3451 " ld.volatile.shared.f32 %f9, [%rd8+32];\n"
3452 " add.f32 %f10, %f9, %f8;\n"
3453 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3454 " .loc 3 86 0\n"
3455 " ld.volatile.shared.f32 %f11, [%rd8+16];\n"
3456 " add.f32 %f12, %f11, %f10;\n"
3457 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3458 " .loc 3 87 0\n"
3459 " ld.volatile.shared.f32 %f13, [%rd8+8];\n"
3460 " add.f32 %f14, %f13, %f12;\n"
3461 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
3462 " .loc 3 88 0\n"
3463 " ld.volatile.shared.f32 %f15, [%rd8+4];\n"
3464 " add.f32 %f16, %f15, %f14;\n"
3465 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
3466 "$Lt_31_14594:\n"
3467 " .loc 3 195 0\n"
3468 " mov.u32 %r11, 0;\n"
3469 " setp.ne.u32 %p4, %r3, %r11;\n"
3470 " @%p4 bra $Lt_31_15106;\n"
3471 " .loc 3 199 0\n"
3472 " ld.shared.f32 %f17, [__smem+0];\n"
3473 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_64_true_g_odata];\n"
3474 " cvt.u64.u32 %rd10, %r1;\n"
3475 " mul.wide.u32 %rd11, %r1, 4;\n"
3476 " add.u64 %rd12, %rd9, %rd11;\n"
3477 " st.global.f32 [%rd12+0], %f17;\n"
3478 "$Lt_31_15106:\n"
3479 " .loc 3 533 0\n"
3480 " exit;\n"
3481 "$LDWend_reduce_uchar_64_true:\n"
3482 " } // reduce_uchar_64_true\n"
3483 "\n"
3484 " .entry reduce_uchar_128_true (\n"
3485 " .param .u64 __cudaparm_reduce_uchar_128_true_g_idata,\n"
3486 " .param .u64 __cudaparm_reduce_uchar_128_true_g_odata,\n"
3487 " .param .u32 __cudaparm_reduce_uchar_128_true_n)\n"
3488 " {\n"
3489 " .reg .u16 %rh<3>;\n"
3490 " .reg .u32 %r<14>;\n"
3491 " .reg .u64 %rd<14>;\n"
3492 " .reg .f32 %f<21>;\n"
3493 " .reg .pred %p<7>;\n"
3494 " .loc 3 535 0\n"
3495 "$LDWbegin_reduce_uchar_128_true:\n"
3496 " .loc 3 181 0\n"
3497 " cvt.u32.u16 %r1, %ctaid.x;\n"
3498 " mul.lo.u32 %r2, %r1, 256;\n"
3499 " cvt.u32.u16 %r3, %tid.x;\n"
3500 " add.u32 %r4, %r2, %r3;\n"
3501 " mov.s32 %r5, %r4;\n"
3502 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_128_true_n];\n"
3503 " setp.ge.u32 %p1, %r4, %r6;\n"
3504 " @%p1 bra $Lt_32_15874;\n"
3505 " mov.u16 %rh1, %nctaid.x;\n"
3506 " mul.wide.u16 %r7, %rh1, 256;\n"
3507 " cvt.u64.u32 %rd1, %r4;\n"
3508 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_128_true_g_idata];\n"
3509 " add.u64 %rd3, %rd1, %rd2;\n"
3510 " cvt.s64.u32 %rd4, %r7;\n"
3511 " mov.f32 %f1, 0f00000000; // 0\n"
3512 "$Lt_32_13826:\n"
3513 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3514 " .loc 3 188 0\n"
3515 " ld.global.u8 %r8, [%rd3+0];\n"
3516 " cvt.rn.f32.u32 %f2, %r8;\n"
3517 " add.f32 %f3, %f2, %f1;\n"
3518 " .loc 3 191 0\n"
3519 " ld.global.u8 %r9, [%rd3+128];\n"
3520 " cvt.rn.f32.u32 %f4, %r9;\n"
3521 " add.f32 %f1, %f4, %f3;\n"
3522 " add.u32 %r5, %r7, %r5;\n"
3523 " add.u64 %rd3, %rd4, %rd3;\n"
3524 " .loc 3 181 0\n"
3525 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_128_true_n];\n"
3526 " .loc 3 191 0\n"
3527 " setp.lt.u32 %p2, %r5, %r6;\n"
3528 " @%p2 bra $Lt_32_13826;\n"
3529 " bra.uni $Lt_32_13314;\n"
3530 "$Lt_32_15874:\n"
3531 " mov.f32 %f1, 0f00000000; // 0\n"
3532 "$Lt_32_13314:\n"
3533 " .loc 3 195 0\n"
3534 " mov.f32 %f5, %f1;\n"
3535 " mov.f32 %f6, %f5;\n"
3536 " .loc 3 71 0\n"
3537 " mov.u64 %rd5, __smem;\n"
3538 " cvt.u64.u32 %rd6, %r3;\n"
3539 " mul.wide.u32 %rd7, %r3, 4;\n"
3540 " add.u64 %rd8, %rd5, %rd7;\n"
3541 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
3542 " .loc 3 72 0\n"
3543 " bar.sync 0;\n"
3544 " mov.u32 %r10, 63;\n"
3545 " setp.gt.u32 %p3, %r3, %r10;\n"
3546 " @%p3 bra $Lt_32_14338;\n"
3547 " .loc 3 77 0\n"
3548 " ld.volatile.shared.f32 %f7, [%rd8+256];\n"
3549 " add.f32 %f6, %f7, %f5;\n"
3550 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3551 "$Lt_32_14338:\n"
3552 " bar.sync 0;\n"
3553 " mov.u32 %r11, 31;\n"
3554 " setp.gt.u32 %p4, %r3, %r11;\n"
3555 " @%p4 bra $Lt_32_14850;\n"
3556 " .loc 3 83 0\n"
3557 " ld.volatile.shared.f32 %f8, [%rd8+128];\n"
3558 " add.f32 %f9, %f8, %f6;\n"
3559 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
3560 " .loc 3 84 0\n"
3561 " ld.volatile.shared.f32 %f10, [%rd8+64];\n"
3562 " add.f32 %f11, %f10, %f9;\n"
3563 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
3564 " .loc 3 85 0\n"
3565 " ld.volatile.shared.f32 %f12, [%rd8+32];\n"
3566 " add.f32 %f13, %f12, %f11;\n"
3567 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
3568 " .loc 3 86 0\n"
3569 " ld.volatile.shared.f32 %f14, [%rd8+16];\n"
3570 " add.f32 %f15, %f14, %f13;\n"
3571 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
3572 " .loc 3 87 0\n"
3573 " ld.volatile.shared.f32 %f16, [%rd8+8];\n"
3574 " add.f32 %f17, %f16, %f15;\n"
3575 " st.volatile.shared.f32 [%rd8+0], %f17;\n"
3576 " .loc 3 88 0\n"
3577 " ld.volatile.shared.f32 %f18, [%rd8+4];\n"
3578 " add.f32 %f6, %f18, %f17;\n"
3579 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3580 "$Lt_32_14850:\n"
3581 " .loc 3 195 0\n"
3582 " mov.u32 %r12, 0;\n"
3583 " setp.ne.u32 %p5, %r3, %r12;\n"
3584 " @%p5 bra $Lt_32_15362;\n"
3585 " .loc 3 199 0\n"
3586 " ld.shared.f32 %f19, [__smem+0];\n"
3587 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_128_true_g_odata];\n"
3588 " cvt.u64.u32 %rd10, %r1;\n"
3589 " mul.wide.u32 %rd11, %r1, 4;\n"
3590 " add.u64 %rd12, %rd9, %rd11;\n"
3591 " st.global.f32 [%rd12+0], %f19;\n"
3592 "$Lt_32_15362:\n"
3593 " .loc 3 538 0\n"
3594 " exit;\n"
3595 "$LDWend_reduce_uchar_128_true:\n"
3596 " } // reduce_uchar_128_true\n"
3597 "\n"
3598 " .entry reduce_uchar_256_true (\n"
3599 " .param .u64 __cudaparm_reduce_uchar_256_true_g_idata,\n"
3600 " .param .u64 __cudaparm_reduce_uchar_256_true_g_odata,\n"
3601 " .param .u32 __cudaparm_reduce_uchar_256_true_n)\n"
3602 " {\n"
3603 " .reg .u16 %rh<3>;\n"
3604 " .reg .u32 %r<15>;\n"
3605 " .reg .u64 %rd<14>;\n"
3606 " .reg .f32 %f<22>;\n"
3607 " .reg .pred %p<8>;\n"
3608 " .loc 3 540 0\n"
3609 "$LDWbegin_reduce_uchar_256_true:\n"
3610 " .loc 3 181 0\n"
3611 " cvt.u32.u16 %r1, %ctaid.x;\n"
3612 " mul.lo.u32 %r2, %r1, 512;\n"
3613 " cvt.u32.u16 %r3, %tid.x;\n"
3614 " add.u32 %r4, %r2, %r3;\n"
3615 " mov.s32 %r5, %r4;\n"
3616 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_256_true_n];\n"
3617 " setp.ge.u32 %p1, %r4, %r6;\n"
3618 " @%p1 bra $Lt_33_16130;\n"
3619 " mov.u16 %rh1, %nctaid.x;\n"
3620 " mul.wide.u16 %r7, %rh1, 512;\n"
3621 " cvt.u64.u32 %rd1, %r4;\n"
3622 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_256_true_g_idata];\n"
3623 " add.u64 %rd3, %rd1, %rd2;\n"
3624 " cvt.s64.u32 %rd4, %r7;\n"
3625 " mov.f32 %f1, 0f00000000; // 0\n"
3626 "$Lt_33_13570:\n"
3627 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3628 " .loc 3 188 0\n"
3629 " ld.global.u8 %r8, [%rd3+0];\n"
3630 " cvt.rn.f32.u32 %f2, %r8;\n"
3631 " add.f32 %f3, %f2, %f1;\n"
3632 " .loc 3 191 0\n"
3633 " ld.global.u8 %r9, [%rd3+256];\n"
3634 " cvt.rn.f32.u32 %f4, %r9;\n"
3635 " add.f32 %f1, %f4, %f3;\n"
3636 " add.u32 %r5, %r7, %r5;\n"
3637 " add.u64 %rd3, %rd4, %rd3;\n"
3638 " .loc 3 181 0\n"
3639 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_256_true_n];\n"
3640 " .loc 3 191 0\n"
3641 " setp.lt.u32 %p2, %r5, %r6;\n"
3642 " @%p2 bra $Lt_33_13570;\n"
3643 " bra.uni $Lt_33_13058;\n"
3644 "$Lt_33_16130:\n"
3645 " mov.f32 %f1, 0f00000000; // 0\n"
3646 "$Lt_33_13058:\n"
3647 " .loc 3 195 0\n"
3648 " mov.f32 %f5, %f1;\n"
3649 " mov.f32 %f6, %f5;\n"
3650 " .loc 3 71 0\n"
3651 " mov.u64 %rd5, __smem;\n"
3652 " cvt.u64.u32 %rd6, %r3;\n"
3653 " mul.wide.u32 %rd7, %r3, 4;\n"
3654 " add.u64 %rd8, %rd5, %rd7;\n"
3655 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
3656 " .loc 3 72 0\n"
3657 " bar.sync 0;\n"
3658 " mov.u32 %r10, 127;\n"
3659 " setp.gt.u32 %p3, %r3, %r10;\n"
3660 " @%p3 bra $Lt_33_14082;\n"
3661 " .loc 3 76 0\n"
3662 " ld.volatile.shared.f32 %f7, [%rd8+512];\n"
3663 " add.f32 %f6, %f7, %f5;\n"
3664 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3665 "$Lt_33_14082:\n"
3666 " bar.sync 0;\n"
3667 " mov.u32 %r11, 63;\n"
3668 " setp.gt.u32 %p4, %r3, %r11;\n"
3669 " @%p4 bra $Lt_33_14594;\n"
3670 " .loc 3 77 0\n"
3671 " ld.volatile.shared.f32 %f8, [%rd8+256];\n"
3672 " add.f32 %f6, %f8, %f6;\n"
3673 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3674 "$Lt_33_14594:\n"
3675 " bar.sync 0;\n"
3676 " mov.u32 %r12, 31;\n"
3677 " setp.gt.u32 %p5, %r3, %r12;\n"
3678 " @%p5 bra $Lt_33_15106;\n"
3679 " .loc 3 83 0\n"
3680 " ld.volatile.shared.f32 %f9, [%rd8+128];\n"
3681 " add.f32 %f10, %f9, %f6;\n"
3682 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3683 " .loc 3 84 0\n"
3684 " ld.volatile.shared.f32 %f11, [%rd8+64];\n"
3685 " add.f32 %f12, %f11, %f10;\n"
3686 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3687 " .loc 3 85 0\n"
3688 " ld.volatile.shared.f32 %f13, [%rd8+32];\n"
3689 " add.f32 %f14, %f13, %f12;\n"
3690 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
3691 " .loc 3 86 0\n"
3692 " ld.volatile.shared.f32 %f15, [%rd8+16];\n"
3693 " add.f32 %f16, %f15, %f14;\n"
3694 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
3695 " .loc 3 87 0\n"
3696 " ld.volatile.shared.f32 %f17, [%rd8+8];\n"
3697 " add.f32 %f18, %f17, %f16;\n"
3698 " st.volatile.shared.f32 [%rd8+0], %f18;\n"
3699 " .loc 3 88 0\n"
3700 " ld.volatile.shared.f32 %f19, [%rd8+4];\n"
3701 " add.f32 %f6, %f19, %f18;\n"
3702 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3703 "$Lt_33_15106:\n"
3704 " .loc 3 195 0\n"
3705 " mov.u32 %r13, 0;\n"
3706 " setp.ne.u32 %p6, %r3, %r13;\n"
3707 " @%p6 bra $Lt_33_15618;\n"
3708 " .loc 3 199 0\n"
3709 " ld.shared.f32 %f20, [__smem+0];\n"
3710 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_256_true_g_odata];\n"
3711 " cvt.u64.u32 %rd10, %r1;\n"
3712 " mul.wide.u32 %rd11, %r1, 4;\n"
3713 " add.u64 %rd12, %rd9, %rd11;\n"
3714 " st.global.f32 [%rd12+0], %f20;\n"
3715 "$Lt_33_15618:\n"
3716 " .loc 3 543 0\n"
3717 " exit;\n"
3718 "$LDWend_reduce_uchar_256_true:\n"
3719 " } // reduce_uchar_256_true\n"
3720 "\n"
3721 " .entry reduce_uchar_512_true (\n"
3722 " .param .u64 __cudaparm_reduce_uchar_512_true_g_idata,\n"
3723 " .param .u64 __cudaparm_reduce_uchar_512_true_g_odata,\n"
3724 " .param .u32 __cudaparm_reduce_uchar_512_true_n)\n"
3725 " {\n"
3726 " .reg .u16 %rh<3>;\n"
3727 " .reg .u32 %r<16>;\n"
3728 " .reg .u64 %rd<14>;\n"
3729 " .reg .f32 %f<23>;\n"
3730 " .reg .pred %p<9>;\n"
3731 " .loc 3 545 0\n"
3732 "$LDWbegin_reduce_uchar_512_true:\n"
3733 " .loc 3 181 0\n"
3734 " cvt.u32.u16 %r1, %ctaid.x;\n"
3735 " mul.lo.u32 %r2, %r1, 1024;\n"
3736 " cvt.u32.u16 %r3, %tid.x;\n"
3737 " add.u32 %r4, %r2, %r3;\n"
3738 " mov.s32 %r5, %r4;\n"
3739 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_512_true_n];\n"
3740 " setp.ge.u32 %p1, %r4, %r6;\n"
3741 " @%p1 bra $Lt_34_16386;\n"
3742 " mov.u16 %rh1, %nctaid.x;\n"
3743 " mul.wide.u16 %r7, %rh1, 1024;\n"
3744 " cvt.u64.u32 %rd1, %r4;\n"
3745 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_512_true_g_idata];\n"
3746 " add.u64 %rd3, %rd1, %rd2;\n"
3747 " cvt.s64.u32 %rd4, %r7;\n"
3748 " mov.f32 %f1, 0f00000000; // 0\n"
3749 "$Lt_34_13314:\n"
3750 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3751 " .loc 3 188 0\n"
3752 " ld.global.u8 %r8, [%rd3+0];\n"
3753 " cvt.rn.f32.u32 %f2, %r8;\n"
3754 " add.f32 %f3, %f2, %f1;\n"
3755 " .loc 3 191 0\n"
3756 " ld.global.u8 %r9, [%rd3+512];\n"
3757 " cvt.rn.f32.u32 %f4, %r9;\n"
3758 " add.f32 %f1, %f4, %f3;\n"
3759 " add.u32 %r5, %r7, %r5;\n"
3760 " add.u64 %rd3, %rd4, %rd3;\n"
3761 " .loc 3 181 0\n"
3762 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_512_true_n];\n"
3763 " .loc 3 191 0\n"
3764 " setp.lt.u32 %p2, %r5, %r6;\n"
3765 " @%p2 bra $Lt_34_13314;\n"
3766 " bra.uni $Lt_34_12802;\n"
3767 "$Lt_34_16386:\n"
3768 " mov.f32 %f1, 0f00000000; // 0\n"
3769 "$Lt_34_12802:\n"
3770 " .loc 3 195 0\n"
3771 " mov.f32 %f5, %f1;\n"
3772 " mov.f32 %f6, %f5;\n"
3773 " .loc 3 71 0\n"
3774 " mov.u64 %rd5, __smem;\n"
3775 " cvt.u64.u32 %rd6, %r3;\n"
3776 " mul.wide.u32 %rd7, %r3, 4;\n"
3777 " add.u64 %rd8, %rd5, %rd7;\n"
3778 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
3779 " .loc 3 72 0\n"
3780 " bar.sync 0;\n"
3781 " mov.u32 %r10, 255;\n"
3782 " setp.gt.u32 %p3, %r3, %r10;\n"
3783 " @%p3 bra $Lt_34_13826;\n"
3784 " .loc 3 75 0\n"
3785 " ld.volatile.shared.f32 %f7, [%rd8+1024];\n"
3786 " add.f32 %f6, %f7, %f5;\n"
3787 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3788 "$Lt_34_13826:\n"
3789 " bar.sync 0;\n"
3790 " mov.u32 %r11, 127;\n"
3791 " setp.gt.u32 %p4, %r3, %r11;\n"
3792 " @%p4 bra $Lt_34_14338;\n"
3793 " .loc 3 76 0\n"
3794 " ld.volatile.shared.f32 %f8, [%rd8+512];\n"
3795 " add.f32 %f6, %f8, %f6;\n"
3796 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3797 "$Lt_34_14338:\n"
3798 " bar.sync 0;\n"
3799 " mov.u32 %r12, 63;\n"
3800 " setp.gt.u32 %p5, %r3, %r12;\n"
3801 " @%p5 bra $Lt_34_14850;\n"
3802 " .loc 3 77 0\n"
3803 " ld.volatile.shared.f32 %f9, [%rd8+256];\n"
3804 " add.f32 %f6, %f9, %f6;\n"
3805 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3806 "$Lt_34_14850:\n"
3807 " bar.sync 0;\n"
3808 " mov.u32 %r13, 31;\n"
3809 " setp.gt.u32 %p6, %r3, %r13;\n"
3810 " @%p6 bra $Lt_34_15362;\n"
3811 " .loc 3 83 0\n"
3812 " ld.volatile.shared.f32 %f10, [%rd8+128];\n"
3813 " add.f32 %f11, %f10, %f6;\n"
3814 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
3815 " .loc 3 84 0\n"
3816 " ld.volatile.shared.f32 %f12, [%rd8+64];\n"
3817 " add.f32 %f13, %f12, %f11;\n"
3818 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
3819 " .loc 3 85 0\n"
3820 " ld.volatile.shared.f32 %f14, [%rd8+32];\n"
3821 " add.f32 %f15, %f14, %f13;\n"
3822 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
3823 " .loc 3 86 0\n"
3824 " ld.volatile.shared.f32 %f16, [%rd8+16];\n"
3825 " add.f32 %f17, %f16, %f15;\n"
3826 " st.volatile.shared.f32 [%rd8+0], %f17;\n"
3827 " .loc 3 87 0\n"
3828 " ld.volatile.shared.f32 %f18, [%rd8+8];\n"
3829 " add.f32 %f19, %f18, %f17;\n"
3830 " st.volatile.shared.f32 [%rd8+0], %f19;\n"
3831 " .loc 3 88 0\n"
3832 " ld.volatile.shared.f32 %f20, [%rd8+4];\n"
3833 " add.f32 %f6, %f20, %f19;\n"
3834 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3835 "$Lt_34_15362:\n"
3836 " .loc 3 195 0\n"
3837 " mov.u32 %r14, 0;\n"
3838 " setp.ne.u32 %p7, %r3, %r14;\n"
3839 " @%p7 bra $Lt_34_15874;\n"
3840 " .loc 3 199 0\n"
3841 " ld.shared.f32 %f21, [__smem+0];\n"
3842 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_512_true_g_odata];\n"
3843 " cvt.u64.u32 %rd10, %r1;\n"
3844 " mul.wide.u32 %rd11, %r1, 4;\n"
3845 " add.u64 %rd12, %rd9, %rd11;\n"
3846 " st.global.f32 [%rd12+0], %f21;\n"
3847 "$Lt_34_15874:\n"
3848 " .loc 3 548 0\n"
3849 " exit;\n"
3850 "$LDWend_reduce_uchar_512_true:\n"
3851 " } // reduce_uchar_512_true\n"
3852 "\n"
3853 " .entry reduce_uchar_1_false (\n"
3854 " .param .u64 __cudaparm_reduce_uchar_1_false_g_idata,\n"
3855 " .param .u64 __cudaparm_reduce_uchar_1_false_g_odata,\n"
3856 " .param .u32 __cudaparm_reduce_uchar_1_false_n)\n"
3857 " {\n"
3858 " .reg .u16 %rh<3>;\n"
3859 " .reg .u32 %r<13>;\n"
3860 " .reg .u64 %rd<14>;\n"
3861 " .reg .f32 %f<6>;\n"
3862 " .reg .pred %p<6>;\n"
3863 " .loc 3 551 0\n"
3864 "$LDWbegin_reduce_uchar_1_false:\n"
3865 " .loc 3 181 0\n"
3866 " cvt.u32.u16 %r1, %ctaid.x;\n"
3867 " mul24.lo.u32 %r2, %r1, 2;\n"
3868 " cvt.u32.u16 %r3, %tid.x;\n"
3869 " add.u32 %r4, %r2, %r3;\n"
3870 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_1_false_n];\n"
3871 " setp.ge.u32 %p1, %r4, %r5;\n"
3872 " @%p1 bra $Lt_35_17154;\n"
3873 " add.u32 %r6, %r4, 1;\n"
3874 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_1_false_n];\n"
3875 " add.u32 %r7, %r5, 1;\n"
3876 " mov.u16 %rh1, %nctaid.x;\n"
3877 " mul.wide.u16 %r8, %rh1, 2;\n"
3878 " cvt.u64.u32 %rd1, %r4;\n"
3879 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_1_false_g_idata];\n"
3880 " add.u64 %rd3, %rd1, %rd2;\n"
3881 " cvt.s64.u32 %rd4, %r8;\n"
3882 " mov.f32 %f1, 0f00000000; // 0\n"
3883 "$Lt_35_15618:\n"
3884 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3885 " .loc 3 188 0\n"
3886 " ld.global.u8 %r9, [%rd3+0];\n"
3887 " cvt.rn.f32.u32 %f2, %r9;\n"
3888 " add.f32 %f1, %f2, %f1;\n"
3889 " .loc 3 181 0\n"
3890 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_1_false_n];\n"
3891 " .loc 3 188 0\n"
3892 " setp.ge.u32 %p2, %r6, %r5;\n"
3893 " @%p2 bra $Lt_35_15874;\n"
3894 " //<loop> Part of loop body line 181, head labeled $Lt_35_15618\n"
3895 " .loc 3 191 0\n"
3896 " ld.global.u8 %r10, [%rd3+1];\n"
3897 " cvt.rn.f32.u32 %f3, %r10;\n"
3898 " add.f32 %f1, %f3, %f1;\n"
3899 "$Lt_35_15874:\n"
3900 " //<loop> Part of loop body line 181, head labeled $Lt_35_15618\n"
3901 " add.u32 %r6, %r6, %r8;\n"
3902 " add.u64 %rd3, %rd4, %rd3;\n"
3903 " setp.lt.u32 %p3, %r6, %r7;\n"
3904 " @%p3 bra $Lt_35_15618;\n"
3905 " bra.uni $Lt_35_15106;\n"
3906 "$Lt_35_17154:\n"
3907 " mov.f32 %f1, 0f00000000; // 0\n"
3908 "$Lt_35_15106:\n"
3909 " .loc 3 71 0\n"
3910 " mov.u64 %rd5, __smem;\n"
3911 " cvt.u64.u32 %rd6, %r3;\n"
3912 " mul.wide.u32 %rd7, %r3, 4;\n"
3913 " add.u64 %rd8, %rd5, %rd7;\n"
3914 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3915 " .loc 3 72 0\n"
3916 " bar.sync 0;\n"
3917 " .loc 3 195 0\n"
3918 " mov.u32 %r11, 0;\n"
3919 " setp.ne.u32 %p4, %r3, %r11;\n"
3920 " @%p4 bra $Lt_35_16642;\n"
3921 " .loc 3 199 0\n"
3922 " ld.shared.f32 %f4, [__smem+0];\n"
3923 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_1_false_g_odata];\n"
3924 " cvt.u64.u32 %rd10, %r1;\n"
3925 " mul.wide.u32 %rd11, %r1, 4;\n"
3926 " add.u64 %rd12, %rd9, %rd11;\n"
3927 " st.global.f32 [%rd12+0], %f4;\n"
3928 "$Lt_35_16642:\n"
3929 " .loc 3 554 0\n"
3930 " exit;\n"
3931 "$LDWend_reduce_uchar_1_false:\n"
3932 " } // reduce_uchar_1_false\n"
3933 "\n"
3934 " .entry reduce_uchar_2_false (\n"
3935 " .param .u64 __cudaparm_reduce_uchar_2_false_g_idata,\n"
3936 " .param .u64 __cudaparm_reduce_uchar_2_false_g_odata,\n"
3937 " .param .u32 __cudaparm_reduce_uchar_2_false_n)\n"
3938 " {\n"
3939 " .reg .u16 %rh<3>;\n"
3940 " .reg .u32 %r<14>;\n"
3941 " .reg .u64 %rd<14>;\n"
3942 " .reg .f32 %f<8>;\n"
3943 " .reg .pred %p<7>;\n"
3944 " .loc 3 556 0\n"
3945 "$LDWbegin_reduce_uchar_2_false:\n"
3946 " .loc 3 181 0\n"
3947 " cvt.u32.u16 %r1, %ctaid.x;\n"
3948 " mul24.lo.u32 %r2, %r1, 4;\n"
3949 " cvt.u32.u16 %r3, %tid.x;\n"
3950 " add.u32 %r4, %r2, %r3;\n"
3951 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_2_false_n];\n"
3952 " setp.ge.u32 %p1, %r4, %r5;\n"
3953 " @%p1 bra $Lt_36_17410;\n"
3954 " add.u32 %r6, %r4, 2;\n"
3955 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_2_false_n];\n"
3956 " add.u32 %r7, %r5, 2;\n"
3957 " mov.u16 %rh1, %nctaid.x;\n"
3958 " mul.wide.u16 %r8, %rh1, 4;\n"
3959 " cvt.u64.u32 %rd1, %r4;\n"
3960 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_2_false_g_idata];\n"
3961 " add.u64 %rd3, %rd1, %rd2;\n"
3962 " cvt.s64.u32 %rd4, %r8;\n"
3963 " mov.f32 %f1, 0f00000000; // 0\n"
3964 "$Lt_36_15362:\n"
3965 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3966 " .loc 3 188 0\n"
3967 " ld.global.u8 %r9, [%rd3+0];\n"
3968 " cvt.rn.f32.u32 %f2, %r9;\n"
3969 " add.f32 %f1, %f2, %f1;\n"
3970 " .loc 3 181 0\n"
3971 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_2_false_n];\n"
3972 " .loc 3 188 0\n"
3973 " setp.ge.u32 %p2, %r6, %r5;\n"
3974 " @%p2 bra $Lt_36_15618;\n"
3975 " //<loop> Part of loop body line 181, head labeled $Lt_36_15362\n"
3976 " .loc 3 191 0\n"
3977 " ld.global.u8 %r10, [%rd3+2];\n"
3978 " cvt.rn.f32.u32 %f3, %r10;\n"
3979 " add.f32 %f1, %f3, %f1;\n"
3980 "$Lt_36_15618:\n"
3981 " //<loop> Part of loop body line 181, head labeled $Lt_36_15362\n"
3982 " add.u32 %r6, %r6, %r8;\n"
3983 " add.u64 %rd3, %rd4, %rd3;\n"
3984 " setp.lt.u32 %p3, %r6, %r7;\n"
3985 " @%p3 bra $Lt_36_15362;\n"
3986 " bra.uni $Lt_36_14850;\n"
3987 "$Lt_36_17410:\n"
3988 " mov.f32 %f1, 0f00000000; // 0\n"
3989 "$Lt_36_14850:\n"
3990 " .loc 3 71 0\n"
3991 " mov.u64 %rd5, __smem;\n"
3992 " cvt.u64.u32 %rd6, %r3;\n"
3993 " mul.wide.u32 %rd7, %r3, 4;\n"
3994 " add.u64 %rd8, %rd5, %rd7;\n"
3995 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3996 " .loc 3 72 0\n"
3997 " bar.sync 0;\n"
3998 " mov.u32 %r11, 31;\n"
3999 " setp.gt.u32 %p4, %r3, %r11;\n"
4000 " @%p4 bra $Lt_36_16386;\n"
4001 " .loc 3 88 0\n"
4002 " ld.volatile.shared.f32 %f4, [%rd8+4];\n"
4003 " add.f32 %f5, %f4, %f1;\n"
4004 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4005 "$Lt_36_16386:\n"
4006 " .loc 3 195 0\n"
4007 " mov.u32 %r12, 0;\n"
4008 " setp.ne.u32 %p5, %r3, %r12;\n"
4009 " @%p5 bra $Lt_36_16898;\n"
4010 " .loc 3 199 0\n"
4011 " ld.shared.f32 %f6, [__smem+0];\n"
4012 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_2_false_g_odata];\n"
4013 " cvt.u64.u32 %rd10, %r1;\n"
4014 " mul.wide.u32 %rd11, %r1, 4;\n"
4015 " add.u64 %rd12, %rd9, %rd11;\n"
4016 " st.global.f32 [%rd12+0], %f6;\n"
4017 "$Lt_36_16898:\n"
4018 " .loc 3 559 0\n"
4019 " exit;\n"
4020 "$LDWend_reduce_uchar_2_false:\n"
4021 " } // reduce_uchar_2_false\n"
4022 "\n"
4023 " .entry reduce_uchar_4_false (\n"
4024 " .param .u64 __cudaparm_reduce_uchar_4_false_g_idata,\n"
4025 " .param .u64 __cudaparm_reduce_uchar_4_false_g_odata,\n"
4026 " .param .u32 __cudaparm_reduce_uchar_4_false_n)\n"
4027 " {\n"
4028 " .reg .u16 %rh<3>;\n"
4029 " .reg .u32 %r<14>;\n"
4030 " .reg .u64 %rd<14>;\n"
4031 " .reg .f32 %f<10>;\n"
4032 " .reg .pred %p<7>;\n"
4033 " .loc 3 561 0\n"
4034 "$LDWbegin_reduce_uchar_4_false:\n"
4035 " .loc 3 181 0\n"
4036 " cvt.u32.u16 %r1, %ctaid.x;\n"
4037 " mul24.lo.u32 %r2, %r1, 8;\n"
4038 " cvt.u32.u16 %r3, %tid.x;\n"
4039 " add.u32 %r4, %r2, %r3;\n"
4040 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_4_false_n];\n"
4041 " setp.ge.u32 %p1, %r4, %r5;\n"
4042 " @%p1 bra $Lt_37_17154;\n"
4043 " add.u32 %r6, %r4, 4;\n"
4044 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_4_false_n];\n"
4045 " add.u32 %r7, %r5, 4;\n"
4046 " mov.u16 %rh1, %nctaid.x;\n"
4047 " mul.wide.u16 %r8, %rh1, 8;\n"
4048 " cvt.u64.u32 %rd1, %r4;\n"
4049 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_4_false_g_idata];\n"
4050 " add.u64 %rd3, %rd1, %rd2;\n"
4051 " cvt.s64.u32 %rd4, %r8;\n"
4052 " mov.f32 %f1, 0f00000000; // 0\n"
4053 "$Lt_37_15106:\n"
4054 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4055 " .loc 3 188 0\n"
4056 " ld.global.u8 %r9, [%rd3+0];\n"
4057 " cvt.rn.f32.u32 %f2, %r9;\n"
4058 " add.f32 %f1, %f2, %f1;\n"
4059 " .loc 3 181 0\n"
4060 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_4_false_n];\n"
4061 " .loc 3 188 0\n"
4062 " setp.ge.u32 %p2, %r6, %r5;\n"
4063 " @%p2 bra $Lt_37_15362;\n"
4064 " //<loop> Part of loop body line 181, head labeled $Lt_37_15106\n"
4065 " .loc 3 191 0\n"
4066 " ld.global.u8 %r10, [%rd3+4];\n"
4067 " cvt.rn.f32.u32 %f3, %r10;\n"
4068 " add.f32 %f1, %f3, %f1;\n"
4069 "$Lt_37_15362:\n"
4070 " //<loop> Part of loop body line 181, head labeled $Lt_37_15106\n"
4071 " add.u32 %r6, %r6, %r8;\n"
4072 " add.u64 %rd3, %rd4, %rd3;\n"
4073 " setp.lt.u32 %p3, %r6, %r7;\n"
4074 " @%p3 bra $Lt_37_15106;\n"
4075 " bra.uni $Lt_37_14594;\n"
4076 "$Lt_37_17154:\n"
4077 " mov.f32 %f1, 0f00000000; // 0\n"
4078 "$Lt_37_14594:\n"
4079 " .loc 3 71 0\n"
4080 " mov.u64 %rd5, __smem;\n"
4081 " cvt.u64.u32 %rd6, %r3;\n"
4082 " mul.wide.u32 %rd7, %r3, 4;\n"
4083 " add.u64 %rd8, %rd5, %rd7;\n"
4084 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4085 " .loc 3 72 0\n"
4086 " bar.sync 0;\n"
4087 " mov.u32 %r11, 31;\n"
4088 " setp.gt.u32 %p4, %r3, %r11;\n"
4089 " @%p4 bra $Lt_37_16130;\n"
4090 " .loc 3 87 0\n"
4091 " ld.volatile.shared.f32 %f4, [%rd8+8];\n"
4092 " add.f32 %f5, %f4, %f1;\n"
4093 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4094 " .loc 3 88 0\n"
4095 " ld.volatile.shared.f32 %f6, [%rd8+4];\n"
4096 " add.f32 %f7, %f6, %f5;\n"
4097 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4098 "$Lt_37_16130:\n"
4099 " .loc 3 195 0\n"
4100 " mov.u32 %r12, 0;\n"
4101 " setp.ne.u32 %p5, %r3, %r12;\n"
4102 " @%p5 bra $Lt_37_16642;\n"
4103 " .loc 3 199 0\n"
4104 " ld.shared.f32 %f8, [__smem+0];\n"
4105 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_4_false_g_odata];\n"
4106 " cvt.u64.u32 %rd10, %r1;\n"
4107 " mul.wide.u32 %rd11, %r1, 4;\n"
4108 " add.u64 %rd12, %rd9, %rd11;\n"
4109 " st.global.f32 [%rd12+0], %f8;\n"
4110 "$Lt_37_16642:\n"
4111 " .loc 3 564 0\n"
4112 " exit;\n"
4113 "$LDWend_reduce_uchar_4_false:\n"
4114 " } // reduce_uchar_4_false\n"
4115 "\n"
4116 " .entry reduce_uchar_8_false (\n"
4117 " .param .u64 __cudaparm_reduce_uchar_8_false_g_idata,\n"
4118 " .param .u64 __cudaparm_reduce_uchar_8_false_g_odata,\n"
4119 " .param .u32 __cudaparm_reduce_uchar_8_false_n)\n"
4120 " {\n"
4121 " .reg .u16 %rh<3>;\n"
4122 " .reg .u32 %r<14>;\n"
4123 " .reg .u64 %rd<14>;\n"
4124 " .reg .f32 %f<12>;\n"
4125 " .reg .pred %p<7>;\n"
4126 " .loc 3 566 0\n"
4127 "$LDWbegin_reduce_uchar_8_false:\n"
4128 " .loc 3 181 0\n"
4129 " cvt.u32.u16 %r1, %ctaid.x;\n"
4130 " mul24.lo.u32 %r2, %r1, 16;\n"
4131 " cvt.u32.u16 %r3, %tid.x;\n"
4132 " add.u32 %r4, %r2, %r3;\n"
4133 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_8_false_n];\n"
4134 " setp.ge.u32 %p1, %r4, %r5;\n"
4135 " @%p1 bra $Lt_38_16898;\n"
4136 " add.u32 %r6, %r4, 8;\n"
4137 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_8_false_n];\n"
4138 " add.u32 %r7, %r5, 8;\n"
4139 " mov.u16 %rh1, %nctaid.x;\n"
4140 " mul.wide.u16 %r8, %rh1, 16;\n"
4141 " cvt.u64.u32 %rd1, %r4;\n"
4142 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_8_false_g_idata];\n"
4143 " add.u64 %rd3, %rd1, %rd2;\n"
4144 " cvt.s64.u32 %rd4, %r8;\n"
4145 " mov.f32 %f1, 0f00000000; // 0\n"
4146 "$Lt_38_14850:\n"
4147 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4148 " .loc 3 188 0\n"
4149 " ld.global.u8 %r9, [%rd3+0];\n"
4150 " cvt.rn.f32.u32 %f2, %r9;\n"
4151 " add.f32 %f1, %f2, %f1;\n"
4152 " .loc 3 181 0\n"
4153 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_8_false_n];\n"
4154 " .loc 3 188 0\n"
4155 " setp.ge.u32 %p2, %r6, %r5;\n"
4156 " @%p2 bra $Lt_38_15106;\n"
4157 " //<loop> Part of loop body line 181, head labeled $Lt_38_14850\n"
4158 " .loc 3 191 0\n"
4159 " ld.global.u8 %r10, [%rd3+8];\n"
4160 " cvt.rn.f32.u32 %f3, %r10;\n"
4161 " add.f32 %f1, %f3, %f1;\n"
4162 "$Lt_38_15106:\n"
4163 " //<loop> Part of loop body line 181, head labeled $Lt_38_14850\n"
4164 " add.u32 %r6, %r6, %r8;\n"
4165 " add.u64 %rd3, %rd4, %rd3;\n"
4166 " setp.lt.u32 %p3, %r6, %r7;\n"
4167 " @%p3 bra $Lt_38_14850;\n"
4168 " bra.uni $Lt_38_14338;\n"
4169 "$Lt_38_16898:\n"
4170 " mov.f32 %f1, 0f00000000; // 0\n"
4171 "$Lt_38_14338:\n"
4172 " .loc 3 71 0\n"
4173 " mov.u64 %rd5, __smem;\n"
4174 " cvt.u64.u32 %rd6, %r3;\n"
4175 " mul.wide.u32 %rd7, %r3, 4;\n"
4176 " add.u64 %rd8, %rd5, %rd7;\n"
4177 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4178 " .loc 3 72 0\n"
4179 " bar.sync 0;\n"
4180 " mov.u32 %r11, 31;\n"
4181 " setp.gt.u32 %p4, %r3, %r11;\n"
4182 " @%p4 bra $Lt_38_15874;\n"
4183 " .loc 3 86 0\n"
4184 " ld.volatile.shared.f32 %f4, [%rd8+16];\n"
4185 " add.f32 %f5, %f4, %f1;\n"
4186 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4187 " .loc 3 87 0\n"
4188 " ld.volatile.shared.f32 %f6, [%rd8+8];\n"
4189 " add.f32 %f7, %f6, %f5;\n"
4190 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4191 " .loc 3 88 0\n"
4192 " ld.volatile.shared.f32 %f8, [%rd8+4];\n"
4193 " add.f32 %f9, %f8, %f7;\n"
4194 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4195 "$Lt_38_15874:\n"
4196 " .loc 3 195 0\n"
4197 " mov.u32 %r12, 0;\n"
4198 " setp.ne.u32 %p5, %r3, %r12;\n"
4199 " @%p5 bra $Lt_38_16386;\n"
4200 " .loc 3 199 0\n"
4201 " ld.shared.f32 %f10, [__smem+0];\n"
4202 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_8_false_g_odata];\n"
4203 " cvt.u64.u32 %rd10, %r1;\n"
4204 " mul.wide.u32 %rd11, %r1, 4;\n"
4205 " add.u64 %rd12, %rd9, %rd11;\n"
4206 " st.global.f32 [%rd12+0], %f10;\n"
4207 "$Lt_38_16386:\n"
4208 " .loc 3 569 0\n"
4209 " exit;\n"
4210 "$LDWend_reduce_uchar_8_false:\n"
4211 " } // reduce_uchar_8_false\n"
4212 "\n"
4213 " .entry reduce_uchar_16_false (\n"
4214 " .param .u64 __cudaparm_reduce_uchar_16_false_g_idata,\n"
4215 " .param .u64 __cudaparm_reduce_uchar_16_false_g_odata,\n"
4216 " .param .u32 __cudaparm_reduce_uchar_16_false_n)\n"
4217 " {\n"
4218 " .reg .u16 %rh<3>;\n"
4219 " .reg .u32 %r<14>;\n"
4220 " .reg .u64 %rd<14>;\n"
4221 " .reg .f32 %f<14>;\n"
4222 " .reg .pred %p<7>;\n"
4223 " .loc 3 571 0\n"
4224 "$LDWbegin_reduce_uchar_16_false:\n"
4225 " .loc 3 181 0\n"
4226 " cvt.u32.u16 %r1, %ctaid.x;\n"
4227 " mul24.lo.u32 %r2, %r1, 32;\n"
4228 " cvt.u32.u16 %r3, %tid.x;\n"
4229 " add.u32 %r4, %r2, %r3;\n"
4230 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_16_false_n];\n"
4231 " setp.ge.u32 %p1, %r4, %r5;\n"
4232 " @%p1 bra $Lt_39_16642;\n"
4233 " add.u32 %r6, %r4, 16;\n"
4234 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_16_false_n];\n"
4235 " add.u32 %r7, %r5, 16;\n"
4236 " mov.u16 %rh1, %nctaid.x;\n"
4237 " mul.wide.u16 %r8, %rh1, 32;\n"
4238 " cvt.u64.u32 %rd1, %r4;\n"
4239 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_16_false_g_idata];\n"
4240 " add.u64 %rd3, %rd1, %rd2;\n"
4241 " cvt.s64.u32 %rd4, %r8;\n"
4242 " mov.f32 %f1, 0f00000000; // 0\n"
4243 "$Lt_39_14594:\n"
4244 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4245 " .loc 3 188 0\n"
4246 " ld.global.u8 %r9, [%rd3+0];\n"
4247 " cvt.rn.f32.u32 %f2, %r9;\n"
4248 " add.f32 %f1, %f2, %f1;\n"
4249 " .loc 3 181 0\n"
4250 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_16_false_n];\n"
4251 " .loc 3 188 0\n"
4252 " setp.ge.u32 %p2, %r6, %r5;\n"
4253 " @%p2 bra $Lt_39_14850;\n"
4254 " //<loop> Part of loop body line 181, head labeled $Lt_39_14594\n"
4255 " .loc 3 191 0\n"
4256 " ld.global.u8 %r10, [%rd3+16];\n"
4257 " cvt.rn.f32.u32 %f3, %r10;\n"
4258 " add.f32 %f1, %f3, %f1;\n"
4259 "$Lt_39_14850:\n"
4260 " //<loop> Part of loop body line 181, head labeled $Lt_39_14594\n"
4261 " add.u32 %r6, %r6, %r8;\n"
4262 " add.u64 %rd3, %rd4, %rd3;\n"
4263 " setp.lt.u32 %p3, %r6, %r7;\n"
4264 " @%p3 bra $Lt_39_14594;\n"
4265 " bra.uni $Lt_39_14082;\n"
4266 "$Lt_39_16642:\n"
4267 " mov.f32 %f1, 0f00000000; // 0\n"
4268 "$Lt_39_14082:\n"
4269 " .loc 3 71 0\n"
4270 " mov.u64 %rd5, __smem;\n"
4271 " cvt.u64.u32 %rd6, %r3;\n"
4272 " mul.wide.u32 %rd7, %r3, 4;\n"
4273 " add.u64 %rd8, %rd5, %rd7;\n"
4274 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4275 " .loc 3 72 0\n"
4276 " bar.sync 0;\n"
4277 " mov.u32 %r11, 31;\n"
4278 " setp.gt.u32 %p4, %r3, %r11;\n"
4279 " @%p4 bra $Lt_39_15618;\n"
4280 " .loc 3 85 0\n"
4281 " ld.volatile.shared.f32 %f4, [%rd8+32];\n"
4282 " add.f32 %f5, %f4, %f1;\n"
4283 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4284 " .loc 3 86 0\n"
4285 " ld.volatile.shared.f32 %f6, [%rd8+16];\n"
4286 " add.f32 %f7, %f6, %f5;\n"
4287 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4288 " .loc 3 87 0\n"
4289 " ld.volatile.shared.f32 %f8, [%rd8+8];\n"
4290 " add.f32 %f9, %f8, %f7;\n"
4291 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4292 " .loc 3 88 0\n"
4293 " ld.volatile.shared.f32 %f10, [%rd8+4];\n"
4294 " add.f32 %f11, %f10, %f9;\n"
4295 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4296 "$Lt_39_15618:\n"
4297 " .loc 3 195 0\n"
4298 " mov.u32 %r12, 0;\n"
4299 " setp.ne.u32 %p5, %r3, %r12;\n"
4300 " @%p5 bra $Lt_39_16130;\n"
4301 " .loc 3 199 0\n"
4302 " ld.shared.f32 %f12, [__smem+0];\n"
4303 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_16_false_g_odata];\n"
4304 " cvt.u64.u32 %rd10, %r1;\n"
4305 " mul.wide.u32 %rd11, %r1, 4;\n"
4306 " add.u64 %rd12, %rd9, %rd11;\n"
4307 " st.global.f32 [%rd12+0], %f12;\n"
4308 "$Lt_39_16130:\n"
4309 " .loc 3 574 0\n"
4310 " exit;\n"
4311 "$LDWend_reduce_uchar_16_false:\n"
4312 " } // reduce_uchar_16_false\n"
4313 "\n"
4314 " .entry reduce_uchar_32_false (\n"
4315 " .param .u64 __cudaparm_reduce_uchar_32_false_g_idata,\n"
4316 " .param .u64 __cudaparm_reduce_uchar_32_false_g_odata,\n"
4317 " .param .u32 __cudaparm_reduce_uchar_32_false_n)\n"
4318 " {\n"
4319 " .reg .u16 %rh<3>;\n"
4320 " .reg .u32 %r<14>;\n"
4321 " .reg .u64 %rd<14>;\n"
4322 " .reg .f32 %f<16>;\n"
4323 " .reg .pred %p<7>;\n"
4324 " .loc 3 576 0\n"
4325 "$LDWbegin_reduce_uchar_32_false:\n"
4326 " .loc 3 181 0\n"
4327 " cvt.u32.u16 %r1, %ctaid.x;\n"
4328 " mul24.lo.u32 %r2, %r1, 64;\n"
4329 " cvt.u32.u16 %r3, %tid.x;\n"
4330 " add.u32 %r4, %r2, %r3;\n"
4331 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_32_false_n];\n"
4332 " setp.ge.u32 %p1, %r4, %r5;\n"
4333 " @%p1 bra $Lt_40_16386;\n"
4334 " add.u32 %r6, %r4, 32;\n"
4335 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_32_false_n];\n"
4336 " add.u32 %r7, %r5, 32;\n"
4337 " mov.u16 %rh1, %nctaid.x;\n"
4338 " mul.wide.u16 %r8, %rh1, 64;\n"
4339 " cvt.u64.u32 %rd1, %r4;\n"
4340 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_32_false_g_idata];\n"
4341 " add.u64 %rd3, %rd1, %rd2;\n"
4342 " cvt.s64.u32 %rd4, %r8;\n"
4343 " mov.f32 %f1, 0f00000000; // 0\n"
4344 "$Lt_40_14338:\n"
4345 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4346 " .loc 3 188 0\n"
4347 " ld.global.u8 %r9, [%rd3+0];\n"
4348 " cvt.rn.f32.u32 %f2, %r9;\n"
4349 " add.f32 %f1, %f2, %f1;\n"
4350 " .loc 3 181 0\n"
4351 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_32_false_n];\n"
4352 " .loc 3 188 0\n"
4353 " setp.ge.u32 %p2, %r6, %r5;\n"
4354 " @%p2 bra $Lt_40_14594;\n"
4355 " //<loop> Part of loop body line 181, head labeled $Lt_40_14338\n"
4356 " .loc 3 191 0\n"
4357 " ld.global.u8 %r10, [%rd3+32];\n"
4358 " cvt.rn.f32.u32 %f3, %r10;\n"
4359 " add.f32 %f1, %f3, %f1;\n"
4360 "$Lt_40_14594:\n"
4361 " //<loop> Part of loop body line 181, head labeled $Lt_40_14338\n"
4362 " add.u32 %r6, %r6, %r8;\n"
4363 " add.u64 %rd3, %rd4, %rd3;\n"
4364 " setp.lt.u32 %p3, %r6, %r7;\n"
4365 " @%p3 bra $Lt_40_14338;\n"
4366 " bra.uni $Lt_40_13826;\n"
4367 "$Lt_40_16386:\n"
4368 " mov.f32 %f1, 0f00000000; // 0\n"
4369 "$Lt_40_13826:\n"
4370 " .loc 3 71 0\n"
4371 " mov.u64 %rd5, __smem;\n"
4372 " cvt.u64.u32 %rd6, %r3;\n"
4373 " mul.wide.u32 %rd7, %r3, 4;\n"
4374 " add.u64 %rd8, %rd5, %rd7;\n"
4375 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4376 " .loc 3 72 0\n"
4377 " bar.sync 0;\n"
4378 " mov.u32 %r11, 31;\n"
4379 " setp.gt.u32 %p4, %r3, %r11;\n"
4380 " @%p4 bra $Lt_40_15362;\n"
4381 " .loc 3 84 0\n"
4382 " ld.volatile.shared.f32 %f4, [%rd8+64];\n"
4383 " add.f32 %f5, %f4, %f1;\n"
4384 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4385 " .loc 3 85 0\n"
4386 " ld.volatile.shared.f32 %f6, [%rd8+32];\n"
4387 " add.f32 %f7, %f6, %f5;\n"
4388 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4389 " .loc 3 86 0\n"
4390 " ld.volatile.shared.f32 %f8, [%rd8+16];\n"
4391 " add.f32 %f9, %f8, %f7;\n"
4392 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4393 " .loc 3 87 0\n"
4394 " ld.volatile.shared.f32 %f10, [%rd8+8];\n"
4395 " add.f32 %f11, %f10, %f9;\n"
4396 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4397 " .loc 3 88 0\n"
4398 " ld.volatile.shared.f32 %f12, [%rd8+4];\n"
4399 " add.f32 %f13, %f12, %f11;\n"
4400 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
4401 "$Lt_40_15362:\n"
4402 " .loc 3 195 0\n"
4403 " mov.u32 %r12, 0;\n"
4404 " setp.ne.u32 %p5, %r3, %r12;\n"
4405 " @%p5 bra $Lt_40_15874;\n"
4406 " .loc 3 199 0\n"
4407 " ld.shared.f32 %f14, [__smem+0];\n"
4408 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_32_false_g_odata];\n"
4409 " cvt.u64.u32 %rd10, %r1;\n"
4410 " mul.wide.u32 %rd11, %r1, 4;\n"
4411 " add.u64 %rd12, %rd9, %rd11;\n"
4412 " st.global.f32 [%rd12+0], %f14;\n"
4413 "$Lt_40_15874:\n"
4414 " .loc 3 579 0\n"
4415 " exit;\n"
4416 "$LDWend_reduce_uchar_32_false:\n"
4417 " } // reduce_uchar_32_false\n"
4418 "\n"
4419 " .entry reduce_uchar_64_false (\n"
4420 " .param .u64 __cudaparm_reduce_uchar_64_false_g_idata,\n"
4421 " .param .u64 __cudaparm_reduce_uchar_64_false_g_odata,\n"
4422 " .param .u32 __cudaparm_reduce_uchar_64_false_n)\n"
4423 " {\n"
4424 " .reg .u16 %rh<3>;\n"
4425 " .reg .u32 %r<14>;\n"
4426 " .reg .u64 %rd<14>;\n"
4427 " .reg .f32 %f<18>;\n"
4428 " .reg .pred %p<7>;\n"
4429 " .loc 3 581 0\n"
4430 "$LDWbegin_reduce_uchar_64_false:\n"
4431 " .loc 3 181 0\n"
4432 " cvt.u32.u16 %r1, %ctaid.x;\n"
4433 " mul24.lo.u32 %r2, %r1, 128;\n"
4434 " cvt.u32.u16 %r3, %tid.x;\n"
4435 " add.u32 %r4, %r2, %r3;\n"
4436 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_64_false_n];\n"
4437 " setp.ge.u32 %p1, %r4, %r5;\n"
4438 " @%p1 bra $Lt_41_16130;\n"
4439 " add.u32 %r6, %r4, 64;\n"
4440 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_64_false_n];\n"
4441 " add.u32 %r7, %r5, 64;\n"
4442 " mov.u16 %rh1, %nctaid.x;\n"
4443 " mul.wide.u16 %r8, %rh1, 128;\n"
4444 " cvt.u64.u32 %rd1, %r4;\n"
4445 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_64_false_g_idata];\n"
4446 " add.u64 %rd3, %rd1, %rd2;\n"
4447 " cvt.s64.u32 %rd4, %r8;\n"
4448 " mov.f32 %f1, 0f00000000; // 0\n"
4449 "$Lt_41_14082:\n"
4450 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4451 " .loc 3 188 0\n"
4452 " ld.global.u8 %r9, [%rd3+0];\n"
4453 " cvt.rn.f32.u32 %f2, %r9;\n"
4454 " add.f32 %f1, %f2, %f1;\n"
4455 " .loc 3 181 0\n"
4456 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_64_false_n];\n"
4457 " .loc 3 188 0\n"
4458 " setp.ge.u32 %p2, %r6, %r5;\n"
4459 " @%p2 bra $Lt_41_14338;\n"
4460 " //<loop> Part of loop body line 181, head labeled $Lt_41_14082\n"
4461 " .loc 3 191 0\n"
4462 " ld.global.u8 %r10, [%rd3+64];\n"
4463 " cvt.rn.f32.u32 %f3, %r10;\n"
4464 " add.f32 %f1, %f3, %f1;\n"
4465 "$Lt_41_14338:\n"
4466 " //<loop> Part of loop body line 181, head labeled $Lt_41_14082\n"
4467 " add.u32 %r6, %r6, %r8;\n"
4468 " add.u64 %rd3, %rd4, %rd3;\n"
4469 " setp.lt.u32 %p3, %r6, %r7;\n"
4470 " @%p3 bra $Lt_41_14082;\n"
4471 " bra.uni $Lt_41_13570;\n"
4472 "$Lt_41_16130:\n"
4473 " mov.f32 %f1, 0f00000000; // 0\n"
4474 "$Lt_41_13570:\n"
4475 " .loc 3 71 0\n"
4476 " mov.u64 %rd5, __smem;\n"
4477 " cvt.u64.u32 %rd6, %r3;\n"
4478 " mul.wide.u32 %rd7, %r3, 4;\n"
4479 " add.u64 %rd8, %rd5, %rd7;\n"
4480 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4481 " .loc 3 72 0\n"
4482 " bar.sync 0;\n"
4483 " mov.u32 %r11, 31;\n"
4484 " setp.gt.u32 %p4, %r3, %r11;\n"
4485 " @%p4 bra $Lt_41_15106;\n"
4486 " .loc 3 83 0\n"
4487 " ld.volatile.shared.f32 %f4, [%rd8+128];\n"
4488 " add.f32 %f5, %f4, %f1;\n"
4489 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4490 " .loc 3 84 0\n"
4491 " ld.volatile.shared.f32 %f6, [%rd8+64];\n"
4492 " add.f32 %f7, %f6, %f5;\n"
4493 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4494 " .loc 3 85 0\n"
4495 " ld.volatile.shared.f32 %f8, [%rd8+32];\n"
4496 " add.f32 %f9, %f8, %f7;\n"
4497 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4498 " .loc 3 86 0\n"
4499 " ld.volatile.shared.f32 %f10, [%rd8+16];\n"
4500 " add.f32 %f11, %f10, %f9;\n"
4501 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4502 " .loc 3 87 0\n"
4503 " ld.volatile.shared.f32 %f12, [%rd8+8];\n"
4504 " add.f32 %f13, %f12, %f11;\n"
4505 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
4506 " .loc 3 88 0\n"
4507 " ld.volatile.shared.f32 %f14, [%rd8+4];\n"
4508 " add.f32 %f15, %f14, %f13;\n"
4509 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
4510 "$Lt_41_15106:\n"
4511 " .loc 3 195 0\n"
4512 " mov.u32 %r12, 0;\n"
4513 " setp.ne.u32 %p5, %r3, %r12;\n"
4514 " @%p5 bra $Lt_41_15618;\n"
4515 " .loc 3 199 0\n"
4516 " ld.shared.f32 %f16, [__smem+0];\n"
4517 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_64_false_g_odata];\n"
4518 " cvt.u64.u32 %rd10, %r1;\n"
4519 " mul.wide.u32 %rd11, %r1, 4;\n"
4520 " add.u64 %rd12, %rd9, %rd11;\n"
4521 " st.global.f32 [%rd12+0], %f16;\n"
4522 "$Lt_41_15618:\n"
4523 " .loc 3 584 0\n"
4524 " exit;\n"
4525 "$LDWend_reduce_uchar_64_false:\n"
4526 " } // reduce_uchar_64_false\n"
4527 "\n"
4528 " .entry reduce_uchar_128_false (\n"
4529 " .param .u64 __cudaparm_reduce_uchar_128_false_g_idata,\n"
4530 " .param .u64 __cudaparm_reduce_uchar_128_false_g_odata,\n"
4531 " .param .u32 __cudaparm_reduce_uchar_128_false_n)\n"
4532 " {\n"
4533 " .reg .u16 %rh<3>;\n"
4534 " .reg .u32 %r<15>;\n"
4535 " .reg .u64 %rd<14>;\n"
4536 " .reg .f32 %f<20>;\n"
4537 " .reg .pred %p<8>;\n"
4538 " .loc 3 586 0\n"
4539 "$LDWbegin_reduce_uchar_128_false:\n"
4540 " .loc 3 181 0\n"
4541 " cvt.u32.u16 %r1, %ctaid.x;\n"
4542 " mul.lo.u32 %r2, %r1, 256;\n"
4543 " cvt.u32.u16 %r3, %tid.x;\n"
4544 " add.u32 %r4, %r2, %r3;\n"
4545 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_128_false_n];\n"
4546 " setp.ge.u32 %p1, %r4, %r5;\n"
4547 " @%p1 bra $Lt_42_16386;\n"
4548 " add.u32 %r6, %r4, 128;\n"
4549 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_128_false_n];\n"
4550 " add.u32 %r7, %r5, 128;\n"
4551 " mov.u16 %rh1, %nctaid.x;\n"
4552 " mul.wide.u16 %r8, %rh1, 256;\n"
4553 " cvt.u64.u32 %rd1, %r4;\n"
4554 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_128_false_g_idata];\n"
4555 " add.u64 %rd3, %rd1, %rd2;\n"
4556 " cvt.s64.u32 %rd4, %r8;\n"
4557 " mov.f32 %f1, 0f00000000; // 0\n"
4558 "$Lt_42_13826:\n"
4559 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4560 " .loc 3 188 0\n"
4561 " ld.global.u8 %r9, [%rd3+0];\n"
4562 " cvt.rn.f32.u32 %f2, %r9;\n"
4563 " add.f32 %f1, %f2, %f1;\n"
4564 " .loc 3 181 0\n"
4565 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_128_false_n];\n"
4566 " .loc 3 188 0\n"
4567 " setp.ge.u32 %p2, %r6, %r5;\n"
4568 " @%p2 bra $Lt_42_14082;\n"
4569 " //<loop> Part of loop body line 181, head labeled $Lt_42_13826\n"
4570 " .loc 3 191 0\n"
4571 " ld.global.u8 %r10, [%rd3+128];\n"
4572 " cvt.rn.f32.u32 %f3, %r10;\n"
4573 " add.f32 %f1, %f3, %f1;\n"
4574 "$Lt_42_14082:\n"
4575 " //<loop> Part of loop body line 181, head labeled $Lt_42_13826\n"
4576 " add.u32 %r6, %r6, %r8;\n"
4577 " add.u64 %rd3, %rd4, %rd3;\n"
4578 " setp.lt.u32 %p3, %r6, %r7;\n"
4579 " @%p3 bra $Lt_42_13826;\n"
4580 " bra.uni $Lt_42_13314;\n"
4581 "$Lt_42_16386:\n"
4582 " mov.f32 %f1, 0f00000000; // 0\n"
4583 "$Lt_42_13314:\n"
4584 " .loc 3 195 0\n"
4585 " mov.f32 %f4, %f1;\n"
4586 " mov.f32 %f5, %f4;\n"
4587 " .loc 3 71 0\n"
4588 " mov.u64 %rd5, __smem;\n"
4589 " cvt.u64.u32 %rd6, %r3;\n"
4590 " mul.wide.u32 %rd7, %r3, 4;\n"
4591 " add.u64 %rd8, %rd5, %rd7;\n"
4592 " st.volatile.shared.f32 [%rd8+0], %f4;\n"
4593 " .loc 3 72 0\n"
4594 " bar.sync 0;\n"
4595 " mov.u32 %r11, 63;\n"
4596 " setp.gt.u32 %p4, %r3, %r11;\n"
4597 " @%p4 bra $Lt_42_14850;\n"
4598 " .loc 3 77 0\n"
4599 " ld.volatile.shared.f32 %f6, [%rd8+256];\n"
4600 " add.f32 %f5, %f6, %f4;\n"
4601 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4602 "$Lt_42_14850:\n"
4603 " bar.sync 0;\n"
4604 " mov.u32 %r12, 31;\n"
4605 " setp.gt.u32 %p5, %r3, %r12;\n"
4606 " @%p5 bra $Lt_42_15362;\n"
4607 " .loc 3 83 0\n"
4608 " ld.volatile.shared.f32 %f7, [%rd8+128];\n"
4609 " add.f32 %f8, %f7, %f5;\n"
4610 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
4611 " .loc 3 84 0\n"
4612 " ld.volatile.shared.f32 %f9, [%rd8+64];\n"
4613 " add.f32 %f10, %f9, %f8;\n"
4614 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
4615 " .loc 3 85 0\n"
4616 " ld.volatile.shared.f32 %f11, [%rd8+32];\n"
4617 " add.f32 %f12, %f11, %f10;\n"
4618 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
4619 " .loc 3 86 0\n"
4620 " ld.volatile.shared.f32 %f13, [%rd8+16];\n"
4621 " add.f32 %f14, %f13, %f12;\n"
4622 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
4623 " .loc 3 87 0\n"
4624 " ld.volatile.shared.f32 %f15, [%rd8+8];\n"
4625 " add.f32 %f16, %f15, %f14;\n"
4626 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
4627 " .loc 3 88 0\n"
4628 " ld.volatile.shared.f32 %f17, [%rd8+4];\n"
4629 " add.f32 %f5, %f17, %f16;\n"
4630 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4631 "$Lt_42_15362:\n"
4632 " .loc 3 195 0\n"
4633 " mov.u32 %r13, 0;\n"
4634 " setp.ne.u32 %p6, %r3, %r13;\n"
4635 " @%p6 bra $Lt_42_15874;\n"
4636 " .loc 3 199 0\n"
4637 " ld.shared.f32 %f18, [__smem+0];\n"
4638 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_128_false_g_odata];\n"
4639 " cvt.u64.u32 %rd10, %r1;\n"
4640 " mul.wide.u32 %rd11, %r1, 4;\n"
4641 " add.u64 %rd12, %rd9, %rd11;\n"
4642 " st.global.f32 [%rd12+0], %f18;\n"
4643 "$Lt_42_15874:\n"
4644 " .loc 3 589 0\n"
4645 " exit;\n"
4646 "$LDWend_reduce_uchar_128_false:\n"
4647 " } // reduce_uchar_128_false\n"
4648 "\n"
4649 " .entry reduce_uchar_256_false (\n"
4650 " .param .u64 __cudaparm_reduce_uchar_256_false_g_idata,\n"
4651 " .param .u64 __cudaparm_reduce_uchar_256_false_g_odata,\n"
4652 " .param .u32 __cudaparm_reduce_uchar_256_false_n)\n"
4653 " {\n"
4654 " .reg .u16 %rh<3>;\n"
4655 " .reg .u32 %r<16>;\n"
4656 " .reg .u64 %rd<14>;\n"
4657 " .reg .f32 %f<21>;\n"
4658 " .reg .pred %p<9>;\n"
4659 " .loc 3 591 0\n"
4660 "$LDWbegin_reduce_uchar_256_false:\n"
4661 " .loc 3 181 0\n"
4662 " cvt.u32.u16 %r1, %ctaid.x;\n"
4663 " mul.lo.u32 %r2, %r1, 512;\n"
4664 " cvt.u32.u16 %r3, %tid.x;\n"
4665 " add.u32 %r4, %r2, %r3;\n"
4666 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_256_false_n];\n"
4667 " setp.ge.u32 %p1, %r4, %r5;\n"
4668 " @%p1 bra $Lt_43_16642;\n"
4669 " add.u32 %r6, %r4, 256;\n"
4670 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_256_false_n];\n"
4671 " add.u32 %r7, %r5, 256;\n"
4672 " mov.u16 %rh1, %nctaid.x;\n"
4673 " mul.wide.u16 %r8, %rh1, 512;\n"
4674 " cvt.u64.u32 %rd1, %r4;\n"
4675 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_256_false_g_idata];\n"
4676 " add.u64 %rd3, %rd1, %rd2;\n"
4677 " cvt.s64.u32 %rd4, %r8;\n"
4678 " mov.f32 %f1, 0f00000000; // 0\n"
4679 "$Lt_43_13570:\n"
4680 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4681 " .loc 3 188 0\n"
4682 " ld.global.u8 %r9, [%rd3+0];\n"
4683 " cvt.rn.f32.u32 %f2, %r9;\n"
4684 " add.f32 %f1, %f2, %f1;\n"
4685 " .loc 3 181 0\n"
4686 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_256_false_n];\n"
4687 " .loc 3 188 0\n"
4688 " setp.ge.u32 %p2, %r6, %r5;\n"
4689 " @%p2 bra $Lt_43_13826;\n"
4690 " //<loop> Part of loop body line 181, head labeled $Lt_43_13570\n"
4691 " .loc 3 191 0\n"
4692 " ld.global.u8 %r10, [%rd3+256];\n"
4693 " cvt.rn.f32.u32 %f3, %r10;\n"
4694 " add.f32 %f1, %f3, %f1;\n"
4695 "$Lt_43_13826:\n"
4696 " //<loop> Part of loop body line 181, head labeled $Lt_43_13570\n"
4697 " add.u32 %r6, %r6, %r8;\n"
4698 " add.u64 %rd3, %rd4, %rd3;\n"
4699 " setp.lt.u32 %p3, %r6, %r7;\n"
4700 " @%p3 bra $Lt_43_13570;\n"
4701 " bra.uni $Lt_43_13058;\n"
4702 "$Lt_43_16642:\n"
4703 " mov.f32 %f1, 0f00000000; // 0\n"
4704 "$Lt_43_13058:\n"
4705 " .loc 3 195 0\n"
4706 " mov.f32 %f4, %f1;\n"
4707 " mov.f32 %f5, %f4;\n"
4708 " .loc 3 71 0\n"
4709 " mov.u64 %rd5, __smem;\n"
4710 " cvt.u64.u32 %rd6, %r3;\n"
4711 " mul.wide.u32 %rd7, %r3, 4;\n"
4712 " add.u64 %rd8, %rd5, %rd7;\n"
4713 " st.volatile.shared.f32 [%rd8+0], %f4;\n"
4714 " .loc 3 72 0\n"
4715 " bar.sync 0;\n"
4716 " mov.u32 %r11, 127;\n"
4717 " setp.gt.u32 %p4, %r3, %r11;\n"
4718 " @%p4 bra $Lt_43_14594;\n"
4719 " .loc 3 76 0\n"
4720 " ld.volatile.shared.f32 %f6, [%rd8+512];\n"
4721 " add.f32 %f5, %f6, %f4;\n"
4722 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4723 "$Lt_43_14594:\n"
4724 " bar.sync 0;\n"
4725 " mov.u32 %r12, 63;\n"
4726 " setp.gt.u32 %p5, %r3, %r12;\n"
4727 " @%p5 bra $Lt_43_15106;\n"
4728 " .loc 3 77 0\n"
4729 " ld.volatile.shared.f32 %f7, [%rd8+256];\n"
4730 " add.f32 %f5, %f7, %f5;\n"
4731 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4732 "$Lt_43_15106:\n"
4733 " bar.sync 0;\n"
4734 " mov.u32 %r13, 31;\n"
4735 " setp.gt.u32 %p6, %r3, %r13;\n"
4736 " @%p6 bra $Lt_43_15618;\n"
4737 " .loc 3 83 0\n"
4738 " ld.volatile.shared.f32 %f8, [%rd8+128];\n"
4739 " add.f32 %f9, %f8, %f5;\n"
4740 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4741 " .loc 3 84 0\n"
4742 " ld.volatile.shared.f32 %f10, [%rd8+64];\n"
4743 " add.f32 %f11, %f10, %f9;\n"
4744 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4745 " .loc 3 85 0\n"
4746 " ld.volatile.shared.f32 %f12, [%rd8+32];\n"
4747 " add.f32 %f13, %f12, %f11;\n"
4748 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
4749 " .loc 3 86 0\n"
4750 " ld.volatile.shared.f32 %f14, [%rd8+16];\n"
4751 " add.f32 %f15, %f14, %f13;\n"
4752 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
4753 " .loc 3 87 0\n"
4754 " ld.volatile.shared.f32 %f16, [%rd8+8];\n"
4755 " add.f32 %f17, %f16, %f15;\n"
4756 " st.volatile.shared.f32 [%rd8+0], %f17;\n"
4757 " .loc 3 88 0\n"
4758 " ld.volatile.shared.f32 %f18, [%rd8+4];\n"
4759 " add.f32 %f5, %f18, %f17;\n"
4760 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4761 "$Lt_43_15618:\n"
4762 " .loc 3 195 0\n"
4763 " mov.u32 %r14, 0;\n"
4764 " setp.ne.u32 %p7, %r3, %r14;\n"
4765 " @%p7 bra $Lt_43_16130;\n"
4766 " .loc 3 199 0\n"
4767 " ld.shared.f32 %f19, [__smem+0];\n"
4768 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_256_false_g_odata];\n"
4769 " cvt.u64.u32 %rd10, %r1;\n"
4770 " mul.wide.u32 %rd11, %r1, 4;\n"
4771 " add.u64 %rd12, %rd9, %rd11;\n"
4772 " st.global.f32 [%rd12+0], %f19;\n"
4773 "$Lt_43_16130:\n"
4774 " .loc 3 594 0\n"
4775 " exit;\n"
4776 "$LDWend_reduce_uchar_256_false:\n"
4777 " } // reduce_uchar_256_false\n"
4778 "\n"
4779 " .entry reduce_uchar_512_false (\n"
4780 " .param .u64 __cudaparm_reduce_uchar_512_false_g_idata,\n"
4781 " .param .u64 __cudaparm_reduce_uchar_512_false_g_odata,\n"
4782 " .param .u32 __cudaparm_reduce_uchar_512_false_n)\n"
4783 " {\n"
4784 " .reg .u16 %rh<3>;\n"
4785 " .reg .u32 %r<17>;\n"
4786 " .reg .u64 %rd<14>;\n"
4787 " .reg .f32 %f<22>;\n"
4788 " .reg .pred %p<10>;\n"
4789 " .loc 3 596 0\n"
4790 "$LDWbegin_reduce_uchar_512_false:\n"
4791 " .loc 3 181 0\n"
4792 " cvt.u32.u16 %r1, %ctaid.x;\n"
4793 " mul.lo.u32 %r2, %r1, 1024;\n"
4794 " cvt.u32.u16 %r3, %tid.x;\n"
4795 " add.u32 %r4, %r2, %r3;\n"
4796 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_512_false_n];\n"
4797 " setp.ge.u32 %p1, %r4, %r5;\n"
4798 " @%p1 bra $Lt_44_16898;\n"
4799 " add.u32 %r6, %r4, 512;\n"
4800 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_512_false_n];\n"
4801 " add.u32 %r7, %r5, 512;\n"
4802 " mov.u16 %rh1, %nctaid.x;\n"
4803 " mul.wide.u16 %r8, %rh1, 1024;\n"
4804 " cvt.u64.u32 %rd1, %r4;\n"
4805 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_512_false_g_idata];\n"
4806 " add.u64 %rd3, %rd1, %rd2;\n"
4807 " cvt.s64.u32 %rd4, %r8;\n"
4808 " mov.f32 %f1, 0f00000000; // 0\n"
4809 "$Lt_44_13314:\n"
4810 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4811 " .loc 3 188 0\n"
4812 " ld.global.u8 %r9, [%rd3+0];\n"
4813 " cvt.rn.f32.u32 %f2, %r9;\n"
4814 " add.f32 %f1, %f2, %f1;\n"
4815 " .loc 3 181 0\n"
4816 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_512_false_n];\n"
4817 " .loc 3 188 0\n"
4818 " setp.ge.u32 %p2, %r6, %r5;\n"
4819 " @%p2 bra $Lt_44_13570;\n"
4820 " //<loop> Part of loop body line 181, head labeled $Lt_44_13314\n"
4821 " .loc 3 191 0\n"
4822 " ld.global.u8 %r10, [%rd3+512];\n"
4823 " cvt.rn.f32.u32 %f3, %r10;\n"
4824 " add.f32 %f1, %f3, %f1;\n"
4825 "$Lt_44_13570:\n"
4826 " //<loop> Part of loop body line 181, head labeled $Lt_44_13314\n"
4827 " add.u32 %r6, %r6, %r8;\n"
4828 " add.u64 %rd3, %rd4, %rd3;\n"
4829 " setp.lt.u32 %p3, %r6, %r7;\n"
4830 " @%p3 bra $Lt_44_13314;\n"
4831 " bra.uni $Lt_44_12802;\n"
4832 "$Lt_44_16898:\n"
4833 " mov.f32 %f1, 0f00000000; // 0\n"
4834 "$Lt_44_12802:\n"
4835 " .loc 3 195 0\n"
4836 " mov.f32 %f4, %f1;\n"
4837 " mov.f32 %f5, %f4;\n"
4838 " .loc 3 71 0\n"
4839 " mov.u64 %rd5, __smem;\n"
4840 " cvt.u64.u32 %rd6, %r3;\n"
4841 " mul.wide.u32 %rd7, %r3, 4;\n"
4842 " add.u64 %rd8, %rd5, %rd7;\n"
4843 " st.volatile.shared.f32 [%rd8+0], %f4;\n"
4844 " .loc 3 72 0\n"
4845 " bar.sync 0;\n"
4846 " mov.u32 %r11, 255;\n"
4847 " setp.gt.u32 %p4, %r3, %r11;\n"
4848 " @%p4 bra $Lt_44_14338;\n"
4849 " .loc 3 75 0\n"
4850 " ld.volatile.shared.f32 %f6, [%rd8+1024];\n"
4851 " add.f32 %f5, %f6, %f4;\n"
4852 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4853 "$Lt_44_14338:\n"
4854 " bar.sync 0;\n"
4855 " mov.u32 %r12, 127;\n"
4856 " setp.gt.u32 %p5, %r3, %r12;\n"
4857 " @%p5 bra $Lt_44_14850;\n"
4858 " .loc 3 76 0\n"
4859 " ld.volatile.shared.f32 %f7, [%rd8+512];\n"
4860 " add.f32 %f5, %f7, %f5;\n"
4861 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4862 "$Lt_44_14850:\n"
4863 " bar.sync 0;\n"
4864 " mov.u32 %r13, 63;\n"
4865 " setp.gt.u32 %p6, %r3, %r13;\n"
4866 " @%p6 bra $Lt_44_15362;\n"
4867 " .loc 3 77 0\n"
4868 " ld.volatile.shared.f32 %f8, [%rd8+256];\n"
4869 " add.f32 %f5, %f8, %f5;\n"
4870 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4871 "$Lt_44_15362:\n"
4872 " bar.sync 0;\n"
4873 " mov.u32 %r14, 31;\n"
4874 " setp.gt.u32 %p7, %r3, %r14;\n"
4875 " @%p7 bra $Lt_44_15874;\n"
4876 " .loc 3 83 0\n"
4877 " ld.volatile.shared.f32 %f9, [%rd8+128];\n"
4878 " add.f32 %f10, %f9, %f5;\n"
4879 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
4880 " .loc 3 84 0\n"
4881 " ld.volatile.shared.f32 %f11, [%rd8+64];\n"
4882 " add.f32 %f12, %f11, %f10;\n"
4883 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
4884 " .loc 3 85 0\n"
4885 " ld.volatile.shared.f32 %f13, [%rd8+32];\n"
4886 " add.f32 %f14, %f13, %f12;\n"
4887 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
4888 " .loc 3 86 0\n"
4889 " ld.volatile.shared.f32 %f15, [%rd8+16];\n"
4890 " add.f32 %f16, %f15, %f14;\n"
4891 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
4892 " .loc 3 87 0\n"
4893 " ld.volatile.shared.f32 %f17, [%rd8+8];\n"
4894 " add.f32 %f18, %f17, %f16;\n"
4895 " st.volatile.shared.f32 [%rd8+0], %f18;\n"
4896 " .loc 3 88 0\n"
4897 " ld.volatile.shared.f32 %f19, [%rd8+4];\n"
4898 " add.f32 %f5, %f19, %f18;\n"
4899 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4900 "$Lt_44_15874:\n"
4901 " .loc 3 195 0\n"
4902 " mov.u32 %r15, 0;\n"
4903 " setp.ne.u32 %p8, %r3, %r15;\n"
4904 " @%p8 bra $Lt_44_16386;\n"
4905 " .loc 3 199 0\n"
4906 " ld.shared.f32 %f20, [__smem+0];\n"
4907 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_512_false_g_odata];\n"
4908 " cvt.u64.u32 %rd10, %r1;\n"
4909 " mul.wide.u32 %rd11, %r1, 4;\n"
4910 " add.u64 %rd12, %rd9, %rd11;\n"
4911 " st.global.f32 [%rd12+0], %f20;\n"
4912 "$Lt_44_16386:\n"
4913 " .loc 3 599 0\n"
4914 " exit;\n"
4915 "$LDWend_reduce_uchar_512_false:\n"
4916 " } // reduce_uchar_512_false\n"
4917 "\n"
4918 " .entry packed_float_reduce_1_false_false (\n"
4919 " .param .u64 __cudaparm_packed_float_reduce_1_false_false_g_idata,\n"
4920 " .param .u64 __cudaparm_packed_float_reduce_1_false_false_g_odata,\n"
4921 " .param .u32 __cudaparm_packed_float_reduce_1_false_false_n)\n"
4922 " {\n"
4923 " .reg .u16 %rh<7>;\n"
4924 " .reg .u32 %r<14>;\n"
4925 " .reg .u64 %rd<13>;\n"
4926 " .reg .f32 %f<4>;\n"
4927 " .reg .pred %p<5>;\n"
4928 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
4929 " .loc 3 722 0\n"
4930 "$LDWbegin_packed_float_reduce_1_false_false:\n"
4931 " .loc 3 637 0\n"
4932 " cvt.u32.u16 %r1, %ctaid.x;\n"
4933 " mul24.lo.u32 %r2, %r1, 2;\n"
4934 " cvt.u32.u16 %r3, %tid.x;\n"
4935 " add.u32 %r4, %r2, %r3;\n"
4936 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_false_n];\n"
4937 " setp.ge.u32 %p1, %r4, %r5;\n"
4938 " @%p1 bra $Lt_45_18178;\n"
4939 " mul.lo.u32 %r6, %r4, 4;\n"
4940 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_false_n];\n"
4941 " mul.lo.u32 %r7, %r5, 4;\n"
4942 " mov.u16 %rh1, %nctaid.x;\n"
4943 " mul.wide.u16 %r8, %rh1, 8;\n"
4944 " add.u32 %r9, %r6, 4;\n"
4945 " add.u32 %r10, %r7, 4;\n"
4946 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_false_g_idata];\n"
4947 "$Lt_45_17154:\n"
4948 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
4949 " .loc 3 677 0\n"
4950 " cvt.u8.u32 %r11, %r9;\n"
4951 " cvt.u64.u32 %rd2, %r11;\n"
4952 " .loc 3 637 0\n"
4953 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_false_g_idata];\n"
4954 " .loc 3 677 0\n"
4955 " add.u64 %rd3, %rd2, %rd1;\n"
4956 " ld.global.u8 %rh2, [%rd3+0];\n"
4957 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
4958 " .loc 3 678 0\n"
4959 " ld.global.u8 %rh3, [%rd3+1];\n"
4960 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
4961 " .loc 3 679 0\n"
4962 " ld.global.u8 %rh4, [%rd3+2];\n"
4963 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
4964 " .loc 3 680 0\n"
4965 " ld.global.u8 %rh5, [%rd3+3];\n"
4966 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
4967 " add.u32 %r9, %r8, %r9;\n"
4968 " setp.lt.u32 %p2, %r9, %r10;\n"
4969 " @%p2 bra $Lt_45_17154;\n"
4970 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
4971 " bra.uni $Lt_45_16642;\n"
4972 "$Lt_45_18178:\n"
4973 " mov.f32 %f1, 0f00000000; // 0\n"
4974 "$Lt_45_16642:\n"
4975 " .loc 3 692 0\n"
4976 " mov.u64 %rd4, __smem;\n"
4977 " cvt.u64.u32 %rd5, %r3;\n"
4978 " mul.wide.u32 %rd6, %r3, 4;\n"
4979 " add.u64 %rd7, %rd4, %rd6;\n"
4980 " st.shared.f32 [%rd7+0], %f1;\n"
4981 " .loc 3 693 0\n"
4982 " bar.sync 0;\n"
4983 " mov.u32 %r12, 0;\n"
4984 " setp.ne.u32 %p3, %r3, %r12;\n"
4985 " @%p3 bra $Lt_45_17666;\n"
4986 " .loc 3 719 0\n"
4987 " ld.shared.f32 %f2, [__smem+0];\n"
4988 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_false_false_g_odata];\n"
4989 " cvt.u64.u32 %rd9, %r1;\n"
4990 " mul.wide.u32 %rd10, %r1, 4;\n"
4991 " add.u64 %rd11, %rd8, %rd10;\n"
4992 " st.global.f32 [%rd11+0], %f2;\n"
4993 "$Lt_45_17666:\n"
4994 " .loc 3 723 0\n"
4995 " exit;\n"
4996 "$LDWend_packed_float_reduce_1_false_false:\n"
4997 " } // packed_float_reduce_1_false_false\n"
4998 "\n"
4999 " .entry packed_float_reduce_1_false_true (\n"
5000 " .param .u64 __cudaparm_packed_float_reduce_1_false_true_g_idata,\n"
5001 " .param .u64 __cudaparm_packed_float_reduce_1_false_true_g_odata,\n"
5002 " .param .u32 __cudaparm_packed_float_reduce_1_false_true_n)\n"
5003 " {\n"
5004 " .reg .u16 %rh<7>;\n"
5005 " .reg .u32 %r<14>;\n"
5006 " .reg .u64 %rd<13>;\n"
5007 " .reg .f32 %f<4>;\n"
5008 " .reg .pred %p<5>;\n"
5009 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5010 " .loc 3 724 0\n"
5011 "$LDWbegin_packed_float_reduce_1_false_true:\n"
5012 " .loc 3 637 0\n"
5013 " cvt.u32.u16 %r1, %ctaid.x;\n"
5014 " mul24.lo.u32 %r2, %r1, 2;\n"
5015 " cvt.u32.u16 %r3, %tid.x;\n"
5016 " add.u32 %r4, %r2, %r3;\n"
5017 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_true_n];\n"
5018 " setp.ge.u32 %p1, %r4, %r5;\n"
5019 " @%p1 bra $Lt_46_18178;\n"
5020 " mul.lo.u32 %r6, %r4, 4;\n"
5021 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_true_n];\n"
5022 " mul.lo.u32 %r7, %r5, 4;\n"
5023 " mov.u16 %rh1, %nctaid.x;\n"
5024 " mul.wide.u16 %r8, %rh1, 8;\n"
5025 " add.u32 %r9, %r6, 4;\n"
5026 " add.u32 %r10, %r7, 4;\n"
5027 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_true_g_idata];\n"
5028 "$Lt_46_17154:\n"
5029 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5030 " .loc 3 677 0\n"
5031 " cvt.u8.u32 %r11, %r9;\n"
5032 " cvt.u64.u32 %rd2, %r11;\n"
5033 " .loc 3 637 0\n"
5034 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_true_g_idata];\n"
5035 " .loc 3 677 0\n"
5036 " add.u64 %rd3, %rd2, %rd1;\n"
5037 " ld.global.u8 %rh2, [%rd3+0];\n"
5038 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5039 " .loc 3 678 0\n"
5040 " ld.global.u8 %rh3, [%rd3+1];\n"
5041 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5042 " .loc 3 679 0\n"
5043 " ld.global.u8 %rh4, [%rd3+2];\n"
5044 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5045 " .loc 3 680 0\n"
5046 " ld.global.u8 %rh5, [%rd3+3];\n"
5047 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5048 " add.u32 %r9, %r8, %r9;\n"
5049 " setp.lt.u32 %p2, %r9, %r10;\n"
5050 " @%p2 bra $Lt_46_17154;\n"
5051 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5052 " bra.uni $Lt_46_16642;\n"
5053 "$Lt_46_18178:\n"
5054 " mov.f32 %f1, 0f00000000; // 0\n"
5055 "$Lt_46_16642:\n"
5056 " .loc 3 692 0\n"
5057 " mov.u64 %rd4, __smem;\n"
5058 " cvt.u64.u32 %rd5, %r3;\n"
5059 " mul.wide.u32 %rd6, %r3, 4;\n"
5060 " add.u64 %rd7, %rd4, %rd6;\n"
5061 " st.shared.f32 [%rd7+0], %f1;\n"
5062 " .loc 3 693 0\n"
5063 " bar.sync 0;\n"
5064 " mov.u32 %r12, 0;\n"
5065 " setp.ne.u32 %p3, %r3, %r12;\n"
5066 " @%p3 bra $Lt_46_17666;\n"
5067 " .loc 3 719 0\n"
5068 " ld.shared.f32 %f2, [__smem+0];\n"
5069 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_false_true_g_odata];\n"
5070 " cvt.u64.u32 %rd9, %r1;\n"
5071 " mul.wide.u32 %rd10, %r1, 4;\n"
5072 " add.u64 %rd11, %rd8, %rd10;\n"
5073 " st.global.f32 [%rd11+0], %f2;\n"
5074 "$Lt_46_17666:\n"
5075 " .loc 3 725 0\n"
5076 " exit;\n"
5077 "$LDWend_packed_float_reduce_1_false_true:\n"
5078 " } // packed_float_reduce_1_false_true\n"
5079 "\n"
5080 " .entry packed_float_reduce_1_true_false (\n"
5081 " .param .u64 __cudaparm_packed_float_reduce_1_true_false_g_idata,\n"
5082 " .param .u64 __cudaparm_packed_float_reduce_1_true_false_g_odata,\n"
5083 " .param .u32 __cudaparm_packed_float_reduce_1_true_false_n)\n"
5084 " {\n"
5085 " .reg .u16 %rh<7>;\n"
5086 " .reg .u32 %r<14>;\n"
5087 " .reg .u64 %rd<13>;\n"
5088 " .reg .f32 %f<4>;\n"
5089 " .reg .pred %p<5>;\n"
5090 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5091 " .loc 3 726 0\n"
5092 "$LDWbegin_packed_float_reduce_1_true_false:\n"
5093 " .loc 3 637 0\n"
5094 " cvt.u32.u16 %r1, %ctaid.x;\n"
5095 " mul24.lo.u32 %r2, %r1, 2;\n"
5096 " cvt.u32.u16 %r3, %tid.x;\n"
5097 " add.u32 %r4, %r2, %r3;\n"
5098 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_false_n];\n"
5099 " setp.ge.u32 %p1, %r4, %r5;\n"
5100 " @%p1 bra $Lt_47_18178;\n"
5101 " mul.lo.u32 %r6, %r4, 4;\n"
5102 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_false_n];\n"
5103 " mul.lo.u32 %r7, %r5, 4;\n"
5104 " mov.u16 %rh1, %nctaid.x;\n"
5105 " mul.wide.u16 %r8, %rh1, 8;\n"
5106 " add.u32 %r9, %r6, 4;\n"
5107 " add.u32 %r10, %r7, 4;\n"
5108 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_false_g_idata];\n"
5109 "$Lt_47_17154:\n"
5110 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5111 " .loc 3 677 0\n"
5112 " cvt.u8.u32 %r11, %r9;\n"
5113 " cvt.u64.u32 %rd2, %r11;\n"
5114 " .loc 3 637 0\n"
5115 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_false_g_idata];\n"
5116 " .loc 3 677 0\n"
5117 " add.u64 %rd3, %rd2, %rd1;\n"
5118 " ld.global.u8 %rh2, [%rd3+0];\n"
5119 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5120 " .loc 3 678 0\n"
5121 " ld.global.u8 %rh3, [%rd3+1];\n"
5122 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5123 " .loc 3 679 0\n"
5124 " ld.global.u8 %rh4, [%rd3+2];\n"
5125 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5126 " .loc 3 680 0\n"
5127 " ld.global.u8 %rh5, [%rd3+3];\n"
5128 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5129 " add.u32 %r9, %r8, %r9;\n"
5130 " setp.lt.u32 %p2, %r9, %r10;\n"
5131 " @%p2 bra $Lt_47_17154;\n"
5132 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5133 " bra.uni $Lt_47_16642;\n"
5134 "$Lt_47_18178:\n"
5135 " mov.f32 %f1, 0f00000000; // 0\n"
5136 "$Lt_47_16642:\n"
5137 " .loc 3 692 0\n"
5138 " mov.u64 %rd4, __smem;\n"
5139 " cvt.u64.u32 %rd5, %r3;\n"
5140 " mul.wide.u32 %rd6, %r3, 4;\n"
5141 " add.u64 %rd7, %rd4, %rd6;\n"
5142 " st.shared.f32 [%rd7+0], %f1;\n"
5143 " .loc 3 693 0\n"
5144 " bar.sync 0;\n"
5145 " mov.u32 %r12, 0;\n"
5146 " setp.ne.u32 %p3, %r3, %r12;\n"
5147 " @%p3 bra $Lt_47_17666;\n"
5148 " .loc 3 719 0\n"
5149 " ld.shared.f32 %f2, [__smem+0];\n"
5150 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_true_false_g_odata];\n"
5151 " cvt.u64.u32 %rd9, %r1;\n"
5152 " mul.wide.u32 %rd10, %r1, 4;\n"
5153 " add.u64 %rd11, %rd8, %rd10;\n"
5154 " st.global.f32 [%rd11+0], %f2;\n"
5155 "$Lt_47_17666:\n"
5156 " .loc 3 727 0\n"
5157 " exit;\n"
5158 "$LDWend_packed_float_reduce_1_true_false:\n"
5159 " } // packed_float_reduce_1_true_false\n"
5160 "\n"
5161 " .entry packed_float_reduce_1_true_true (\n"
5162 " .param .u64 __cudaparm_packed_float_reduce_1_true_true_g_idata,\n"
5163 " .param .u64 __cudaparm_packed_float_reduce_1_true_true_g_odata,\n"
5164 " .param .u32 __cudaparm_packed_float_reduce_1_true_true_n)\n"
5165 " {\n"
5166 " .reg .u16 %rh<7>;\n"
5167 " .reg .u32 %r<14>;\n"
5168 " .reg .u64 %rd<13>;\n"
5169 " .reg .f32 %f<4>;\n"
5170 " .reg .pred %p<5>;\n"
5171 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5172 " .loc 3 728 0\n"
5173 "$LDWbegin_packed_float_reduce_1_true_true:\n"
5174 " .loc 3 637 0\n"
5175 " cvt.u32.u16 %r1, %ctaid.x;\n"
5176 " mul24.lo.u32 %r2, %r1, 2;\n"
5177 " cvt.u32.u16 %r3, %tid.x;\n"
5178 " add.u32 %r4, %r2, %r3;\n"
5179 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_true_n];\n"
5180 " setp.ge.u32 %p1, %r4, %r5;\n"
5181 " @%p1 bra $Lt_48_18178;\n"
5182 " mul.lo.u32 %r6, %r4, 4;\n"
5183 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_true_n];\n"
5184 " mul.lo.u32 %r7, %r5, 4;\n"
5185 " mov.u16 %rh1, %nctaid.x;\n"
5186 " mul.wide.u16 %r8, %rh1, 8;\n"
5187 " add.u32 %r9, %r6, 4;\n"
5188 " add.u32 %r10, %r7, 4;\n"
5189 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_true_g_idata];\n"
5190 "$Lt_48_17154:\n"
5191 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5192 " .loc 3 677 0\n"
5193 " cvt.u8.u32 %r11, %r9;\n"
5194 " cvt.u64.u32 %rd2, %r11;\n"
5195 " .loc 3 637 0\n"
5196 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_true_g_idata];\n"
5197 " .loc 3 677 0\n"
5198 " add.u64 %rd3, %rd2, %rd1;\n"
5199 " ld.global.u8 %rh2, [%rd3+0];\n"
5200 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5201 " .loc 3 678 0\n"
5202 " ld.global.u8 %rh3, [%rd3+1];\n"
5203 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5204 " .loc 3 679 0\n"
5205 " ld.global.u8 %rh4, [%rd3+2];\n"
5206 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5207 " .loc 3 680 0\n"
5208 " ld.global.u8 %rh5, [%rd3+3];\n"
5209 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5210 " add.u32 %r9, %r8, %r9;\n"
5211 " setp.lt.u32 %p2, %r9, %r10;\n"
5212 " @%p2 bra $Lt_48_17154;\n"
5213 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5214 " bra.uni $Lt_48_16642;\n"
5215 "$Lt_48_18178:\n"
5216 " mov.f32 %f1, 0f00000000; // 0\n"
5217 "$Lt_48_16642:\n"
5218 " .loc 3 692 0\n"
5219 " mov.u64 %rd4, __smem;\n"
5220 " cvt.u64.u32 %rd5, %r3;\n"
5221 " mul.wide.u32 %rd6, %r3, 4;\n"
5222 " add.u64 %rd7, %rd4, %rd6;\n"
5223 " st.shared.f32 [%rd7+0], %f1;\n"
5224 " .loc 3 693 0\n"
5225 " bar.sync 0;\n"
5226 " mov.u32 %r12, 0;\n"
5227 " setp.ne.u32 %p3, %r3, %r12;\n"
5228 " @%p3 bra $Lt_48_17666;\n"
5229 " .loc 3 719 0\n"
5230 " ld.shared.f32 %f2, [__smem+0];\n"
5231 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_true_true_g_odata];\n"
5232 " cvt.u64.u32 %rd9, %r1;\n"
5233 " mul.wide.u32 %rd10, %r1, 4;\n"
5234 " add.u64 %rd11, %rd8, %rd10;\n"
5235 " st.global.f32 [%rd11+0], %f2;\n"
5236 "$Lt_48_17666:\n"
5237 " .loc 3 729 0\n"
5238 " exit;\n"
5239 "$LDWend_packed_float_reduce_1_true_true:\n"
5240 " } // packed_float_reduce_1_true_true\n"
5241 "\n"
5242 " .entry packed_float_reduce_2_false_false (\n"
5243 " .param .u64 __cudaparm_packed_float_reduce_2_false_false_g_idata,\n"
5244 " .param .u64 __cudaparm_packed_float_reduce_2_false_false_g_odata,\n"
5245 " .param .u32 __cudaparm_packed_float_reduce_2_false_false_n)\n"
5246 " {\n"
5247 " .reg .u16 %rh<7>;\n"
5248 " .reg .u32 %r<15>;\n"
5249 " .reg .u64 %rd<13>;\n"
5250 " .reg .f32 %f<5>;\n"
5251 " .reg .pred %p<6>;\n"
5252 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5253 " .loc 3 731 0\n"
5254 "$LDWbegin_packed_float_reduce_2_false_false:\n"
5255 " .loc 3 637 0\n"
5256 " cvt.u32.u16 %r1, %ctaid.x;\n"
5257 " mul24.lo.u32 %r2, %r1, 4;\n"
5258 " cvt.u32.u16 %r3, %tid.x;\n"
5259 " add.u32 %r4, %r2, %r3;\n"
5260 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_false_n];\n"
5261 " setp.ge.u32 %p1, %r4, %r5;\n"
5262 " @%p1 bra $Lt_49_18434;\n"
5263 " mul.lo.u32 %r6, %r4, 4;\n"
5264 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_false_n];\n"
5265 " mul.lo.u32 %r7, %r5, 4;\n"
5266 " mov.u16 %rh1, %nctaid.x;\n"
5267 " mul.wide.u16 %r8, %rh1, 16;\n"
5268 " add.u32 %r9, %r6, 8;\n"
5269 " add.u32 %r10, %r7, 8;\n"
5270 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_false_g_idata];\n"
5271 "$Lt_49_16898:\n"
5272 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5273 " .loc 3 677 0\n"
5274 " cvt.u8.u32 %r11, %r9;\n"
5275 " cvt.u64.u32 %rd2, %r11;\n"
5276 " .loc 3 637 0\n"
5277 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_false_g_idata];\n"
5278 " .loc 3 677 0\n"
5279 " add.u64 %rd3, %rd2, %rd1;\n"
5280 " ld.global.u8 %rh2, [%rd3+0];\n"
5281 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5282 " .loc 3 678 0\n"
5283 " ld.global.u8 %rh3, [%rd3+1];\n"
5284 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5285 " .loc 3 679 0\n"
5286 " ld.global.u8 %rh4, [%rd3+2];\n"
5287 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5288 " .loc 3 680 0\n"
5289 " ld.global.u8 %rh5, [%rd3+3];\n"
5290 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5291 " add.u32 %r9, %r8, %r9;\n"
5292 " setp.lt.u32 %p2, %r9, %r10;\n"
5293 " @%p2 bra $Lt_49_16898;\n"
5294 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5295 " bra.uni $Lt_49_16386;\n"
5296 "$Lt_49_18434:\n"
5297 " mov.f32 %f1, 0f00000000; // 0\n"
5298 "$Lt_49_16386:\n"
5299 " .loc 3 692 0\n"
5300 " mov.u64 %rd4, __smem;\n"
5301 " cvt.u64.u32 %rd5, %r3;\n"
5302 " mul.wide.u32 %rd6, %r3, 4;\n"
5303 " add.u64 %rd7, %rd4, %rd6;\n"
5304 " st.shared.f32 [%rd7+0], %f1;\n"
5305 " .loc 3 693 0\n"
5306 " bar.sync 0;\n"
5307 " mov.u32 %r12, 31;\n"
5308 " setp.gt.u32 %p3, %r3, %r12;\n"
5309 " @%p3 bra $Lt_49_17410;\n"
5310 " .loc 3 714 0\n"
5311 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5312 " add.f32 %f1, %f2, %f1;\n"
5313 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5314 "$Lt_49_17410:\n"
5315 " mov.u32 %r13, 0;\n"
5316 " setp.ne.u32 %p4, %r3, %r13;\n"
5317 " @%p4 bra $Lt_49_17922;\n"
5318 " .loc 3 719 0\n"
5319 " ld.shared.f32 %f3, [__smem+0];\n"
5320 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_false_false_g_odata];\n"
5321 " cvt.u64.u32 %rd9, %r1;\n"
5322 " mul.wide.u32 %rd10, %r1, 4;\n"
5323 " add.u64 %rd11, %rd8, %rd10;\n"
5324 " st.global.f32 [%rd11+0], %f3;\n"
5325 "$Lt_49_17922:\n"
5326 " .loc 3 732 0\n"
5327 " exit;\n"
5328 "$LDWend_packed_float_reduce_2_false_false:\n"
5329 " } // packed_float_reduce_2_false_false\n"
5330 "\n"
5331 " .entry packed_float_reduce_2_false_true (\n"
5332 " .param .u64 __cudaparm_packed_float_reduce_2_false_true_g_idata,\n"
5333 " .param .u64 __cudaparm_packed_float_reduce_2_false_true_g_odata,\n"
5334 " .param .u32 __cudaparm_packed_float_reduce_2_false_true_n)\n"
5335 " {\n"
5336 " .reg .u16 %rh<7>;\n"
5337 " .reg .u32 %r<15>;\n"
5338 " .reg .u64 %rd<13>;\n"
5339 " .reg .f32 %f<5>;\n"
5340 " .reg .pred %p<6>;\n"
5341 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5342 " .loc 3 733 0\n"
5343 "$LDWbegin_packed_float_reduce_2_false_true:\n"
5344 " .loc 3 637 0\n"
5345 " cvt.u32.u16 %r1, %ctaid.x;\n"
5346 " mul24.lo.u32 %r2, %r1, 4;\n"
5347 " cvt.u32.u16 %r3, %tid.x;\n"
5348 " add.u32 %r4, %r2, %r3;\n"
5349 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_true_n];\n"
5350 " setp.ge.u32 %p1, %r4, %r5;\n"
5351 " @%p1 bra $Lt_50_18434;\n"
5352 " mul.lo.u32 %r6, %r4, 4;\n"
5353 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_true_n];\n"
5354 " mul.lo.u32 %r7, %r5, 4;\n"
5355 " mov.u16 %rh1, %nctaid.x;\n"
5356 " mul.wide.u16 %r8, %rh1, 16;\n"
5357 " add.u32 %r9, %r6, 8;\n"
5358 " add.u32 %r10, %r7, 8;\n"
5359 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_true_g_idata];\n"
5360 "$Lt_50_16898:\n"
5361 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5362 " .loc 3 677 0\n"
5363 " cvt.u8.u32 %r11, %r9;\n"
5364 " cvt.u64.u32 %rd2, %r11;\n"
5365 " .loc 3 637 0\n"
5366 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_true_g_idata];\n"
5367 " .loc 3 677 0\n"
5368 " add.u64 %rd3, %rd2, %rd1;\n"
5369 " ld.global.u8 %rh2, [%rd3+0];\n"
5370 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5371 " .loc 3 678 0\n"
5372 " ld.global.u8 %rh3, [%rd3+1];\n"
5373 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5374 " .loc 3 679 0\n"
5375 " ld.global.u8 %rh4, [%rd3+2];\n"
5376 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5377 " .loc 3 680 0\n"
5378 " ld.global.u8 %rh5, [%rd3+3];\n"
5379 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5380 " add.u32 %r9, %r8, %r9;\n"
5381 " setp.lt.u32 %p2, %r9, %r10;\n"
5382 " @%p2 bra $Lt_50_16898;\n"
5383 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5384 " bra.uni $Lt_50_16386;\n"
5385 "$Lt_50_18434:\n"
5386 " mov.f32 %f1, 0f00000000; // 0\n"
5387 "$Lt_50_16386:\n"
5388 " .loc 3 692 0\n"
5389 " mov.u64 %rd4, __smem;\n"
5390 " cvt.u64.u32 %rd5, %r3;\n"
5391 " mul.wide.u32 %rd6, %r3, 4;\n"
5392 " add.u64 %rd7, %rd4, %rd6;\n"
5393 " st.shared.f32 [%rd7+0], %f1;\n"
5394 " .loc 3 693 0\n"
5395 " bar.sync 0;\n"
5396 " mov.u32 %r12, 31;\n"
5397 " setp.gt.u32 %p3, %r3, %r12;\n"
5398 " @%p3 bra $Lt_50_17410;\n"
5399 " .loc 3 714 0\n"
5400 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5401 " add.f32 %f1, %f2, %f1;\n"
5402 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5403 "$Lt_50_17410:\n"
5404 " mov.u32 %r13, 0;\n"
5405 " setp.ne.u32 %p4, %r3, %r13;\n"
5406 " @%p4 bra $Lt_50_17922;\n"
5407 " .loc 3 719 0\n"
5408 " ld.shared.f32 %f3, [__smem+0];\n"
5409 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_false_true_g_odata];\n"
5410 " cvt.u64.u32 %rd9, %r1;\n"
5411 " mul.wide.u32 %rd10, %r1, 4;\n"
5412 " add.u64 %rd11, %rd8, %rd10;\n"
5413 " st.global.f32 [%rd11+0], %f3;\n"
5414 "$Lt_50_17922:\n"
5415 " .loc 3 734 0\n"
5416 " exit;\n"
5417 "$LDWend_packed_float_reduce_2_false_true:\n"
5418 " } // packed_float_reduce_2_false_true\n"
5419 "\n"
5420 " .entry packed_float_reduce_2_true_false (\n"
5421 " .param .u64 __cudaparm_packed_float_reduce_2_true_false_g_idata,\n"
5422 " .param .u64 __cudaparm_packed_float_reduce_2_true_false_g_odata,\n"
5423 " .param .u32 __cudaparm_packed_float_reduce_2_true_false_n)\n"
5424 " {\n"
5425 " .reg .u16 %rh<7>;\n"
5426 " .reg .u32 %r<15>;\n"
5427 " .reg .u64 %rd<13>;\n"
5428 " .reg .f32 %f<5>;\n"
5429 " .reg .pred %p<6>;\n"
5430 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5431 " .loc 3 735 0\n"
5432 "$LDWbegin_packed_float_reduce_2_true_false:\n"
5433 " .loc 3 637 0\n"
5434 " cvt.u32.u16 %r1, %ctaid.x;\n"
5435 " mul24.lo.u32 %r2, %r1, 4;\n"
5436 " cvt.u32.u16 %r3, %tid.x;\n"
5437 " add.u32 %r4, %r2, %r3;\n"
5438 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_false_n];\n"
5439 " setp.ge.u32 %p1, %r4, %r5;\n"
5440 " @%p1 bra $Lt_51_18434;\n"
5441 " mul.lo.u32 %r6, %r4, 4;\n"
5442 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_false_n];\n"
5443 " mul.lo.u32 %r7, %r5, 4;\n"
5444 " mov.u16 %rh1, %nctaid.x;\n"
5445 " mul.wide.u16 %r8, %rh1, 16;\n"
5446 " add.u32 %r9, %r6, 8;\n"
5447 " add.u32 %r10, %r7, 8;\n"
5448 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_false_g_idata];\n"
5449 "$Lt_51_16898:\n"
5450 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5451 " .loc 3 677 0\n"
5452 " cvt.u8.u32 %r11, %r9;\n"
5453 " cvt.u64.u32 %rd2, %r11;\n"
5454 " .loc 3 637 0\n"
5455 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_false_g_idata];\n"
5456 " .loc 3 677 0\n"
5457 " add.u64 %rd3, %rd2, %rd1;\n"
5458 " ld.global.u8 %rh2, [%rd3+0];\n"
5459 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5460 " .loc 3 678 0\n"
5461 " ld.global.u8 %rh3, [%rd3+1];\n"
5462 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5463 " .loc 3 679 0\n"
5464 " ld.global.u8 %rh4, [%rd3+2];\n"
5465 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5466 " .loc 3 680 0\n"
5467 " ld.global.u8 %rh5, [%rd3+3];\n"
5468 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5469 " add.u32 %r9, %r8, %r9;\n"
5470 " setp.lt.u32 %p2, %r9, %r10;\n"
5471 " @%p2 bra $Lt_51_16898;\n"
5472 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5473 " bra.uni $Lt_51_16386;\n"
5474 "$Lt_51_18434:\n"
5475 " mov.f32 %f1, 0f00000000; // 0\n"
5476 "$Lt_51_16386:\n"
5477 " .loc 3 692 0\n"
5478 " mov.u64 %rd4, __smem;\n"
5479 " cvt.u64.u32 %rd5, %r3;\n"
5480 " mul.wide.u32 %rd6, %r3, 4;\n"
5481 " add.u64 %rd7, %rd4, %rd6;\n"
5482 " st.shared.f32 [%rd7+0], %f1;\n"
5483 " .loc 3 693 0\n"
5484 " bar.sync 0;\n"
5485 " mov.u32 %r12, 31;\n"
5486 " setp.gt.u32 %p3, %r3, %r12;\n"
5487 " @%p3 bra $Lt_51_17410;\n"
5488 " .loc 3 714 0\n"
5489 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5490 " add.f32 %f1, %f2, %f1;\n"
5491 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5492 "$Lt_51_17410:\n"
5493 " mov.u32 %r13, 0;\n"
5494 " setp.ne.u32 %p4, %r3, %r13;\n"
5495 " @%p4 bra $Lt_51_17922;\n"
5496 " .loc 3 719 0\n"
5497 " ld.shared.f32 %f3, [__smem+0];\n"
5498 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_true_false_g_odata];\n"
5499 " cvt.u64.u32 %rd9, %r1;\n"
5500 " mul.wide.u32 %rd10, %r1, 4;\n"
5501 " add.u64 %rd11, %rd8, %rd10;\n"
5502 " st.global.f32 [%rd11+0], %f3;\n"
5503 "$Lt_51_17922:\n"
5504 " .loc 3 736 0\n"
5505 " exit;\n"
5506 "$LDWend_packed_float_reduce_2_true_false:\n"
5507 " } // packed_float_reduce_2_true_false\n"
5508 "\n"
5509 " .entry packed_float_reduce_2_true_true (\n"
5510 " .param .u64 __cudaparm_packed_float_reduce_2_true_true_g_idata,\n"
5511 " .param .u64 __cudaparm_packed_float_reduce_2_true_true_g_odata,\n"
5512 " .param .u32 __cudaparm_packed_float_reduce_2_true_true_n)\n"
5513 " {\n"
5514 " .reg .u16 %rh<7>;\n"
5515 " .reg .u32 %r<15>;\n"
5516 " .reg .u64 %rd<13>;\n"
5517 " .reg .f32 %f<5>;\n"
5518 " .reg .pred %p<6>;\n"
5519 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5520 " .loc 3 737 0\n"
5521 "$LDWbegin_packed_float_reduce_2_true_true:\n"
5522 " .loc 3 637 0\n"
5523 " cvt.u32.u16 %r1, %ctaid.x;\n"
5524 " mul24.lo.u32 %r2, %r1, 4;\n"
5525 " cvt.u32.u16 %r3, %tid.x;\n"
5526 " add.u32 %r4, %r2, %r3;\n"
5527 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_true_n];\n"
5528 " setp.ge.u32 %p1, %r4, %r5;\n"
5529 " @%p1 bra $Lt_52_18434;\n"
5530 " mul.lo.u32 %r6, %r4, 4;\n"
5531 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_true_n];\n"
5532 " mul.lo.u32 %r7, %r5, 4;\n"
5533 " mov.u16 %rh1, %nctaid.x;\n"
5534 " mul.wide.u16 %r8, %rh1, 16;\n"
5535 " add.u32 %r9, %r6, 8;\n"
5536 " add.u32 %r10, %r7, 8;\n"
5537 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_true_g_idata];\n"
5538 "$Lt_52_16898:\n"
5539 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5540 " .loc 3 677 0\n"
5541 " cvt.u8.u32 %r11, %r9;\n"
5542 " cvt.u64.u32 %rd2, %r11;\n"
5543 " .loc 3 637 0\n"
5544 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_true_g_idata];\n"
5545 " .loc 3 677 0\n"
5546 " add.u64 %rd3, %rd2, %rd1;\n"
5547 " ld.global.u8 %rh2, [%rd3+0];\n"
5548 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5549 " .loc 3 678 0\n"
5550 " ld.global.u8 %rh3, [%rd3+1];\n"
5551 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5552 " .loc 3 679 0\n"
5553 " ld.global.u8 %rh4, [%rd3+2];\n"
5554 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5555 " .loc 3 680 0\n"
5556 " ld.global.u8 %rh5, [%rd3+3];\n"
5557 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5558 " add.u32 %r9, %r8, %r9;\n"
5559 " setp.lt.u32 %p2, %r9, %r10;\n"
5560 " @%p2 bra $Lt_52_16898;\n"
5561 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5562 " bra.uni $Lt_52_16386;\n"
5563 "$Lt_52_18434:\n"
5564 " mov.f32 %f1, 0f00000000; // 0\n"
5565 "$Lt_52_16386:\n"
5566 " .loc 3 692 0\n"
5567 " mov.u64 %rd4, __smem;\n"
5568 " cvt.u64.u32 %rd5, %r3;\n"
5569 " mul.wide.u32 %rd6, %r3, 4;\n"
5570 " add.u64 %rd7, %rd4, %rd6;\n"
5571 " st.shared.f32 [%rd7+0], %f1;\n"
5572 " .loc 3 693 0\n"
5573 " bar.sync 0;\n"
5574 " mov.u32 %r12, 31;\n"
5575 " setp.gt.u32 %p3, %r3, %r12;\n"
5576 " @%p3 bra $Lt_52_17410;\n"
5577 " .loc 3 714 0\n"
5578 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5579 " add.f32 %f1, %f2, %f1;\n"
5580 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5581 "$Lt_52_17410:\n"
5582 " mov.u32 %r13, 0;\n"
5583 " setp.ne.u32 %p4, %r3, %r13;\n"
5584 " @%p4 bra $Lt_52_17922;\n"
5585 " .loc 3 719 0\n"
5586 " ld.shared.f32 %f3, [__smem+0];\n"
5587 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_true_true_g_odata];\n"
5588 " cvt.u64.u32 %rd9, %r1;\n"
5589 " mul.wide.u32 %rd10, %r1, 4;\n"
5590 " add.u64 %rd11, %rd8, %rd10;\n"
5591 " st.global.f32 [%rd11+0], %f3;\n"
5592 "$Lt_52_17922:\n"
5593 " .loc 3 738 0\n"
5594 " exit;\n"
5595 "$LDWend_packed_float_reduce_2_true_true:\n"
5596 " } // packed_float_reduce_2_true_true\n"
5597 "\n"
5598 " .entry packed_float_reduce_4_false_false (\n"
5599 " .param .u64 __cudaparm_packed_float_reduce_4_false_false_g_idata,\n"
5600 " .param .u64 __cudaparm_packed_float_reduce_4_false_false_g_odata,\n"
5601 " .param .u32 __cudaparm_packed_float_reduce_4_false_false_n)\n"
5602 " {\n"
5603 " .reg .u16 %rh<7>;\n"
5604 " .reg .u32 %r<15>;\n"
5605 " .reg .u64 %rd<13>;\n"
5606 " .reg .f32 %f<7>;\n"
5607 " .reg .pred %p<6>;\n"
5608 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5609 " .loc 3 740 0\n"
5610 "$LDWbegin_packed_float_reduce_4_false_false:\n"
5611 " .loc 3 637 0\n"
5612 " cvt.u32.u16 %r1, %ctaid.x;\n"
5613 " mul24.lo.u32 %r2, %r1, 8;\n"
5614 " cvt.u32.u16 %r3, %tid.x;\n"
5615 " add.u32 %r4, %r2, %r3;\n"
5616 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_false_n];\n"
5617 " setp.ge.u32 %p1, %r4, %r5;\n"
5618 " @%p1 bra $Lt_53_18178;\n"
5619 " mul.lo.u32 %r6, %r4, 4;\n"
5620 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_false_n];\n"
5621 " mul.lo.u32 %r7, %r5, 4;\n"
5622 " mov.u16 %rh1, %nctaid.x;\n"
5623 " mul.wide.u16 %r8, %rh1, 32;\n"
5624 " add.u32 %r9, %r6, 16;\n"
5625 " add.u32 %r10, %r7, 16;\n"
5626 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_false_g_idata];\n"
5627 "$Lt_53_16642:\n"
5628 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5629 " .loc 3 677 0\n"
5630 " cvt.u8.u32 %r11, %r9;\n"
5631 " cvt.u64.u32 %rd2, %r11;\n"
5632 " .loc 3 637 0\n"
5633 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_false_g_idata];\n"
5634 " .loc 3 677 0\n"
5635 " add.u64 %rd3, %rd2, %rd1;\n"
5636 " ld.global.u8 %rh2, [%rd3+0];\n"
5637 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5638 " .loc 3 678 0\n"
5639 " ld.global.u8 %rh3, [%rd3+1];\n"
5640 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5641 " .loc 3 679 0\n"
5642 " ld.global.u8 %rh4, [%rd3+2];\n"
5643 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5644 " .loc 3 680 0\n"
5645 " ld.global.u8 %rh5, [%rd3+3];\n"
5646 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5647 " add.u32 %r9, %r8, %r9;\n"
5648 " setp.lt.u32 %p2, %r9, %r10;\n"
5649 " @%p2 bra $Lt_53_16642;\n"
5650 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5651 " bra.uni $Lt_53_16130;\n"
5652 "$Lt_53_18178:\n"
5653 " mov.f32 %f1, 0f00000000; // 0\n"
5654 "$Lt_53_16130:\n"
5655 " .loc 3 692 0\n"
5656 " mov.u64 %rd4, __smem;\n"
5657 " cvt.u64.u32 %rd5, %r3;\n"
5658 " mul.wide.u32 %rd6, %r3, 4;\n"
5659 " add.u64 %rd7, %rd4, %rd6;\n"
5660 " st.shared.f32 [%rd7+0], %f1;\n"
5661 " .loc 3 693 0\n"
5662 " bar.sync 0;\n"
5663 " mov.u32 %r12, 31;\n"
5664 " setp.gt.u32 %p3, %r3, %r12;\n"
5665 " @%p3 bra $Lt_53_17154;\n"
5666 " .loc 3 713 0\n"
5667 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5668 " add.f32 %f3, %f2, %f1;\n"
5669 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5670 " .loc 3 714 0\n"
5671 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5672 " add.f32 %f1, %f4, %f3;\n"
5673 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5674 "$Lt_53_17154:\n"
5675 " mov.u32 %r13, 0;\n"
5676 " setp.ne.u32 %p4, %r3, %r13;\n"
5677 " @%p4 bra $Lt_53_17666;\n"
5678 " .loc 3 719 0\n"
5679 " ld.shared.f32 %f5, [__smem+0];\n"
5680 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_false_false_g_odata];\n"
5681 " cvt.u64.u32 %rd9, %r1;\n"
5682 " mul.wide.u32 %rd10, %r1, 4;\n"
5683 " add.u64 %rd11, %rd8, %rd10;\n"
5684 " st.global.f32 [%rd11+0], %f5;\n"
5685 "$Lt_53_17666:\n"
5686 " .loc 3 741 0\n"
5687 " exit;\n"
5688 "$LDWend_packed_float_reduce_4_false_false:\n"
5689 " } // packed_float_reduce_4_false_false\n"
5690 "\n"
5691 " .entry packed_float_reduce_4_false_true (\n"
5692 " .param .u64 __cudaparm_packed_float_reduce_4_false_true_g_idata,\n"
5693 " .param .u64 __cudaparm_packed_float_reduce_4_false_true_g_odata,\n"
5694 " .param .u32 __cudaparm_packed_float_reduce_4_false_true_n)\n"
5695 " {\n"
5696 " .reg .u16 %rh<7>;\n"
5697 " .reg .u32 %r<15>;\n"
5698 " .reg .u64 %rd<13>;\n"
5699 " .reg .f32 %f<7>;\n"
5700 " .reg .pred %p<6>;\n"
5701 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5702 " .loc 3 742 0\n"
5703 "$LDWbegin_packed_float_reduce_4_false_true:\n"
5704 " .loc 3 637 0\n"
5705 " cvt.u32.u16 %r1, %ctaid.x;\n"
5706 " mul24.lo.u32 %r2, %r1, 8;\n"
5707 " cvt.u32.u16 %r3, %tid.x;\n"
5708 " add.u32 %r4, %r2, %r3;\n"
5709 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_true_n];\n"
5710 " setp.ge.u32 %p1, %r4, %r5;\n"
5711 " @%p1 bra $Lt_54_18178;\n"
5712 " mul.lo.u32 %r6, %r4, 4;\n"
5713 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_true_n];\n"
5714 " mul.lo.u32 %r7, %r5, 4;\n"
5715 " mov.u16 %rh1, %nctaid.x;\n"
5716 " mul.wide.u16 %r8, %rh1, 32;\n"
5717 " add.u32 %r9, %r6, 16;\n"
5718 " add.u32 %r10, %r7, 16;\n"
5719 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_true_g_idata];\n"
5720 "$Lt_54_16642:\n"
5721 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5722 " .loc 3 677 0\n"
5723 " cvt.u8.u32 %r11, %r9;\n"
5724 " cvt.u64.u32 %rd2, %r11;\n"
5725 " .loc 3 637 0\n"
5726 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_true_g_idata];\n"
5727 " .loc 3 677 0\n"
5728 " add.u64 %rd3, %rd2, %rd1;\n"
5729 " ld.global.u8 %rh2, [%rd3+0];\n"
5730 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5731 " .loc 3 678 0\n"
5732 " ld.global.u8 %rh3, [%rd3+1];\n"
5733 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5734 " .loc 3 679 0\n"
5735 " ld.global.u8 %rh4, [%rd3+2];\n"
5736 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5737 " .loc 3 680 0\n"
5738 " ld.global.u8 %rh5, [%rd3+3];\n"
5739 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5740 " add.u32 %r9, %r8, %r9;\n"
5741 " setp.lt.u32 %p2, %r9, %r10;\n"
5742 " @%p2 bra $Lt_54_16642;\n"
5743 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5744 " bra.uni $Lt_54_16130;\n"
5745 "$Lt_54_18178:\n"
5746 " mov.f32 %f1, 0f00000000; // 0\n"
5747 "$Lt_54_16130:\n"
5748 " .loc 3 692 0\n"
5749 " mov.u64 %rd4, __smem;\n"
5750 " cvt.u64.u32 %rd5, %r3;\n"
5751 " mul.wide.u32 %rd6, %r3, 4;\n"
5752 " add.u64 %rd7, %rd4, %rd6;\n"
5753 " st.shared.f32 [%rd7+0], %f1;\n"
5754 " .loc 3 693 0\n"
5755 " bar.sync 0;\n"
5756 " mov.u32 %r12, 31;\n"
5757 " setp.gt.u32 %p3, %r3, %r12;\n"
5758 " @%p3 bra $Lt_54_17154;\n"
5759 " .loc 3 713 0\n"
5760 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5761 " add.f32 %f3, %f2, %f1;\n"
5762 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5763 " .loc 3 714 0\n"
5764 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5765 " add.f32 %f1, %f4, %f3;\n"
5766 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5767 "$Lt_54_17154:\n"
5768 " mov.u32 %r13, 0;\n"
5769 " setp.ne.u32 %p4, %r3, %r13;\n"
5770 " @%p4 bra $Lt_54_17666;\n"
5771 " .loc 3 719 0\n"
5772 " ld.shared.f32 %f5, [__smem+0];\n"
5773 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_false_true_g_odata];\n"
5774 " cvt.u64.u32 %rd9, %r1;\n"
5775 " mul.wide.u32 %rd10, %r1, 4;\n"
5776 " add.u64 %rd11, %rd8, %rd10;\n"
5777 " st.global.f32 [%rd11+0], %f5;\n"
5778 "$Lt_54_17666:\n"
5779 " .loc 3 743 0\n"
5780 " exit;\n"
5781 "$LDWend_packed_float_reduce_4_false_true:\n"
5782 " } // packed_float_reduce_4_false_true\n"
5783 "\n"
5784 " .entry packed_float_reduce_4_true_false (\n"
5785 " .param .u64 __cudaparm_packed_float_reduce_4_true_false_g_idata,\n"
5786 " .param .u64 __cudaparm_packed_float_reduce_4_true_false_g_odata,\n"
5787 " .param .u32 __cudaparm_packed_float_reduce_4_true_false_n)\n"
5788 " {\n"
5789 " .reg .u16 %rh<7>;\n"
5790 " .reg .u32 %r<15>;\n"
5791 " .reg .u64 %rd<13>;\n"
5792 " .reg .f32 %f<7>;\n"
5793 " .reg .pred %p<6>;\n"
5794 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5795 " .loc 3 744 0\n"
5796 "$LDWbegin_packed_float_reduce_4_true_false:\n"
5797 " .loc 3 637 0\n"
5798 " cvt.u32.u16 %r1, %ctaid.x;\n"
5799 " mul24.lo.u32 %r2, %r1, 8;\n"
5800 " cvt.u32.u16 %r3, %tid.x;\n"
5801 " add.u32 %r4, %r2, %r3;\n"
5802 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_false_n];\n"
5803 " setp.ge.u32 %p1, %r4, %r5;\n"
5804 " @%p1 bra $Lt_55_18178;\n"
5805 " mul.lo.u32 %r6, %r4, 4;\n"
5806 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_false_n];\n"
5807 " mul.lo.u32 %r7, %r5, 4;\n"
5808 " mov.u16 %rh1, %nctaid.x;\n"
5809 " mul.wide.u16 %r8, %rh1, 32;\n"
5810 " add.u32 %r9, %r6, 16;\n"
5811 " add.u32 %r10, %r7, 16;\n"
5812 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_false_g_idata];\n"
5813 "$Lt_55_16642:\n"
5814 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5815 " .loc 3 677 0\n"
5816 " cvt.u8.u32 %r11, %r9;\n"
5817 " cvt.u64.u32 %rd2, %r11;\n"
5818 " .loc 3 637 0\n"
5819 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_false_g_idata];\n"
5820 " .loc 3 677 0\n"
5821 " add.u64 %rd3, %rd2, %rd1;\n"
5822 " ld.global.u8 %rh2, [%rd3+0];\n"
5823 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5824 " .loc 3 678 0\n"
5825 " ld.global.u8 %rh3, [%rd3+1];\n"
5826 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5827 " .loc 3 679 0\n"
5828 " ld.global.u8 %rh4, [%rd3+2];\n"
5829 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5830 " .loc 3 680 0\n"
5831 " ld.global.u8 %rh5, [%rd3+3];\n"
5832 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5833 " add.u32 %r9, %r8, %r9;\n"
5834 " setp.lt.u32 %p2, %r9, %r10;\n"
5835 " @%p2 bra $Lt_55_16642;\n"
5836 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5837 " bra.uni $Lt_55_16130;\n"
5838 "$Lt_55_18178:\n"
5839 " mov.f32 %f1, 0f00000000; // 0\n"
5840 "$Lt_55_16130:\n"
5841 " .loc 3 692 0\n"
5842 " mov.u64 %rd4, __smem;\n"
5843 " cvt.u64.u32 %rd5, %r3;\n"
5844 " mul.wide.u32 %rd6, %r3, 4;\n"
5845 " add.u64 %rd7, %rd4, %rd6;\n"
5846 " st.shared.f32 [%rd7+0], %f1;\n"
5847 " .loc 3 693 0\n"
5848 " bar.sync 0;\n"
5849 " mov.u32 %r12, 31;\n"
5850 " setp.gt.u32 %p3, %r3, %r12;\n"
5851 " @%p3 bra $Lt_55_17154;\n"
5852 " .loc 3 713 0\n"
5853 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5854 " add.f32 %f3, %f2, %f1;\n"
5855 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5856 " .loc 3 714 0\n"
5857 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5858 " add.f32 %f1, %f4, %f3;\n"
5859 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5860 "$Lt_55_17154:\n"
5861 " mov.u32 %r13, 0;\n"
5862 " setp.ne.u32 %p4, %r3, %r13;\n"
5863 " @%p4 bra $Lt_55_17666;\n"
5864 " .loc 3 719 0\n"
5865 " ld.shared.f32 %f5, [__smem+0];\n"
5866 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_true_false_g_odata];\n"
5867 " cvt.u64.u32 %rd9, %r1;\n"
5868 " mul.wide.u32 %rd10, %r1, 4;\n"
5869 " add.u64 %rd11, %rd8, %rd10;\n"
5870 " st.global.f32 [%rd11+0], %f5;\n"
5871 "$Lt_55_17666:\n"
5872 " .loc 3 745 0\n"
5873 " exit;\n"
5874 "$LDWend_packed_float_reduce_4_true_false:\n"
5875 " } // packed_float_reduce_4_true_false\n"
5876 "\n"
5877 " .entry packed_float_reduce_4_true_true (\n"
5878 " .param .u64 __cudaparm_packed_float_reduce_4_true_true_g_idata,\n"
5879 " .param .u64 __cudaparm_packed_float_reduce_4_true_true_g_odata,\n"
5880 " .param .u32 __cudaparm_packed_float_reduce_4_true_true_n)\n"
5881 " {\n"
5882 " .reg .u16 %rh<7>;\n"
5883 " .reg .u32 %r<15>;\n"
5884 " .reg .u64 %rd<13>;\n"
5885 " .reg .f32 %f<7>;\n"
5886 " .reg .pred %p<6>;\n"
5887 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5888 " .loc 3 746 0\n"
5889 "$LDWbegin_packed_float_reduce_4_true_true:\n"
5890 " .loc 3 637 0\n"
5891 " cvt.u32.u16 %r1, %ctaid.x;\n"
5892 " mul24.lo.u32 %r2, %r1, 8;\n"
5893 " cvt.u32.u16 %r3, %tid.x;\n"
5894 " add.u32 %r4, %r2, %r3;\n"
5895 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_true_n];\n"
5896 " setp.ge.u32 %p1, %r4, %r5;\n"
5897 " @%p1 bra $Lt_56_18178;\n"
5898 " mul.lo.u32 %r6, %r4, 4;\n"
5899 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_true_n];\n"
5900 " mul.lo.u32 %r7, %r5, 4;\n"
5901 " mov.u16 %rh1, %nctaid.x;\n"
5902 " mul.wide.u16 %r8, %rh1, 32;\n"
5903 " add.u32 %r9, %r6, 16;\n"
5904 " add.u32 %r10, %r7, 16;\n"
5905 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_true_g_idata];\n"
5906 "$Lt_56_16642:\n"
5907 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5908 " .loc 3 677 0\n"
5909 " cvt.u8.u32 %r11, %r9;\n"
5910 " cvt.u64.u32 %rd2, %r11;\n"
5911 " .loc 3 637 0\n"
5912 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_true_g_idata];\n"
5913 " .loc 3 677 0\n"
5914 " add.u64 %rd3, %rd2, %rd1;\n"
5915 " ld.global.u8 %rh2, [%rd3+0];\n"
5916 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5917 " .loc 3 678 0\n"
5918 " ld.global.u8 %rh3, [%rd3+1];\n"
5919 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5920 " .loc 3 679 0\n"
5921 " ld.global.u8 %rh4, [%rd3+2];\n"
5922 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5923 " .loc 3 680 0\n"
5924 " ld.global.u8 %rh5, [%rd3+3];\n"
5925 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5926 " add.u32 %r9, %r8, %r9;\n"
5927 " setp.lt.u32 %p2, %r9, %r10;\n"
5928 " @%p2 bra $Lt_56_16642;\n"
5929 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5930 " bra.uni $Lt_56_16130;\n"
5931 "$Lt_56_18178:\n"
5932 " mov.f32 %f1, 0f00000000; // 0\n"
5933 "$Lt_56_16130:\n"
5934 " .loc 3 692 0\n"
5935 " mov.u64 %rd4, __smem;\n"
5936 " cvt.u64.u32 %rd5, %r3;\n"
5937 " mul.wide.u32 %rd6, %r3, 4;\n"
5938 " add.u64 %rd7, %rd4, %rd6;\n"
5939 " st.shared.f32 [%rd7+0], %f1;\n"
5940 " .loc 3 693 0\n"
5941 " bar.sync 0;\n"
5942 " mov.u32 %r12, 31;\n"
5943 " setp.gt.u32 %p3, %r3, %r12;\n"
5944 " @%p3 bra $Lt_56_17154;\n"
5945 " .loc 3 713 0\n"
5946 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5947 " add.f32 %f3, %f2, %f1;\n"
5948 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5949 " .loc 3 714 0\n"
5950 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5951 " add.f32 %f1, %f4, %f3;\n"
5952 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5953 "$Lt_56_17154:\n"
5954 " mov.u32 %r13, 0;\n"
5955 " setp.ne.u32 %p4, %r3, %r13;\n"
5956 " @%p4 bra $Lt_56_17666;\n"
5957 " .loc 3 719 0\n"
5958 " ld.shared.f32 %f5, [__smem+0];\n"
5959 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_true_true_g_odata];\n"
5960 " cvt.u64.u32 %rd9, %r1;\n"
5961 " mul.wide.u32 %rd10, %r1, 4;\n"
5962 " add.u64 %rd11, %rd8, %rd10;\n"
5963 " st.global.f32 [%rd11+0], %f5;\n"
5964 "$Lt_56_17666:\n"
5965 " .loc 3 747 0\n"
5966 " exit;\n"
5967 "$LDWend_packed_float_reduce_4_true_true:\n"
5968 " } // packed_float_reduce_4_true_true\n"
5969 "\n"
5970 " .entry packed_float_reduce_8_false_false (\n"
5971 " .param .u64 __cudaparm_packed_float_reduce_8_false_false_g_idata,\n"
5972 " .param .u64 __cudaparm_packed_float_reduce_8_false_false_g_odata,\n"
5973 " .param .u32 __cudaparm_packed_float_reduce_8_false_false_n)\n"
5974 " {\n"
5975 " .reg .u16 %rh<7>;\n"
5976 " .reg .u32 %r<15>;\n"
5977 " .reg .u64 %rd<13>;\n"
5978 " .reg .f32 %f<9>;\n"
5979 " .reg .pred %p<6>;\n"
5980 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5981 " .loc 3 749 0\n"
5982 "$LDWbegin_packed_float_reduce_8_false_false:\n"
5983 " .loc 3 637 0\n"
5984 " cvt.u32.u16 %r1, %ctaid.x;\n"
5985 " mul24.lo.u32 %r2, %r1, 16;\n"
5986 " cvt.u32.u16 %r3, %tid.x;\n"
5987 " add.u32 %r4, %r2, %r3;\n"
5988 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_false_false_n];\n"
5989 " setp.ge.u32 %p1, %r4, %r5;\n"
5990 " @%p1 bra $Lt_57_17922;\n"
5991 " mul.lo.u32 %r6, %r4, 4;\n"
5992 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_false_false_n];\n"
5993 " mul.lo.u32 %r7, %r5, 4;\n"
5994 " mov.u16 %rh1, %nctaid.x;\n"
5995 " mul.wide.u16 %r8, %rh1, 64;\n"
5996 " add.u32 %r9, %r6, 32;\n"
5997 " add.u32 %r10, %r7, 32;\n"
5998 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_false_false_g_idata];\n"
5999 "$Lt_57_16386:\n"
6000 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6001 " .loc 3 677 0\n"
6002 " cvt.u8.u32 %r11, %r9;\n"
6003 " cvt.u64.u32 %rd2, %r11;\n"
6004 " .loc 3 637 0\n"
6005 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_false_false_g_idata];\n"
6006 " .loc 3 677 0\n"
6007 " add.u64 %rd3, %rd2, %rd1;\n"
6008 " ld.global.u8 %rh2, [%rd3+0];\n"
6009 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6010 " .loc 3 678 0\n"
6011 " ld.global.u8 %rh3, [%rd3+1];\n"
6012 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6013 " .loc 3 679 0\n"
6014 " ld.global.u8 %rh4, [%rd3+2];\n"
6015 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6016 " .loc 3 680 0\n"
6017 " ld.global.u8 %rh5, [%rd3+3];\n"
6018 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6019 " add.u32 %r9, %r8, %r9;\n"
6020 " setp.lt.u32 %p2, %r9, %r10;\n"
6021 " @%p2 bra $Lt_57_16386;\n"
6022 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6023 " bra.uni $Lt_57_15874;\n"
6024 "$Lt_57_17922:\n"
6025 " mov.f32 %f1, 0f00000000; // 0\n"
6026 "$Lt_57_15874:\n"
6027 " .loc 3 692 0\n"
6028 " mov.u64 %rd4, __smem;\n"
6029 " cvt.u64.u32 %rd5, %r3;\n"
6030 " mul.wide.u32 %rd6, %r3, 4;\n"
6031 " add.u64 %rd7, %rd4, %rd6;\n"
6032 " st.shared.f32 [%rd7+0], %f1;\n"
6033 " .loc 3 693 0\n"
6034 " bar.sync 0;\n"
6035 " mov.u32 %r12, 31;\n"
6036 " setp.gt.u32 %p3, %r3, %r12;\n"
6037 " @%p3 bra $Lt_57_16898;\n"
6038 " .loc 3 712 0\n"
6039 " ld.volatile.shared.f32 %f2, [%rd7+16];\n"
6040 " add.f32 %f3, %f2, %f1;\n"
6041 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6042 " .loc 3 713 0\n"
6043 " ld.volatile.shared.f32 %f4, [%rd7+8];\n"
6044 " add.f32 %f5, %f4, %f3;\n"
6045 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6046 " .loc 3 714 0\n"
6047 " ld.volatile.shared.f32 %f6, [%rd7+4];\n"
6048 " add.f32 %f1, %f6, %f5;\n"
6049 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6050 "$Lt_57_16898:\n"
6051 " mov.u32 %r13, 0;\n"
6052 " setp.ne.u32 %p4, %r3, %r13;\n"
6053 " @%p4 bra $Lt_57_17410;\n"
6054 " .loc 3 719 0\n"
6055 " ld.shared.f32 %f7, [__smem+0];\n"
6056 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_8_false_false_g_odata];\n"
6057