KJB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
reduce_ptx.h
Go to the documentation of this file.
1 static const char* reduce_ptx =
2 " .version 1.4\n"
3 " .target sm_10, map_f64_to_f32\n"
4 " // compiled with /usr/local/cuda/open64/lib//be\n"
5 " // nvopencc 3.1 built on 2010-06-07\n"
6 "\n"
7 " //-----------------------------------------------------------\n"
8 " // Compiling /tmp/tmpxft_00007884_00000000-7_reduce.cpp3.i (/tmp/ccBI#.LV1fMO)\n"
9 " //-----------------------------------------------------------\n"
10 "\n"
11 " //-----------------------------------------------------------\n"
12 " // Options:\n"
13 " //-----------------------------------------------------------\n"
14 " // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64\n"
15 " // -O3 (Optimization level)\n"
16 " // -g0 (Debug level)\n"
17 " // -m2 (Report advisories)\n"
18 " //-----------------------------------------------------------\n"
19 "\n"
20 " .file 1 \"<command-line>\"\n"
21 " .file 2 \"/tmp/tmpxft_00007884_00000000-6_reduce.cudafe2.gpu\"\n"
22 " .file 3 \"reduce.cu\"\n"
23 " .file 4 \"/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h\"\n"
24 " .file 5 \"/usr/local/cuda/bin/../include/crt/device_runtime.h\"\n"
25 " .file 6 \"/usr/local/cuda/bin/../include/host_defines.h\"\n"
26 " .file 7 \"/usr/local/cuda/bin/../include/builtin_types.h\"\n"
27 " .file 8 \"/usr/local/cuda/bin/../include/device_types.h\"\n"
28 " .file 9 \"/usr/local/cuda/bin/../include/driver_types.h\"\n"
29 " .file 10 \"/usr/local/cuda/bin/../include/surface_types.h\"\n"
30 " .file 11 \"/usr/local/cuda/bin/../include/texture_types.h\"\n"
31 " .file 12 \"/usr/local/cuda/bin/../include/vector_types.h\"\n"
32 " .file 13 \"/usr/local/cuda/bin/../include/device_launch_parameters.h\"\n"
33 " .file 14 \"/usr/local/cuda/bin/../include/crt/storage_class.h\"\n"
34 " .file 15 \"/usr/include/bits/types.h\"\n"
35 " .file 16 \"/usr/include/time.h\"\n"
36 " .file 17 \"/usr/local/cuda/bin/../include/common_functions.h\"\n"
37 " .file 18 \"/usr/local/cuda/bin/../include/math_functions.h\"\n"
38 " .file 19 \"/usr/local/cuda/bin/../include/math_constants.h\"\n"
39 " .file 20 \"/usr/local/cuda/bin/../include/device_functions.h\"\n"
40 " .file 21 \"/usr/local/cuda/bin/../include/sm_11_atomic_functions.h\"\n"
41 " .file 22 \"/usr/local/cuda/bin/../include/sm_12_atomic_functions.h\"\n"
42 " .file 23 \"/usr/local/cuda/bin/../include/sm_13_double_functions.h\"\n"
43 " .file 24 \"/usr/local/cuda/bin/../include/sm_20_atomic_functions.h\"\n"
44 " .file 25 \"/usr/local/cuda/bin/../include/sm_20_intrinsics.h\"\n"
45 " .file 26 \"/usr/local/cuda/bin/../include/surface_functions.h\"\n"
46 " .file 27 \"/usr/local/cuda/bin/../include/texture_fetch_functions.h\"\n"
47 " .file 28 \"/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h\"\n"
48 "\n"
49 " .extern .shared .align 4 .b8 __smem[];\n"
50 " .tex .u64 tex_ref_1;\n"
51 " .tex .u64 tex_ref_2;\n"
52 "\n"
53 " .entry chamfer_and_reduce (\n"
54 " .param .u64 __cudaparm_chamfer_and_reduce_g_idata_1,\n"
55 " .param .u64 __cudaparm_chamfer_and_reduce_g_idata_2,\n"
56 " .param .u64 __cudaparm_chamfer_and_reduce_g_odata,\n"
57 " .param .u32 __cudaparm_chamfer_and_reduce_n)\n"
58 " {\n"
59 " .reg .u16 %rh<3>;\n"
60 " .reg .u32 %r<14>;\n"
61 " .reg .u64 %rd<18>;\n"
62 " .reg .f32 %f<23>;\n"
63 " .reg .pred %p<9>;\n"
64 " .loc 3 148 0\n"
65 "$LDWbegin_chamfer_and_reduce:\n"
66 " .loc 3 105 0\n"
67 " cvt.u32.u16 %r1, %ctaid.x;\n"
68 " mul.lo.u32 %r2, %r1, 512;\n"
69 " cvt.u32.u16 %r3, %tid.x;\n"
70 " add.u32 %r4, %r2, %r3;\n"
71 " ld.param.u32 %r5, [__cudaparm_chamfer_and_reduce_n];\n"
72 " setp.ge.u32 %p1, %r4, %r5;\n"
73 " @%p1 bra $Lt_0_18178;\n"
74 " add.u32 %r6, %r4, 256;\n"
75 " ld.param.u32 %r5, [__cudaparm_chamfer_and_reduce_n];\n"
76 " add.u32 %r7, %r5, 256;\n"
77 " mov.u16 %rh1, %nctaid.x;\n"
78 " mul.wide.u16 %r8, %rh1, 512;\n"
79 " cvt.u64.u32 %rd1, %r4;\n"
80 " mul.wide.u32 %rd2, %r4, 4;\n"
81 " cvt.s64.u32 %rd3, %r8;\n"
82 " ld.param.u64 %rd4, [__cudaparm_chamfer_and_reduce_g_idata_1];\n"
83 " add.u64 %rd5, %rd4, %rd2;\n"
84 " mul.wide.u32 %rd6, %r8, 4;\n"
85 " ld.param.u64 %rd7, [__cudaparm_chamfer_and_reduce_g_idata_2];\n"
86 " add.u64 %rd8, %rd7, %rd2;\n"
87 " mov.f32 %f1, 0f00000000; // 0\n"
88 "$Lt_0_15106:\n"
89 " //<loop> Loop body line 105, nesting depth: 1, estimated iterations: unknown\n"
90 " .loc 3 119 0\n"
91 " ld.global.f32 %f2, [%rd5+0];\n"
92 " ld.global.f32 %f3, [%rd8+0];\n"
93 " mad.f32 %f1, %f2, %f3, %f1;\n"
94 " .loc 3 105 0\n"
95 " ld.param.u32 %r5, [__cudaparm_chamfer_and_reduce_n];\n"
96 " .loc 3 119 0\n"
97 " setp.ge.u32 %p2, %r6, %r5;\n"
98 " @%p2 bra $Lt_0_15362;\n"
99 " //<loop> Part of loop body line 105, head labeled $Lt_0_15106\n"
100 " .loc 3 133 0\n"
101 " ld.global.f32 %f4, [%rd5+1024];\n"
102 " ld.global.f32 %f5, [%rd8+1024];\n"
103 " mad.f32 %f1, %f4, %f5, %f1;\n"
104 "$Lt_0_15362:\n"
105 " //<loop> Part of loop body line 105, head labeled $Lt_0_15106\n"
106 " add.u32 %r6, %r6, %r8;\n"
107 " add.u64 %rd8, %rd8, %rd6;\n"
108 " add.u64 %rd5, %rd5, %rd6;\n"
109 " setp.lt.u32 %p3, %r6, %r7;\n"
110 " @%p3 bra $Lt_0_15106;\n"
111 " bra.uni $Lt_0_14594;\n"
112 "$Lt_0_18178:\n"
113 " mov.f32 %f1, 0f00000000; // 0\n"
114 "$Lt_0_14594:\n"
115 " .loc 3 139 0\n"
116 " mov.f32 %f6, %f1;\n"
117 " mov.f32 %f7, %f6;\n"
118 " .loc 3 71 0\n"
119 " mov.u64 %rd9, __smem;\n"
120 " cvt.u64.u32 %rd10, %r3;\n"
121 " mul.wide.u32 %rd11, %r3, 4;\n"
122 " add.u64 %rd12, %rd9, %rd11;\n"
123 " st.volatile.shared.f32 [%rd12+0], %f6;\n"
124 " .loc 3 72 0\n"
125 " bar.sync 0;\n"
126 " mov.u32 %r9, 127;\n"
127 " setp.gt.u32 %p4, %r3, %r9;\n"
128 " @%p4 bra $Lt_0_16130;\n"
129 " .loc 3 76 0\n"
130 " ld.volatile.shared.f32 %f8, [%rd12+512];\n"
131 " add.f32 %f7, %f8, %f6;\n"
132 " st.volatile.shared.f32 [%rd12+0], %f7;\n"
133 "$Lt_0_16130:\n"
134 " bar.sync 0;\n"
135 " mov.u32 %r10, 63;\n"
136 " setp.gt.u32 %p5, %r3, %r10;\n"
137 " @%p5 bra $Lt_0_16642;\n"
138 " .loc 3 77 0\n"
139 " ld.volatile.shared.f32 %f9, [%rd12+256];\n"
140 " add.f32 %f7, %f9, %f7;\n"
141 " st.volatile.shared.f32 [%rd12+0], %f7;\n"
142 "$Lt_0_16642:\n"
143 " bar.sync 0;\n"
144 " mov.u32 %r11, 31;\n"
145 " setp.gt.u32 %p6, %r3, %r11;\n"
146 " @%p6 bra $Lt_0_17154;\n"
147 " .loc 3 83 0\n"
148 " ld.volatile.shared.f32 %f10, [%rd12+128];\n"
149 " add.f32 %f11, %f10, %f7;\n"
150 " st.volatile.shared.f32 [%rd12+0], %f11;\n"
151 " .loc 3 84 0\n"
152 " ld.volatile.shared.f32 %f12, [%rd12+64];\n"
153 " add.f32 %f13, %f12, %f11;\n"
154 " st.volatile.shared.f32 [%rd12+0], %f13;\n"
155 " .loc 3 85 0\n"
156 " ld.volatile.shared.f32 %f14, [%rd12+32];\n"
157 " add.f32 %f15, %f14, %f13;\n"
158 " st.volatile.shared.f32 [%rd12+0], %f15;\n"
159 " .loc 3 86 0\n"
160 " ld.volatile.shared.f32 %f16, [%rd12+16];\n"
161 " add.f32 %f17, %f16, %f15;\n"
162 " st.volatile.shared.f32 [%rd12+0], %f17;\n"
163 " .loc 3 87 0\n"
164 " ld.volatile.shared.f32 %f18, [%rd12+8];\n"
165 " add.f32 %f19, %f18, %f17;\n"
166 " st.volatile.shared.f32 [%rd12+0], %f19;\n"
167 " .loc 3 88 0\n"
168 " ld.volatile.shared.f32 %f20, [%rd12+4];\n"
169 " add.f32 %f7, %f20, %f19;\n"
170 " st.volatile.shared.f32 [%rd12+0], %f7;\n"
171 "$Lt_0_17154:\n"
172 " .loc 3 139 0\n"
173 " mov.u32 %r12, 0;\n"
174 " setp.ne.u32 %p7, %r3, %r12;\n"
175 " @%p7 bra $Lt_0_17666;\n"
176 " .loc 3 143 0\n"
177 " ld.shared.f32 %f21, [__smem+0];\n"
178 " ld.param.u64 %rd13, [__cudaparm_chamfer_and_reduce_g_odata];\n"
179 " cvt.u64.u32 %rd14, %r1;\n"
180 " mul.wide.u32 %rd15, %r1, 4;\n"
181 " add.u64 %rd16, %rd13, %rd15;\n"
182 " st.global.f32 [%rd16+0], %f21;\n"
183 "$Lt_0_17666:\n"
184 " .loc 3 151 0\n"
185 " exit;\n"
186 "$LDWend_chamfer_and_reduce:\n"
187 " } // chamfer_and_reduce\n"
188 "\n"
189 " .entry squared_chamfer_and_reduce (\n"
190 " .param .u64 __cudaparm_squared_chamfer_and_reduce_g_idata_1,\n"
191 " .param .u64 __cudaparm_squared_chamfer_and_reduce_g_idata_2,\n"
192 " .param .u64 __cudaparm_squared_chamfer_and_reduce_g_odata,\n"
193 " .param .u32 __cudaparm_squared_chamfer_and_reduce_n)\n"
194 " {\n"
195 " .reg .u16 %rh<3>;\n"
196 " .reg .u32 %r<14>;\n"
197 " .reg .u64 %rd<18>;\n"
198 " .reg .f32 %f<25>;\n"
199 " .reg .pred %p<9>;\n"
200 " .loc 3 154 0\n"
201 "$LDWbegin_squared_chamfer_and_reduce:\n"
202 " .loc 3 105 0\n"
203 " cvt.u32.u16 %r1, %ctaid.x;\n"
204 " mul.lo.u32 %r2, %r1, 512;\n"
205 " cvt.u32.u16 %r3, %tid.x;\n"
206 " add.u32 %r4, %r2, %r3;\n"
207 " ld.param.u32 %r5, [__cudaparm_squared_chamfer_and_reduce_n];\n"
208 " setp.ge.u32 %p1, %r4, %r5;\n"
209 " @%p1 bra $Lt_1_18178;\n"
210 " add.u32 %r6, %r4, 256;\n"
211 " ld.param.u32 %r5, [__cudaparm_squared_chamfer_and_reduce_n];\n"
212 " add.u32 %r7, %r5, 256;\n"
213 " mov.u16 %rh1, %nctaid.x;\n"
214 " mul.wide.u16 %r8, %rh1, 512;\n"
215 " cvt.u64.u32 %rd1, %r4;\n"
216 " mul.wide.u32 %rd2, %r4, 4;\n"
217 " cvt.s64.u32 %rd3, %r8;\n"
218 " ld.param.u64 %rd4, [__cudaparm_squared_chamfer_and_reduce_g_idata_1];\n"
219 " add.u64 %rd5, %rd4, %rd2;\n"
220 " mul.wide.u32 %rd6, %r8, 4;\n"
221 " ld.param.u64 %rd7, [__cudaparm_squared_chamfer_and_reduce_g_idata_2];\n"
222 " add.u64 %rd8, %rd7, %rd2;\n"
223 " mov.f32 %f1, 0f00000000; // 0\n"
224 "$Lt_1_15106:\n"
225 " //<loop> Loop body line 105, nesting depth: 1, estimated iterations: unknown\n"
226 " .loc 3 114 0\n"
227 " ld.global.f32 %f2, [%rd5+0];\n"
228 " ld.global.f32 %f3, [%rd8+0];\n"
229 " mul.f32 %f4, %f2, %f3;\n"
230 " .loc 3 115 0\n"
231 " mad.f32 %f1, %f4, %f4, %f1;\n"
232 " .loc 3 105 0\n"
233 " ld.param.u32 %r5, [__cudaparm_squared_chamfer_and_reduce_n];\n"
234 " .loc 3 115 0\n"
235 " setp.ge.u32 %p2, %r6, %r5;\n"
236 " @%p2 bra $Lt_1_15362;\n"
237 " //<loop> Part of loop body line 105, head labeled $Lt_1_15106\n"
238 " .loc 3 127 0\n"
239 " ld.global.f32 %f5, [%rd5+1024];\n"
240 " ld.global.f32 %f6, [%rd8+1024];\n"
241 " mul.f32 %f7, %f5, %f6;\n"
242 " .loc 3 129 0\n"
243 " mad.f32 %f1, %f7, %f7, %f1;\n"
244 "$Lt_1_15362:\n"
245 " //<loop> Part of loop body line 105, head labeled $Lt_1_15106\n"
246 " add.u32 %r6, %r6, %r8;\n"
247 " add.u64 %rd8, %rd8, %rd6;\n"
248 " add.u64 %rd5, %rd5, %rd6;\n"
249 " setp.lt.u32 %p3, %r6, %r7;\n"
250 " @%p3 bra $Lt_1_15106;\n"
251 " bra.uni $Lt_1_14594;\n"
252 "$Lt_1_18178:\n"
253 " mov.f32 %f1, 0f00000000; // 0\n"
254 "$Lt_1_14594:\n"
255 " .loc 3 139 0\n"
256 " mov.f32 %f8, %f1;\n"
257 " mov.f32 %f9, %f8;\n"
258 " .loc 3 71 0\n"
259 " mov.u64 %rd9, __smem;\n"
260 " cvt.u64.u32 %rd10, %r3;\n"
261 " mul.wide.u32 %rd11, %r3, 4;\n"
262 " add.u64 %rd12, %rd9, %rd11;\n"
263 " st.volatile.shared.f32 [%rd12+0], %f8;\n"
264 " .loc 3 72 0\n"
265 " bar.sync 0;\n"
266 " mov.u32 %r9, 127;\n"
267 " setp.gt.u32 %p4, %r3, %r9;\n"
268 " @%p4 bra $Lt_1_16130;\n"
269 " .loc 3 76 0\n"
270 " ld.volatile.shared.f32 %f10, [%rd12+512];\n"
271 " add.f32 %f9, %f10, %f8;\n"
272 " st.volatile.shared.f32 [%rd12+0], %f9;\n"
273 "$Lt_1_16130:\n"
274 " bar.sync 0;\n"
275 " mov.u32 %r10, 63;\n"
276 " setp.gt.u32 %p5, %r3, %r10;\n"
277 " @%p5 bra $Lt_1_16642;\n"
278 " .loc 3 77 0\n"
279 " ld.volatile.shared.f32 %f11, [%rd12+256];\n"
280 " add.f32 %f9, %f11, %f9;\n"
281 " st.volatile.shared.f32 [%rd12+0], %f9;\n"
282 "$Lt_1_16642:\n"
283 " bar.sync 0;\n"
284 " mov.u32 %r11, 31;\n"
285 " setp.gt.u32 %p6, %r3, %r11;\n"
286 " @%p6 bra $Lt_1_17154;\n"
287 " .loc 3 83 0\n"
288 " ld.volatile.shared.f32 %f12, [%rd12+128];\n"
289 " add.f32 %f13, %f12, %f9;\n"
290 " st.volatile.shared.f32 [%rd12+0], %f13;\n"
291 " .loc 3 84 0\n"
292 " ld.volatile.shared.f32 %f14, [%rd12+64];\n"
293 " add.f32 %f15, %f14, %f13;\n"
294 " st.volatile.shared.f32 [%rd12+0], %f15;\n"
295 " .loc 3 85 0\n"
296 " ld.volatile.shared.f32 %f16, [%rd12+32];\n"
297 " add.f32 %f17, %f16, %f15;\n"
298 " st.volatile.shared.f32 [%rd12+0], %f17;\n"
299 " .loc 3 86 0\n"
300 " ld.volatile.shared.f32 %f18, [%rd12+16];\n"
301 " add.f32 %f19, %f18, %f17;\n"
302 " st.volatile.shared.f32 [%rd12+0], %f19;\n"
303 " .loc 3 87 0\n"
304 " ld.volatile.shared.f32 %f20, [%rd12+8];\n"
305 " add.f32 %f21, %f20, %f19;\n"
306 " st.volatile.shared.f32 [%rd12+0], %f21;\n"
307 " .loc 3 88 0\n"
308 " ld.volatile.shared.f32 %f22, [%rd12+4];\n"
309 " add.f32 %f9, %f22, %f21;\n"
310 " st.volatile.shared.f32 [%rd12+0], %f9;\n"
311 "$Lt_1_17154:\n"
312 " .loc 3 139 0\n"
313 " mov.u32 %r12, 0;\n"
314 " setp.ne.u32 %p7, %r3, %r12;\n"
315 " @%p7 bra $Lt_1_17666;\n"
316 " .loc 3 143 0\n"
317 " ld.shared.f32 %f23, [__smem+0];\n"
318 " ld.param.u64 %rd13, [__cudaparm_squared_chamfer_and_reduce_g_odata];\n"
319 " cvt.u64.u32 %rd14, %r1;\n"
320 " mul.wide.u32 %rd15, %r1, 4;\n"
321 " add.u64 %rd16, %rd13, %rd15;\n"
322 " st.global.f32 [%rd16+0], %f23;\n"
323 "$Lt_1_17666:\n"
324 " .loc 3 157 0\n"
325 " exit;\n"
326 "$LDWend_squared_chamfer_and_reduce:\n"
327 " } // squared_chamfer_and_reduce\n"
328 "\n"
329 " .entry reduce_float_1_true (\n"
330 " .param .u64 __cudaparm_reduce_float_1_true_g_idata,\n"
331 " .param .u64 __cudaparm_reduce_float_1_true_g_odata,\n"
332 " .param .u32 __cudaparm_reduce_float_1_true_n)\n"
333 " {\n"
334 " .reg .u16 %rh<3>;\n"
335 " .reg .u32 %r<10>;\n"
336 " .reg .u64 %rd<16>;\n"
337 " .reg .f32 %f<7>;\n"
338 " .reg .pred %p<5>;\n"
339 " .loc 3 372 0\n"
340 "$LDWbegin_reduce_float_1_true:\n"
341 " .loc 3 181 0\n"
342 " cvt.u32.u16 %r1, %ctaid.x;\n"
343 " mul24.lo.u32 %r2, %r1, 2;\n"
344 " cvt.u32.u16 %r3, %tid.x;\n"
345 " add.u32 %r4, %r2, %r3;\n"
346 " mov.s32 %r5, %r4;\n"
347 " ld.param.u32 %r6, [__cudaparm_reduce_float_1_true_n];\n"
348 " setp.ge.u32 %p1, %r4, %r6;\n"
349 " @%p1 bra $Lt_2_16642;\n"
350 " mov.u16 %rh1, %nctaid.x;\n"
351 " mul.wide.u16 %r7, %rh1, 2;\n"
352 " cvt.s64.u32 %rd1, %r7;\n"
353 " ld.param.u64 %rd2, [__cudaparm_reduce_float_1_true_g_idata];\n"
354 " cvt.u64.u32 %rd3, %r4;\n"
355 " mul.wide.u32 %rd4, %r4, 4;\n"
356 " add.u64 %rd5, %rd2, %rd4;\n"
357 " mul.wide.u32 %rd6, %r7, 4;\n"
358 " mov.f32 %f1, 0f00000000; // 0\n"
359 "$Lt_2_15618:\n"
360 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
361 " .loc 3 188 0\n"
362 " ld.global.f32 %f2, [%rd5+0];\n"
363 " add.f32 %f3, %f2, %f1;\n"
364 " .loc 3 191 0\n"
365 " ld.global.f32 %f4, [%rd5+4];\n"
366 " add.f32 %f1, %f4, %f3;\n"
367 " add.u32 %r5, %r7, %r5;\n"
368 " add.u64 %rd5, %rd5, %rd6;\n"
369 " .loc 3 181 0\n"
370 " ld.param.u32 %r6, [__cudaparm_reduce_float_1_true_n];\n"
371 " .loc 3 191 0\n"
372 " setp.lt.u32 %p2, %r5, %r6;\n"
373 " @%p2 bra $Lt_2_15618;\n"
374 " bra.uni $Lt_2_15106;\n"
375 "$Lt_2_16642:\n"
376 " mov.f32 %f1, 0f00000000; // 0\n"
377 "$Lt_2_15106:\n"
378 " .loc 3 71 0\n"
379 " mov.u64 %rd7, __smem;\n"
380 " cvt.u64.u32 %rd8, %r3;\n"
381 " mul.wide.u32 %rd9, %r3, 4;\n"
382 " add.u64 %rd10, %rd7, %rd9;\n"
383 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
384 " .loc 3 72 0\n"
385 " bar.sync 0;\n"
386 " .loc 3 195 0\n"
387 " mov.u32 %r8, 0;\n"
388 " setp.ne.u32 %p3, %r3, %r8;\n"
389 " @%p3 bra $Lt_2_16130;\n"
390 " .loc 3 199 0\n"
391 " ld.shared.f32 %f5, [__smem+0];\n"
392 " ld.param.u64 %rd11, [__cudaparm_reduce_float_1_true_g_odata];\n"
393 " cvt.u64.u32 %rd12, %r1;\n"
394 " mul.wide.u32 %rd13, %r1, 4;\n"
395 " add.u64 %rd14, %rd11, %rd13;\n"
396 " st.global.f32 [%rd14+0], %f5;\n"
397 "$Lt_2_16130:\n"
398 " .loc 3 375 0\n"
399 " exit;\n"
400 "$LDWend_reduce_float_1_true:\n"
401 " } // reduce_float_1_true\n"
402 "\n"
403 " .entry reduce_float_2_true (\n"
404 " .param .u64 __cudaparm_reduce_float_2_true_g_idata,\n"
405 " .param .u64 __cudaparm_reduce_float_2_true_g_odata,\n"
406 " .param .u32 __cudaparm_reduce_float_2_true_n)\n"
407 " {\n"
408 " .reg .u16 %rh<3>;\n"
409 " .reg .u32 %r<11>;\n"
410 " .reg .u64 %rd<16>;\n"
411 " .reg .f32 %f<9>;\n"
412 " .reg .pred %p<6>;\n"
413 " .loc 3 377 0\n"
414 "$LDWbegin_reduce_float_2_true:\n"
415 " .loc 3 181 0\n"
416 " cvt.u32.u16 %r1, %ctaid.x;\n"
417 " mul24.lo.u32 %r2, %r1, 4;\n"
418 " cvt.u32.u16 %r3, %tid.x;\n"
419 " add.u32 %r4, %r2, %r3;\n"
420 " mov.s32 %r5, %r4;\n"
421 " ld.param.u32 %r6, [__cudaparm_reduce_float_2_true_n];\n"
422 " setp.ge.u32 %p1, %r4, %r6;\n"
423 " @%p1 bra $Lt_3_16898;\n"
424 " mov.u16 %rh1, %nctaid.x;\n"
425 " mul.wide.u16 %r7, %rh1, 4;\n"
426 " cvt.s64.u32 %rd1, %r7;\n"
427 " ld.param.u64 %rd2, [__cudaparm_reduce_float_2_true_g_idata];\n"
428 " cvt.u64.u32 %rd3, %r4;\n"
429 " mul.wide.u32 %rd4, %r4, 4;\n"
430 " add.u64 %rd5, %rd2, %rd4;\n"
431 " mul.wide.u32 %rd6, %r7, 4;\n"
432 " mov.f32 %f1, 0f00000000; // 0\n"
433 "$Lt_3_15362:\n"
434 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
435 " .loc 3 188 0\n"
436 " ld.global.f32 %f2, [%rd5+0];\n"
437 " add.f32 %f3, %f2, %f1;\n"
438 " .loc 3 191 0\n"
439 " ld.global.f32 %f4, [%rd5+8];\n"
440 " add.f32 %f1, %f4, %f3;\n"
441 " add.u32 %r5, %r7, %r5;\n"
442 " add.u64 %rd5, %rd5, %rd6;\n"
443 " .loc 3 181 0\n"
444 " ld.param.u32 %r6, [__cudaparm_reduce_float_2_true_n];\n"
445 " .loc 3 191 0\n"
446 " setp.lt.u32 %p2, %r5, %r6;\n"
447 " @%p2 bra $Lt_3_15362;\n"
448 " bra.uni $Lt_3_14850;\n"
449 "$Lt_3_16898:\n"
450 " mov.f32 %f1, 0f00000000; // 0\n"
451 "$Lt_3_14850:\n"
452 " .loc 3 71 0\n"
453 " mov.u64 %rd7, __smem;\n"
454 " cvt.u64.u32 %rd8, %r3;\n"
455 " mul.wide.u32 %rd9, %r3, 4;\n"
456 " add.u64 %rd10, %rd7, %rd9;\n"
457 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
458 " .loc 3 72 0\n"
459 " bar.sync 0;\n"
460 " mov.u32 %r8, 31;\n"
461 " setp.gt.u32 %p3, %r3, %r8;\n"
462 " @%p3 bra $Lt_3_15874;\n"
463 " .loc 3 88 0\n"
464 " ld.volatile.shared.f32 %f5, [%rd10+4];\n"
465 " add.f32 %f6, %f5, %f1;\n"
466 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
467 "$Lt_3_15874:\n"
468 " .loc 3 195 0\n"
469 " mov.u32 %r9, 0;\n"
470 " setp.ne.u32 %p4, %r3, %r9;\n"
471 " @%p4 bra $Lt_3_16386;\n"
472 " .loc 3 199 0\n"
473 " ld.shared.f32 %f7, [__smem+0];\n"
474 " ld.param.u64 %rd11, [__cudaparm_reduce_float_2_true_g_odata];\n"
475 " cvt.u64.u32 %rd12, %r1;\n"
476 " mul.wide.u32 %rd13, %r1, 4;\n"
477 " add.u64 %rd14, %rd11, %rd13;\n"
478 " st.global.f32 [%rd14+0], %f7;\n"
479 "$Lt_3_16386:\n"
480 " .loc 3 380 0\n"
481 " exit;\n"
482 "$LDWend_reduce_float_2_true:\n"
483 " } // reduce_float_2_true\n"
484 "\n"
485 " .entry reduce_float_4_true (\n"
486 " .param .u64 __cudaparm_reduce_float_4_true_g_idata,\n"
487 " .param .u64 __cudaparm_reduce_float_4_true_g_odata,\n"
488 " .param .u32 __cudaparm_reduce_float_4_true_n)\n"
489 " {\n"
490 " .reg .u16 %rh<3>;\n"
491 " .reg .u32 %r<11>;\n"
492 " .reg .u64 %rd<16>;\n"
493 " .reg .f32 %f<11>;\n"
494 " .reg .pred %p<6>;\n"
495 " .loc 3 382 0\n"
496 "$LDWbegin_reduce_float_4_true:\n"
497 " .loc 3 181 0\n"
498 " cvt.u32.u16 %r1, %ctaid.x;\n"
499 " mul24.lo.u32 %r2, %r1, 8;\n"
500 " cvt.u32.u16 %r3, %tid.x;\n"
501 " add.u32 %r4, %r2, %r3;\n"
502 " mov.s32 %r5, %r4;\n"
503 " ld.param.u32 %r6, [__cudaparm_reduce_float_4_true_n];\n"
504 " setp.ge.u32 %p1, %r4, %r6;\n"
505 " @%p1 bra $Lt_4_16642;\n"
506 " mov.u16 %rh1, %nctaid.x;\n"
507 " mul.wide.u16 %r7, %rh1, 8;\n"
508 " cvt.s64.u32 %rd1, %r7;\n"
509 " ld.param.u64 %rd2, [__cudaparm_reduce_float_4_true_g_idata];\n"
510 " cvt.u64.u32 %rd3, %r4;\n"
511 " mul.wide.u32 %rd4, %r4, 4;\n"
512 " add.u64 %rd5, %rd2, %rd4;\n"
513 " mul.wide.u32 %rd6, %r7, 4;\n"
514 " mov.f32 %f1, 0f00000000; // 0\n"
515 "$Lt_4_15106:\n"
516 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
517 " .loc 3 188 0\n"
518 " ld.global.f32 %f2, [%rd5+0];\n"
519 " add.f32 %f3, %f2, %f1;\n"
520 " .loc 3 191 0\n"
521 " ld.global.f32 %f4, [%rd5+16];\n"
522 " add.f32 %f1, %f4, %f3;\n"
523 " add.u32 %r5, %r7, %r5;\n"
524 " add.u64 %rd5, %rd5, %rd6;\n"
525 " .loc 3 181 0\n"
526 " ld.param.u32 %r6, [__cudaparm_reduce_float_4_true_n];\n"
527 " .loc 3 191 0\n"
528 " setp.lt.u32 %p2, %r5, %r6;\n"
529 " @%p2 bra $Lt_4_15106;\n"
530 " bra.uni $Lt_4_14594;\n"
531 "$Lt_4_16642:\n"
532 " mov.f32 %f1, 0f00000000; // 0\n"
533 "$Lt_4_14594:\n"
534 " .loc 3 71 0\n"
535 " mov.u64 %rd7, __smem;\n"
536 " cvt.u64.u32 %rd8, %r3;\n"
537 " mul.wide.u32 %rd9, %r3, 4;\n"
538 " add.u64 %rd10, %rd7, %rd9;\n"
539 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
540 " .loc 3 72 0\n"
541 " bar.sync 0;\n"
542 " mov.u32 %r8, 31;\n"
543 " setp.gt.u32 %p3, %r3, %r8;\n"
544 " @%p3 bra $Lt_4_15618;\n"
545 " .loc 3 87 0\n"
546 " ld.volatile.shared.f32 %f5, [%rd10+8];\n"
547 " add.f32 %f6, %f5, %f1;\n"
548 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
549 " .loc 3 88 0\n"
550 " ld.volatile.shared.f32 %f7, [%rd10+4];\n"
551 " add.f32 %f8, %f7, %f6;\n"
552 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
553 "$Lt_4_15618:\n"
554 " .loc 3 195 0\n"
555 " mov.u32 %r9, 0;\n"
556 " setp.ne.u32 %p4, %r3, %r9;\n"
557 " @%p4 bra $Lt_4_16130;\n"
558 " .loc 3 199 0\n"
559 " ld.shared.f32 %f9, [__smem+0];\n"
560 " ld.param.u64 %rd11, [__cudaparm_reduce_float_4_true_g_odata];\n"
561 " cvt.u64.u32 %rd12, %r1;\n"
562 " mul.wide.u32 %rd13, %r1, 4;\n"
563 " add.u64 %rd14, %rd11, %rd13;\n"
564 " st.global.f32 [%rd14+0], %f9;\n"
565 "$Lt_4_16130:\n"
566 " .loc 3 385 0\n"
567 " exit;\n"
568 "$LDWend_reduce_float_4_true:\n"
569 " } // reduce_float_4_true\n"
570 "\n"
571 " .entry reduce_float_8_true (\n"
572 " .param .u64 __cudaparm_reduce_float_8_true_g_idata,\n"
573 " .param .u64 __cudaparm_reduce_float_8_true_g_odata,\n"
574 " .param .u32 __cudaparm_reduce_float_8_true_n)\n"
575 " {\n"
576 " .reg .u16 %rh<3>;\n"
577 " .reg .u32 %r<11>;\n"
578 " .reg .u64 %rd<16>;\n"
579 " .reg .f32 %f<13>;\n"
580 " .reg .pred %p<6>;\n"
581 " .loc 3 387 0\n"
582 "$LDWbegin_reduce_float_8_true:\n"
583 " .loc 3 181 0\n"
584 " cvt.u32.u16 %r1, %ctaid.x;\n"
585 " mul24.lo.u32 %r2, %r1, 16;\n"
586 " cvt.u32.u16 %r3, %tid.x;\n"
587 " add.u32 %r4, %r2, %r3;\n"
588 " mov.s32 %r5, %r4;\n"
589 " ld.param.u32 %r6, [__cudaparm_reduce_float_8_true_n];\n"
590 " setp.ge.u32 %p1, %r4, %r6;\n"
591 " @%p1 bra $Lt_5_16386;\n"
592 " mov.u16 %rh1, %nctaid.x;\n"
593 " mul.wide.u16 %r7, %rh1, 16;\n"
594 " cvt.s64.u32 %rd1, %r7;\n"
595 " ld.param.u64 %rd2, [__cudaparm_reduce_float_8_true_g_idata];\n"
596 " cvt.u64.u32 %rd3, %r4;\n"
597 " mul.wide.u32 %rd4, %r4, 4;\n"
598 " add.u64 %rd5, %rd2, %rd4;\n"
599 " mul.wide.u32 %rd6, %r7, 4;\n"
600 " mov.f32 %f1, 0f00000000; // 0\n"
601 "$Lt_5_14850:\n"
602 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
603 " .loc 3 188 0\n"
604 " ld.global.f32 %f2, [%rd5+0];\n"
605 " add.f32 %f3, %f2, %f1;\n"
606 " .loc 3 191 0\n"
607 " ld.global.f32 %f4, [%rd5+32];\n"
608 " add.f32 %f1, %f4, %f3;\n"
609 " add.u32 %r5, %r7, %r5;\n"
610 " add.u64 %rd5, %rd5, %rd6;\n"
611 " .loc 3 181 0\n"
612 " ld.param.u32 %r6, [__cudaparm_reduce_float_8_true_n];\n"
613 " .loc 3 191 0\n"
614 " setp.lt.u32 %p2, %r5, %r6;\n"
615 " @%p2 bra $Lt_5_14850;\n"
616 " bra.uni $Lt_5_14338;\n"
617 "$Lt_5_16386:\n"
618 " mov.f32 %f1, 0f00000000; // 0\n"
619 "$Lt_5_14338:\n"
620 " .loc 3 71 0\n"
621 " mov.u64 %rd7, __smem;\n"
622 " cvt.u64.u32 %rd8, %r3;\n"
623 " mul.wide.u32 %rd9, %r3, 4;\n"
624 " add.u64 %rd10, %rd7, %rd9;\n"
625 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
626 " .loc 3 72 0\n"
627 " bar.sync 0;\n"
628 " mov.u32 %r8, 31;\n"
629 " setp.gt.u32 %p3, %r3, %r8;\n"
630 " @%p3 bra $Lt_5_15362;\n"
631 " .loc 3 86 0\n"
632 " ld.volatile.shared.f32 %f5, [%rd10+16];\n"
633 " add.f32 %f6, %f5, %f1;\n"
634 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
635 " .loc 3 87 0\n"
636 " ld.volatile.shared.f32 %f7, [%rd10+8];\n"
637 " add.f32 %f8, %f7, %f6;\n"
638 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
639 " .loc 3 88 0\n"
640 " ld.volatile.shared.f32 %f9, [%rd10+4];\n"
641 " add.f32 %f10, %f9, %f8;\n"
642 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
643 "$Lt_5_15362:\n"
644 " .loc 3 195 0\n"
645 " mov.u32 %r9, 0;\n"
646 " setp.ne.u32 %p4, %r3, %r9;\n"
647 " @%p4 bra $Lt_5_15874;\n"
648 " .loc 3 199 0\n"
649 " ld.shared.f32 %f11, [__smem+0];\n"
650 " ld.param.u64 %rd11, [__cudaparm_reduce_float_8_true_g_odata];\n"
651 " cvt.u64.u32 %rd12, %r1;\n"
652 " mul.wide.u32 %rd13, %r1, 4;\n"
653 " add.u64 %rd14, %rd11, %rd13;\n"
654 " st.global.f32 [%rd14+0], %f11;\n"
655 "$Lt_5_15874:\n"
656 " .loc 3 390 0\n"
657 " exit;\n"
658 "$LDWend_reduce_float_8_true:\n"
659 " } // reduce_float_8_true\n"
660 "\n"
661 " .entry reduce_float_16_true (\n"
662 " .param .u64 __cudaparm_reduce_float_16_true_g_idata,\n"
663 " .param .u64 __cudaparm_reduce_float_16_true_g_odata,\n"
664 " .param .u32 __cudaparm_reduce_float_16_true_n)\n"
665 " {\n"
666 " .reg .u16 %rh<3>;\n"
667 " .reg .u32 %r<11>;\n"
668 " .reg .u64 %rd<16>;\n"
669 " .reg .f32 %f<15>;\n"
670 " .reg .pred %p<6>;\n"
671 " .loc 3 392 0\n"
672 "$LDWbegin_reduce_float_16_true:\n"
673 " .loc 3 181 0\n"
674 " cvt.u32.u16 %r1, %ctaid.x;\n"
675 " mul24.lo.u32 %r2, %r1, 32;\n"
676 " cvt.u32.u16 %r3, %tid.x;\n"
677 " add.u32 %r4, %r2, %r3;\n"
678 " mov.s32 %r5, %r4;\n"
679 " ld.param.u32 %r6, [__cudaparm_reduce_float_16_true_n];\n"
680 " setp.ge.u32 %p1, %r4, %r6;\n"
681 " @%p1 bra $Lt_6_16130;\n"
682 " mov.u16 %rh1, %nctaid.x;\n"
683 " mul.wide.u16 %r7, %rh1, 32;\n"
684 " cvt.s64.u32 %rd1, %r7;\n"
685 " ld.param.u64 %rd2, [__cudaparm_reduce_float_16_true_g_idata];\n"
686 " cvt.u64.u32 %rd3, %r4;\n"
687 " mul.wide.u32 %rd4, %r4, 4;\n"
688 " add.u64 %rd5, %rd2, %rd4;\n"
689 " mul.wide.u32 %rd6, %r7, 4;\n"
690 " mov.f32 %f1, 0f00000000; // 0\n"
691 "$Lt_6_14594:\n"
692 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
693 " .loc 3 188 0\n"
694 " ld.global.f32 %f2, [%rd5+0];\n"
695 " add.f32 %f3, %f2, %f1;\n"
696 " .loc 3 191 0\n"
697 " ld.global.f32 %f4, [%rd5+64];\n"
698 " add.f32 %f1, %f4, %f3;\n"
699 " add.u32 %r5, %r7, %r5;\n"
700 " add.u64 %rd5, %rd5, %rd6;\n"
701 " .loc 3 181 0\n"
702 " ld.param.u32 %r6, [__cudaparm_reduce_float_16_true_n];\n"
703 " .loc 3 191 0\n"
704 " setp.lt.u32 %p2, %r5, %r6;\n"
705 " @%p2 bra $Lt_6_14594;\n"
706 " bra.uni $Lt_6_14082;\n"
707 "$Lt_6_16130:\n"
708 " mov.f32 %f1, 0f00000000; // 0\n"
709 "$Lt_6_14082:\n"
710 " .loc 3 71 0\n"
711 " mov.u64 %rd7, __smem;\n"
712 " cvt.u64.u32 %rd8, %r3;\n"
713 " mul.wide.u32 %rd9, %r3, 4;\n"
714 " add.u64 %rd10, %rd7, %rd9;\n"
715 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
716 " .loc 3 72 0\n"
717 " bar.sync 0;\n"
718 " mov.u32 %r8, 31;\n"
719 " setp.gt.u32 %p3, %r3, %r8;\n"
720 " @%p3 bra $Lt_6_15106;\n"
721 " .loc 3 85 0\n"
722 " ld.volatile.shared.f32 %f5, [%rd10+32];\n"
723 " add.f32 %f6, %f5, %f1;\n"
724 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
725 " .loc 3 86 0\n"
726 " ld.volatile.shared.f32 %f7, [%rd10+16];\n"
727 " add.f32 %f8, %f7, %f6;\n"
728 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
729 " .loc 3 87 0\n"
730 " ld.volatile.shared.f32 %f9, [%rd10+8];\n"
731 " add.f32 %f10, %f9, %f8;\n"
732 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
733 " .loc 3 88 0\n"
734 " ld.volatile.shared.f32 %f11, [%rd10+4];\n"
735 " add.f32 %f12, %f11, %f10;\n"
736 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
737 "$Lt_6_15106:\n"
738 " .loc 3 195 0\n"
739 " mov.u32 %r9, 0;\n"
740 " setp.ne.u32 %p4, %r3, %r9;\n"
741 " @%p4 bra $Lt_6_15618;\n"
742 " .loc 3 199 0\n"
743 " ld.shared.f32 %f13, [__smem+0];\n"
744 " ld.param.u64 %rd11, [__cudaparm_reduce_float_16_true_g_odata];\n"
745 " cvt.u64.u32 %rd12, %r1;\n"
746 " mul.wide.u32 %rd13, %r1, 4;\n"
747 " add.u64 %rd14, %rd11, %rd13;\n"
748 " st.global.f32 [%rd14+0], %f13;\n"
749 "$Lt_6_15618:\n"
750 " .loc 3 395 0\n"
751 " exit;\n"
752 "$LDWend_reduce_float_16_true:\n"
753 " } // reduce_float_16_true\n"
754 "\n"
755 " .entry reduce_float_32_true (\n"
756 " .param .u64 __cudaparm_reduce_float_32_true_g_idata,\n"
757 " .param .u64 __cudaparm_reduce_float_32_true_g_odata,\n"
758 " .param .u32 __cudaparm_reduce_float_32_true_n)\n"
759 " {\n"
760 " .reg .u16 %rh<3>;\n"
761 " .reg .u32 %r<11>;\n"
762 " .reg .u64 %rd<16>;\n"
763 " .reg .f32 %f<17>;\n"
764 " .reg .pred %p<6>;\n"
765 " .loc 3 397 0\n"
766 "$LDWbegin_reduce_float_32_true:\n"
767 " .loc 3 181 0\n"
768 " cvt.u32.u16 %r1, %ctaid.x;\n"
769 " mul24.lo.u32 %r2, %r1, 64;\n"
770 " cvt.u32.u16 %r3, %tid.x;\n"
771 " add.u32 %r4, %r2, %r3;\n"
772 " mov.s32 %r5, %r4;\n"
773 " ld.param.u32 %r6, [__cudaparm_reduce_float_32_true_n];\n"
774 " setp.ge.u32 %p1, %r4, %r6;\n"
775 " @%p1 bra $Lt_7_15874;\n"
776 " mov.u16 %rh1, %nctaid.x;\n"
777 " mul.wide.u16 %r7, %rh1, 64;\n"
778 " cvt.s64.u32 %rd1, %r7;\n"
779 " ld.param.u64 %rd2, [__cudaparm_reduce_float_32_true_g_idata];\n"
780 " cvt.u64.u32 %rd3, %r4;\n"
781 " mul.wide.u32 %rd4, %r4, 4;\n"
782 " add.u64 %rd5, %rd2, %rd4;\n"
783 " mul.wide.u32 %rd6, %r7, 4;\n"
784 " mov.f32 %f1, 0f00000000; // 0\n"
785 "$Lt_7_14338:\n"
786 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
787 " .loc 3 188 0\n"
788 " ld.global.f32 %f2, [%rd5+0];\n"
789 " add.f32 %f3, %f2, %f1;\n"
790 " .loc 3 191 0\n"
791 " ld.global.f32 %f4, [%rd5+128];\n"
792 " add.f32 %f1, %f4, %f3;\n"
793 " add.u32 %r5, %r7, %r5;\n"
794 " add.u64 %rd5, %rd5, %rd6;\n"
795 " .loc 3 181 0\n"
796 " ld.param.u32 %r6, [__cudaparm_reduce_float_32_true_n];\n"
797 " .loc 3 191 0\n"
798 " setp.lt.u32 %p2, %r5, %r6;\n"
799 " @%p2 bra $Lt_7_14338;\n"
800 " bra.uni $Lt_7_13826;\n"
801 "$Lt_7_15874:\n"
802 " mov.f32 %f1, 0f00000000; // 0\n"
803 "$Lt_7_13826:\n"
804 " .loc 3 71 0\n"
805 " mov.u64 %rd7, __smem;\n"
806 " cvt.u64.u32 %rd8, %r3;\n"
807 " mul.wide.u32 %rd9, %r3, 4;\n"
808 " add.u64 %rd10, %rd7, %rd9;\n"
809 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
810 " .loc 3 72 0\n"
811 " bar.sync 0;\n"
812 " mov.u32 %r8, 31;\n"
813 " setp.gt.u32 %p3, %r3, %r8;\n"
814 " @%p3 bra $Lt_7_14850;\n"
815 " .loc 3 84 0\n"
816 " ld.volatile.shared.f32 %f5, [%rd10+64];\n"
817 " add.f32 %f6, %f5, %f1;\n"
818 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
819 " .loc 3 85 0\n"
820 " ld.volatile.shared.f32 %f7, [%rd10+32];\n"
821 " add.f32 %f8, %f7, %f6;\n"
822 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
823 " .loc 3 86 0\n"
824 " ld.volatile.shared.f32 %f9, [%rd10+16];\n"
825 " add.f32 %f10, %f9, %f8;\n"
826 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
827 " .loc 3 87 0\n"
828 " ld.volatile.shared.f32 %f11, [%rd10+8];\n"
829 " add.f32 %f12, %f11, %f10;\n"
830 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
831 " .loc 3 88 0\n"
832 " ld.volatile.shared.f32 %f13, [%rd10+4];\n"
833 " add.f32 %f14, %f13, %f12;\n"
834 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
835 "$Lt_7_14850:\n"
836 " .loc 3 195 0\n"
837 " mov.u32 %r9, 0;\n"
838 " setp.ne.u32 %p4, %r3, %r9;\n"
839 " @%p4 bra $Lt_7_15362;\n"
840 " .loc 3 199 0\n"
841 " ld.shared.f32 %f15, [__smem+0];\n"
842 " ld.param.u64 %rd11, [__cudaparm_reduce_float_32_true_g_odata];\n"
843 " cvt.u64.u32 %rd12, %r1;\n"
844 " mul.wide.u32 %rd13, %r1, 4;\n"
845 " add.u64 %rd14, %rd11, %rd13;\n"
846 " st.global.f32 [%rd14+0], %f15;\n"
847 "$Lt_7_15362:\n"
848 " .loc 3 400 0\n"
849 " exit;\n"
850 "$LDWend_reduce_float_32_true:\n"
851 " } // reduce_float_32_true\n"
852 "\n"
853 " .entry reduce_float_64_true (\n"
854 " .param .u64 __cudaparm_reduce_float_64_true_g_idata,\n"
855 " .param .u64 __cudaparm_reduce_float_64_true_g_odata,\n"
856 " .param .u32 __cudaparm_reduce_float_64_true_n)\n"
857 " {\n"
858 " .reg .u16 %rh<3>;\n"
859 " .reg .u32 %r<11>;\n"
860 " .reg .u64 %rd<16>;\n"
861 " .reg .f32 %f<19>;\n"
862 " .reg .pred %p<6>;\n"
863 " .loc 3 402 0\n"
864 "$LDWbegin_reduce_float_64_true:\n"
865 " .loc 3 181 0\n"
866 " cvt.u32.u16 %r1, %ctaid.x;\n"
867 " mul24.lo.u32 %r2, %r1, 128;\n"
868 " cvt.u32.u16 %r3, %tid.x;\n"
869 " add.u32 %r4, %r2, %r3;\n"
870 " mov.s32 %r5, %r4;\n"
871 " ld.param.u32 %r6, [__cudaparm_reduce_float_64_true_n];\n"
872 " setp.ge.u32 %p1, %r4, %r6;\n"
873 " @%p1 bra $Lt_8_15618;\n"
874 " mov.u16 %rh1, %nctaid.x;\n"
875 " mul.wide.u16 %r7, %rh1, 128;\n"
876 " cvt.s64.u32 %rd1, %r7;\n"
877 " ld.param.u64 %rd2, [__cudaparm_reduce_float_64_true_g_idata];\n"
878 " cvt.u64.u32 %rd3, %r4;\n"
879 " mul.wide.u32 %rd4, %r4, 4;\n"
880 " add.u64 %rd5, %rd2, %rd4;\n"
881 " mul.wide.u32 %rd6, %r7, 4;\n"
882 " mov.f32 %f1, 0f00000000; // 0\n"
883 "$Lt_8_14082:\n"
884 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
885 " .loc 3 188 0\n"
886 " ld.global.f32 %f2, [%rd5+0];\n"
887 " add.f32 %f3, %f2, %f1;\n"
888 " .loc 3 191 0\n"
889 " ld.global.f32 %f4, [%rd5+256];\n"
890 " add.f32 %f1, %f4, %f3;\n"
891 " add.u32 %r5, %r7, %r5;\n"
892 " add.u64 %rd5, %rd5, %rd6;\n"
893 " .loc 3 181 0\n"
894 " ld.param.u32 %r6, [__cudaparm_reduce_float_64_true_n];\n"
895 " .loc 3 191 0\n"
896 " setp.lt.u32 %p2, %r5, %r6;\n"
897 " @%p2 bra $Lt_8_14082;\n"
898 " bra.uni $Lt_8_13570;\n"
899 "$Lt_8_15618:\n"
900 " mov.f32 %f1, 0f00000000; // 0\n"
901 "$Lt_8_13570:\n"
902 " .loc 3 71 0\n"
903 " mov.u64 %rd7, __smem;\n"
904 " cvt.u64.u32 %rd8, %r3;\n"
905 " mul.wide.u32 %rd9, %r3, 4;\n"
906 " add.u64 %rd10, %rd7, %rd9;\n"
907 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
908 " .loc 3 72 0\n"
909 " bar.sync 0;\n"
910 " mov.u32 %r8, 31;\n"
911 " setp.gt.u32 %p3, %r3, %r8;\n"
912 " @%p3 bra $Lt_8_14594;\n"
913 " .loc 3 83 0\n"
914 " ld.volatile.shared.f32 %f5, [%rd10+128];\n"
915 " add.f32 %f6, %f5, %f1;\n"
916 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
917 " .loc 3 84 0\n"
918 " ld.volatile.shared.f32 %f7, [%rd10+64];\n"
919 " add.f32 %f8, %f7, %f6;\n"
920 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
921 " .loc 3 85 0\n"
922 " ld.volatile.shared.f32 %f9, [%rd10+32];\n"
923 " add.f32 %f10, %f9, %f8;\n"
924 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
925 " .loc 3 86 0\n"
926 " ld.volatile.shared.f32 %f11, [%rd10+16];\n"
927 " add.f32 %f12, %f11, %f10;\n"
928 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
929 " .loc 3 87 0\n"
930 " ld.volatile.shared.f32 %f13, [%rd10+8];\n"
931 " add.f32 %f14, %f13, %f12;\n"
932 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
933 " .loc 3 88 0\n"
934 " ld.volatile.shared.f32 %f15, [%rd10+4];\n"
935 " add.f32 %f16, %f15, %f14;\n"
936 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
937 "$Lt_8_14594:\n"
938 " .loc 3 195 0\n"
939 " mov.u32 %r9, 0;\n"
940 " setp.ne.u32 %p4, %r3, %r9;\n"
941 " @%p4 bra $Lt_8_15106;\n"
942 " .loc 3 199 0\n"
943 " ld.shared.f32 %f17, [__smem+0];\n"
944 " ld.param.u64 %rd11, [__cudaparm_reduce_float_64_true_g_odata];\n"
945 " cvt.u64.u32 %rd12, %r1;\n"
946 " mul.wide.u32 %rd13, %r1, 4;\n"
947 " add.u64 %rd14, %rd11, %rd13;\n"
948 " st.global.f32 [%rd14+0], %f17;\n"
949 "$Lt_8_15106:\n"
950 " .loc 3 405 0\n"
951 " exit;\n"
952 "$LDWend_reduce_float_64_true:\n"
953 " } // reduce_float_64_true\n"
954 "\n"
955 " .entry reduce_float_128_true (\n"
956 " .param .u64 __cudaparm_reduce_float_128_true_g_idata,\n"
957 " .param .u64 __cudaparm_reduce_float_128_true_g_odata,\n"
958 " .param .u32 __cudaparm_reduce_float_128_true_n)\n"
959 " {\n"
960 " .reg .u16 %rh<3>;\n"
961 " .reg .u32 %r<12>;\n"
962 " .reg .u64 %rd<16>;\n"
963 " .reg .f32 %f<21>;\n"
964 " .reg .pred %p<7>;\n"
965 " .loc 3 407 0\n"
966 "$LDWbegin_reduce_float_128_true:\n"
967 " .loc 3 181 0\n"
968 " cvt.u32.u16 %r1, %ctaid.x;\n"
969 " mul.lo.u32 %r2, %r1, 256;\n"
970 " cvt.u32.u16 %r3, %tid.x;\n"
971 " add.u32 %r4, %r2, %r3;\n"
972 " mov.s32 %r5, %r4;\n"
973 " ld.param.u32 %r6, [__cudaparm_reduce_float_128_true_n];\n"
974 " setp.ge.u32 %p1, %r4, %r6;\n"
975 " @%p1 bra $Lt_9_15874;\n"
976 " mov.u16 %rh1, %nctaid.x;\n"
977 " mul.wide.u16 %r7, %rh1, 256;\n"
978 " cvt.s64.u32 %rd1, %r7;\n"
979 " ld.param.u64 %rd2, [__cudaparm_reduce_float_128_true_g_idata];\n"
980 " cvt.u64.u32 %rd3, %r4;\n"
981 " mul.wide.u32 %rd4, %r4, 4;\n"
982 " add.u64 %rd5, %rd2, %rd4;\n"
983 " mul.wide.u32 %rd6, %r7, 4;\n"
984 " mov.f32 %f1, 0f00000000; // 0\n"
985 "$Lt_9_13826:\n"
986 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
987 " .loc 3 188 0\n"
988 " ld.global.f32 %f2, [%rd5+0];\n"
989 " add.f32 %f3, %f2, %f1;\n"
990 " .loc 3 191 0\n"
991 " ld.global.f32 %f4, [%rd5+512];\n"
992 " add.f32 %f1, %f4, %f3;\n"
993 " add.u32 %r5, %r7, %r5;\n"
994 " add.u64 %rd5, %rd5, %rd6;\n"
995 " .loc 3 181 0\n"
996 " ld.param.u32 %r6, [__cudaparm_reduce_float_128_true_n];\n"
997 " .loc 3 191 0\n"
998 " setp.lt.u32 %p2, %r5, %r6;\n"
999 " @%p2 bra $Lt_9_13826;\n"
1000 " bra.uni $Lt_9_13314;\n"
1001 "$Lt_9_15874:\n"
1002 " mov.f32 %f1, 0f00000000; // 0\n"
1003 "$Lt_9_13314:\n"
1004 " .loc 3 195 0\n"
1005 " mov.f32 %f5, %f1;\n"
1006 " mov.f32 %f6, %f5;\n"
1007 " .loc 3 71 0\n"
1008 " mov.u64 %rd7, __smem;\n"
1009 " cvt.u64.u32 %rd8, %r3;\n"
1010 " mul.wide.u32 %rd9, %r3, 4;\n"
1011 " add.u64 %rd10, %rd7, %rd9;\n"
1012 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1013 " .loc 3 72 0\n"
1014 " bar.sync 0;\n"
1015 " mov.u32 %r8, 63;\n"
1016 " setp.gt.u32 %p3, %r3, %r8;\n"
1017 " @%p3 bra $Lt_9_14338;\n"
1018 " .loc 3 77 0\n"
1019 " ld.volatile.shared.f32 %f7, [%rd10+256];\n"
1020 " add.f32 %f6, %f7, %f5;\n"
1021 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1022 "$Lt_9_14338:\n"
1023 " bar.sync 0;\n"
1024 " mov.u32 %r9, 31;\n"
1025 " setp.gt.u32 %p4, %r3, %r9;\n"
1026 " @%p4 bra $Lt_9_14850;\n"
1027 " .loc 3 83 0\n"
1028 " ld.volatile.shared.f32 %f8, [%rd10+128];\n"
1029 " add.f32 %f9, %f8, %f6;\n"
1030 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1031 " .loc 3 84 0\n"
1032 " ld.volatile.shared.f32 %f10, [%rd10+64];\n"
1033 " add.f32 %f11, %f10, %f9;\n"
1034 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1035 " .loc 3 85 0\n"
1036 " ld.volatile.shared.f32 %f12, [%rd10+32];\n"
1037 " add.f32 %f13, %f12, %f11;\n"
1038 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1039 " .loc 3 86 0\n"
1040 " ld.volatile.shared.f32 %f14, [%rd10+16];\n"
1041 " add.f32 %f15, %f14, %f13;\n"
1042 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
1043 " .loc 3 87 0\n"
1044 " ld.volatile.shared.f32 %f16, [%rd10+8];\n"
1045 " add.f32 %f17, %f16, %f15;\n"
1046 " st.volatile.shared.f32 [%rd10+0], %f17;\n"
1047 " .loc 3 88 0\n"
1048 " ld.volatile.shared.f32 %f18, [%rd10+4];\n"
1049 " add.f32 %f6, %f18, %f17;\n"
1050 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1051 "$Lt_9_14850:\n"
1052 " .loc 3 195 0\n"
1053 " mov.u32 %r10, 0;\n"
1054 " setp.ne.u32 %p5, %r3, %r10;\n"
1055 " @%p5 bra $Lt_9_15362;\n"
1056 " .loc 3 199 0\n"
1057 " ld.shared.f32 %f19, [__smem+0];\n"
1058 " ld.param.u64 %rd11, [__cudaparm_reduce_float_128_true_g_odata];\n"
1059 " cvt.u64.u32 %rd12, %r1;\n"
1060 " mul.wide.u32 %rd13, %r1, 4;\n"
1061 " add.u64 %rd14, %rd11, %rd13;\n"
1062 " st.global.f32 [%rd14+0], %f19;\n"
1063 "$Lt_9_15362:\n"
1064 " .loc 3 410 0\n"
1065 " exit;\n"
1066 "$LDWend_reduce_float_128_true:\n"
1067 " } // reduce_float_128_true\n"
1068 "\n"
1069 " .entry reduce_float_256_true (\n"
1070 " .param .u64 __cudaparm_reduce_float_256_true_g_idata,\n"
1071 " .param .u64 __cudaparm_reduce_float_256_true_g_odata,\n"
1072 " .param .u32 __cudaparm_reduce_float_256_true_n)\n"
1073 " {\n"
1074 " .reg .u16 %rh<3>;\n"
1075 " .reg .u32 %r<13>;\n"
1076 " .reg .u64 %rd<16>;\n"
1077 " .reg .f32 %f<22>;\n"
1078 " .reg .pred %p<8>;\n"
1079 " .loc 3 412 0\n"
1080 "$LDWbegin_reduce_float_256_true:\n"
1081 " .loc 3 181 0\n"
1082 " cvt.u32.u16 %r1, %ctaid.x;\n"
1083 " mul.lo.u32 %r2, %r1, 512;\n"
1084 " cvt.u32.u16 %r3, %tid.x;\n"
1085 " add.u32 %r4, %r2, %r3;\n"
1086 " mov.s32 %r5, %r4;\n"
1087 " ld.param.u32 %r6, [__cudaparm_reduce_float_256_true_n];\n"
1088 " setp.ge.u32 %p1, %r4, %r6;\n"
1089 " @%p1 bra $Lt_10_16130;\n"
1090 " mov.u16 %rh1, %nctaid.x;\n"
1091 " mul.wide.u16 %r7, %rh1, 512;\n"
1092 " cvt.s64.u32 %rd1, %r7;\n"
1093 " ld.param.u64 %rd2, [__cudaparm_reduce_float_256_true_g_idata];\n"
1094 " cvt.u64.u32 %rd3, %r4;\n"
1095 " mul.wide.u32 %rd4, %r4, 4;\n"
1096 " add.u64 %rd5, %rd2, %rd4;\n"
1097 " mul.wide.u32 %rd6, %r7, 4;\n"
1098 " mov.f32 %f1, 0f00000000; // 0\n"
1099 "$Lt_10_13570:\n"
1100 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1101 " .loc 3 188 0\n"
1102 " ld.global.f32 %f2, [%rd5+0];\n"
1103 " add.f32 %f3, %f2, %f1;\n"
1104 " .loc 3 191 0\n"
1105 " ld.global.f32 %f4, [%rd5+1024];\n"
1106 " add.f32 %f1, %f4, %f3;\n"
1107 " add.u32 %r5, %r7, %r5;\n"
1108 " add.u64 %rd5, %rd5, %rd6;\n"
1109 " .loc 3 181 0\n"
1110 " ld.param.u32 %r6, [__cudaparm_reduce_float_256_true_n];\n"
1111 " .loc 3 191 0\n"
1112 " setp.lt.u32 %p2, %r5, %r6;\n"
1113 " @%p2 bra $Lt_10_13570;\n"
1114 " bra.uni $Lt_10_13058;\n"
1115 "$Lt_10_16130:\n"
1116 " mov.f32 %f1, 0f00000000; // 0\n"
1117 "$Lt_10_13058:\n"
1118 " .loc 3 195 0\n"
1119 " mov.f32 %f5, %f1;\n"
1120 " mov.f32 %f6, %f5;\n"
1121 " .loc 3 71 0\n"
1122 " mov.u64 %rd7, __smem;\n"
1123 " cvt.u64.u32 %rd8, %r3;\n"
1124 " mul.wide.u32 %rd9, %r3, 4;\n"
1125 " add.u64 %rd10, %rd7, %rd9;\n"
1126 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1127 " .loc 3 72 0\n"
1128 " bar.sync 0;\n"
1129 " mov.u32 %r8, 127;\n"
1130 " setp.gt.u32 %p3, %r3, %r8;\n"
1131 " @%p3 bra $Lt_10_14082;\n"
1132 " .loc 3 76 0\n"
1133 " ld.volatile.shared.f32 %f7, [%rd10+512];\n"
1134 " add.f32 %f6, %f7, %f5;\n"
1135 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1136 "$Lt_10_14082:\n"
1137 " bar.sync 0;\n"
1138 " mov.u32 %r9, 63;\n"
1139 " setp.gt.u32 %p4, %r3, %r9;\n"
1140 " @%p4 bra $Lt_10_14594;\n"
1141 " .loc 3 77 0\n"
1142 " ld.volatile.shared.f32 %f8, [%rd10+256];\n"
1143 " add.f32 %f6, %f8, %f6;\n"
1144 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1145 "$Lt_10_14594:\n"
1146 " bar.sync 0;\n"
1147 " mov.u32 %r10, 31;\n"
1148 " setp.gt.u32 %p5, %r3, %r10;\n"
1149 " @%p5 bra $Lt_10_15106;\n"
1150 " .loc 3 83 0\n"
1151 " ld.volatile.shared.f32 %f9, [%rd10+128];\n"
1152 " add.f32 %f10, %f9, %f6;\n"
1153 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
1154 " .loc 3 84 0\n"
1155 " ld.volatile.shared.f32 %f11, [%rd10+64];\n"
1156 " add.f32 %f12, %f11, %f10;\n"
1157 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
1158 " .loc 3 85 0\n"
1159 " ld.volatile.shared.f32 %f13, [%rd10+32];\n"
1160 " add.f32 %f14, %f13, %f12;\n"
1161 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
1162 " .loc 3 86 0\n"
1163 " ld.volatile.shared.f32 %f15, [%rd10+16];\n"
1164 " add.f32 %f16, %f15, %f14;\n"
1165 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
1166 " .loc 3 87 0\n"
1167 " ld.volatile.shared.f32 %f17, [%rd10+8];\n"
1168 " add.f32 %f18, %f17, %f16;\n"
1169 " st.volatile.shared.f32 [%rd10+0], %f18;\n"
1170 " .loc 3 88 0\n"
1171 " ld.volatile.shared.f32 %f19, [%rd10+4];\n"
1172 " add.f32 %f6, %f19, %f18;\n"
1173 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1174 "$Lt_10_15106:\n"
1175 " .loc 3 195 0\n"
1176 " mov.u32 %r11, 0;\n"
1177 " setp.ne.u32 %p6, %r3, %r11;\n"
1178 " @%p6 bra $Lt_10_15618;\n"
1179 " .loc 3 199 0\n"
1180 " ld.shared.f32 %f20, [__smem+0];\n"
1181 " ld.param.u64 %rd11, [__cudaparm_reduce_float_256_true_g_odata];\n"
1182 " cvt.u64.u32 %rd12, %r1;\n"
1183 " mul.wide.u32 %rd13, %r1, 4;\n"
1184 " add.u64 %rd14, %rd11, %rd13;\n"
1185 " st.global.f32 [%rd14+0], %f20;\n"
1186 "$Lt_10_15618:\n"
1187 " .loc 3 415 0\n"
1188 " exit;\n"
1189 "$LDWend_reduce_float_256_true:\n"
1190 " } // reduce_float_256_true\n"
1191 "\n"
1192 " .entry reduce_float_512_true (\n"
1193 " .param .u64 __cudaparm_reduce_float_512_true_g_idata,\n"
1194 " .param .u64 __cudaparm_reduce_float_512_true_g_odata,\n"
1195 " .param .u32 __cudaparm_reduce_float_512_true_n)\n"
1196 " {\n"
1197 " .reg .u16 %rh<3>;\n"
1198 " .reg .u32 %r<14>;\n"
1199 " .reg .u64 %rd<16>;\n"
1200 " .reg .f32 %f<23>;\n"
1201 " .reg .pred %p<9>;\n"
1202 " .loc 3 417 0\n"
1203 "$LDWbegin_reduce_float_512_true:\n"
1204 " .loc 3 181 0\n"
1205 " cvt.u32.u16 %r1, %ctaid.x;\n"
1206 " mul.lo.u32 %r2, %r1, 1024;\n"
1207 " cvt.u32.u16 %r3, %tid.x;\n"
1208 " add.u32 %r4, %r2, %r3;\n"
1209 " mov.s32 %r5, %r4;\n"
1210 " ld.param.u32 %r6, [__cudaparm_reduce_float_512_true_n];\n"
1211 " setp.ge.u32 %p1, %r4, %r6;\n"
1212 " @%p1 bra $Lt_11_16386;\n"
1213 " mov.u16 %rh1, %nctaid.x;\n"
1214 " mul.wide.u16 %r7, %rh1, 1024;\n"
1215 " cvt.s64.u32 %rd1, %r7;\n"
1216 " ld.param.u64 %rd2, [__cudaparm_reduce_float_512_true_g_idata];\n"
1217 " cvt.u64.u32 %rd3, %r4;\n"
1218 " mul.wide.u32 %rd4, %r4, 4;\n"
1219 " add.u64 %rd5, %rd2, %rd4;\n"
1220 " mul.wide.u32 %rd6, %r7, 4;\n"
1221 " mov.f32 %f1, 0f00000000; // 0\n"
1222 "$Lt_11_13314:\n"
1223 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1224 " .loc 3 188 0\n"
1225 " ld.global.f32 %f2, [%rd5+0];\n"
1226 " add.f32 %f3, %f2, %f1;\n"
1227 " .loc 3 191 0\n"
1228 " ld.global.f32 %f4, [%rd5+2048];\n"
1229 " add.f32 %f1, %f4, %f3;\n"
1230 " add.u32 %r5, %r7, %r5;\n"
1231 " add.u64 %rd5, %rd5, %rd6;\n"
1232 " .loc 3 181 0\n"
1233 " ld.param.u32 %r6, [__cudaparm_reduce_float_512_true_n];\n"
1234 " .loc 3 191 0\n"
1235 " setp.lt.u32 %p2, %r5, %r6;\n"
1236 " @%p2 bra $Lt_11_13314;\n"
1237 " bra.uni $Lt_11_12802;\n"
1238 "$Lt_11_16386:\n"
1239 " mov.f32 %f1, 0f00000000; // 0\n"
1240 "$Lt_11_12802:\n"
1241 " .loc 3 195 0\n"
1242 " mov.f32 %f5, %f1;\n"
1243 " mov.f32 %f6, %f5;\n"
1244 " .loc 3 71 0\n"
1245 " mov.u64 %rd7, __smem;\n"
1246 " cvt.u64.u32 %rd8, %r3;\n"
1247 " mul.wide.u32 %rd9, %r3, 4;\n"
1248 " add.u64 %rd10, %rd7, %rd9;\n"
1249 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1250 " .loc 3 72 0\n"
1251 " bar.sync 0;\n"
1252 " mov.u32 %r8, 255;\n"
1253 " setp.gt.u32 %p3, %r3, %r8;\n"
1254 " @%p3 bra $Lt_11_13826;\n"
1255 " .loc 3 75 0\n"
1256 " ld.volatile.shared.f32 %f7, [%rd10+1024];\n"
1257 " add.f32 %f6, %f7, %f5;\n"
1258 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1259 "$Lt_11_13826:\n"
1260 " bar.sync 0;\n"
1261 " mov.u32 %r9, 127;\n"
1262 " setp.gt.u32 %p4, %r3, %r9;\n"
1263 " @%p4 bra $Lt_11_14338;\n"
1264 " .loc 3 76 0\n"
1265 " ld.volatile.shared.f32 %f8, [%rd10+512];\n"
1266 " add.f32 %f6, %f8, %f6;\n"
1267 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1268 "$Lt_11_14338:\n"
1269 " bar.sync 0;\n"
1270 " mov.u32 %r10, 63;\n"
1271 " setp.gt.u32 %p5, %r3, %r10;\n"
1272 " @%p5 bra $Lt_11_14850;\n"
1273 " .loc 3 77 0\n"
1274 " ld.volatile.shared.f32 %f9, [%rd10+256];\n"
1275 " add.f32 %f6, %f9, %f6;\n"
1276 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1277 "$Lt_11_14850:\n"
1278 " bar.sync 0;\n"
1279 " mov.u32 %r11, 31;\n"
1280 " setp.gt.u32 %p6, %r3, %r11;\n"
1281 " @%p6 bra $Lt_11_15362;\n"
1282 " .loc 3 83 0\n"
1283 " ld.volatile.shared.f32 %f10, [%rd10+128];\n"
1284 " add.f32 %f11, %f10, %f6;\n"
1285 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1286 " .loc 3 84 0\n"
1287 " ld.volatile.shared.f32 %f12, [%rd10+64];\n"
1288 " add.f32 %f13, %f12, %f11;\n"
1289 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1290 " .loc 3 85 0\n"
1291 " ld.volatile.shared.f32 %f14, [%rd10+32];\n"
1292 " add.f32 %f15, %f14, %f13;\n"
1293 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
1294 " .loc 3 86 0\n"
1295 " ld.volatile.shared.f32 %f16, [%rd10+16];\n"
1296 " add.f32 %f17, %f16, %f15;\n"
1297 " st.volatile.shared.f32 [%rd10+0], %f17;\n"
1298 " .loc 3 87 0\n"
1299 " ld.volatile.shared.f32 %f18, [%rd10+8];\n"
1300 " add.f32 %f19, %f18, %f17;\n"
1301 " st.volatile.shared.f32 [%rd10+0], %f19;\n"
1302 " .loc 3 88 0\n"
1303 " ld.volatile.shared.f32 %f20, [%rd10+4];\n"
1304 " add.f32 %f6, %f20, %f19;\n"
1305 " st.volatile.shared.f32 [%rd10+0], %f6;\n"
1306 "$Lt_11_15362:\n"
1307 " .loc 3 195 0\n"
1308 " mov.u32 %r12, 0;\n"
1309 " setp.ne.u32 %p7, %r3, %r12;\n"
1310 " @%p7 bra $Lt_11_15874;\n"
1311 " .loc 3 199 0\n"
1312 " ld.shared.f32 %f21, [__smem+0];\n"
1313 " ld.param.u64 %rd11, [__cudaparm_reduce_float_512_true_g_odata];\n"
1314 " cvt.u64.u32 %rd12, %r1;\n"
1315 " mul.wide.u32 %rd13, %r1, 4;\n"
1316 " add.u64 %rd14, %rd11, %rd13;\n"
1317 " st.global.f32 [%rd14+0], %f21;\n"
1318 "$Lt_11_15874:\n"
1319 " .loc 3 420 0\n"
1320 " exit;\n"
1321 "$LDWend_reduce_float_512_true:\n"
1322 " } // reduce_float_512_true\n"
1323 "\n"
1324 " .entry reduce_float_1_false (\n"
1325 " .param .u64 __cudaparm_reduce_float_1_false_g_idata,\n"
1326 " .param .u64 __cudaparm_reduce_float_1_false_g_odata,\n"
1327 " .param .u32 __cudaparm_reduce_float_1_false_n)\n"
1328 " {\n"
1329 " .reg .u16 %rh<3>;\n"
1330 " .reg .u32 %r<11>;\n"
1331 " .reg .u64 %rd<16>;\n"
1332 " .reg .f32 %f<6>;\n"
1333 " .reg .pred %p<6>;\n"
1334 " .loc 3 423 0\n"
1335 "$LDWbegin_reduce_float_1_false:\n"
1336 " .loc 3 181 0\n"
1337 " cvt.u32.u16 %r1, %ctaid.x;\n"
1338 " mul24.lo.u32 %r2, %r1, 2;\n"
1339 " cvt.u32.u16 %r3, %tid.x;\n"
1340 " add.u32 %r4, %r2, %r3;\n"
1341 " ld.param.u32 %r5, [__cudaparm_reduce_float_1_false_n];\n"
1342 " setp.ge.u32 %p1, %r4, %r5;\n"
1343 " @%p1 bra $Lt_12_17154;\n"
1344 " add.u32 %r6, %r4, 1;\n"
1345 " ld.param.u32 %r5, [__cudaparm_reduce_float_1_false_n];\n"
1346 " add.u32 %r7, %r5, 1;\n"
1347 " mov.u16 %rh1, %nctaid.x;\n"
1348 " mul.wide.u16 %r8, %rh1, 2;\n"
1349 " cvt.s64.u32 %rd1, %r8;\n"
1350 " ld.param.u64 %rd2, [__cudaparm_reduce_float_1_false_g_idata];\n"
1351 " cvt.u64.u32 %rd3, %r4;\n"
1352 " mul.wide.u32 %rd4, %r4, 4;\n"
1353 " add.u64 %rd5, %rd2, %rd4;\n"
1354 " mul.wide.u32 %rd6, %r8, 4;\n"
1355 " mov.f32 %f1, 0f00000000; // 0\n"
1356 "$Lt_12_15618:\n"
1357 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1358 " .loc 3 188 0\n"
1359 " ld.global.f32 %f2, [%rd5+0];\n"
1360 " add.f32 %f1, %f2, %f1;\n"
1361 " .loc 3 181 0\n"
1362 " ld.param.u32 %r5, [__cudaparm_reduce_float_1_false_n];\n"
1363 " .loc 3 188 0\n"
1364 " setp.ge.u32 %p2, %r6, %r5;\n"
1365 " @%p2 bra $Lt_12_15874;\n"
1366 " //<loop> Part of loop body line 181, head labeled $Lt_12_15618\n"
1367 " .loc 3 191 0\n"
1368 " ld.global.f32 %f3, [%rd5+4];\n"
1369 " add.f32 %f1, %f3, %f1;\n"
1370 "$Lt_12_15874:\n"
1371 " //<loop> Part of loop body line 181, head labeled $Lt_12_15618\n"
1372 " add.u32 %r6, %r6, %r8;\n"
1373 " add.u64 %rd5, %rd5, %rd6;\n"
1374 " setp.lt.u32 %p3, %r6, %r7;\n"
1375 " @%p3 bra $Lt_12_15618;\n"
1376 " bra.uni $Lt_12_15106;\n"
1377 "$Lt_12_17154:\n"
1378 " mov.f32 %f1, 0f00000000; // 0\n"
1379 "$Lt_12_15106:\n"
1380 " .loc 3 71 0\n"
1381 " mov.u64 %rd7, __smem;\n"
1382 " cvt.u64.u32 %rd8, %r3;\n"
1383 " mul.wide.u32 %rd9, %r3, 4;\n"
1384 " add.u64 %rd10, %rd7, %rd9;\n"
1385 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1386 " .loc 3 72 0\n"
1387 " bar.sync 0;\n"
1388 " .loc 3 195 0\n"
1389 " mov.u32 %r9, 0;\n"
1390 " setp.ne.u32 %p4, %r3, %r9;\n"
1391 " @%p4 bra $Lt_12_16642;\n"
1392 " .loc 3 199 0\n"
1393 " ld.shared.f32 %f4, [__smem+0];\n"
1394 " ld.param.u64 %rd11, [__cudaparm_reduce_float_1_false_g_odata];\n"
1395 " cvt.u64.u32 %rd12, %r1;\n"
1396 " mul.wide.u32 %rd13, %r1, 4;\n"
1397 " add.u64 %rd14, %rd11, %rd13;\n"
1398 " st.global.f32 [%rd14+0], %f4;\n"
1399 "$Lt_12_16642:\n"
1400 " .loc 3 426 0\n"
1401 " exit;\n"
1402 "$LDWend_reduce_float_1_false:\n"
1403 " } // reduce_float_1_false\n"
1404 "\n"
1405 " .entry reduce_float_2_false (\n"
1406 " .param .u64 __cudaparm_reduce_float_2_false_g_idata,\n"
1407 " .param .u64 __cudaparm_reduce_float_2_false_g_odata,\n"
1408 " .param .u32 __cudaparm_reduce_float_2_false_n)\n"
1409 " {\n"
1410 " .reg .u16 %rh<3>;\n"
1411 " .reg .u32 %r<12>;\n"
1412 " .reg .u64 %rd<16>;\n"
1413 " .reg .f32 %f<8>;\n"
1414 " .reg .pred %p<7>;\n"
1415 " .loc 3 428 0\n"
1416 "$LDWbegin_reduce_float_2_false:\n"
1417 " .loc 3 181 0\n"
1418 " cvt.u32.u16 %r1, %ctaid.x;\n"
1419 " mul24.lo.u32 %r2, %r1, 4;\n"
1420 " cvt.u32.u16 %r3, %tid.x;\n"
1421 " add.u32 %r4, %r2, %r3;\n"
1422 " ld.param.u32 %r5, [__cudaparm_reduce_float_2_false_n];\n"
1423 " setp.ge.u32 %p1, %r4, %r5;\n"
1424 " @%p1 bra $Lt_13_17410;\n"
1425 " add.u32 %r6, %r4, 2;\n"
1426 " ld.param.u32 %r5, [__cudaparm_reduce_float_2_false_n];\n"
1427 " add.u32 %r7, %r5, 2;\n"
1428 " mov.u16 %rh1, %nctaid.x;\n"
1429 " mul.wide.u16 %r8, %rh1, 4;\n"
1430 " cvt.s64.u32 %rd1, %r8;\n"
1431 " ld.param.u64 %rd2, [__cudaparm_reduce_float_2_false_g_idata];\n"
1432 " cvt.u64.u32 %rd3, %r4;\n"
1433 " mul.wide.u32 %rd4, %r4, 4;\n"
1434 " add.u64 %rd5, %rd2, %rd4;\n"
1435 " mul.wide.u32 %rd6, %r8, 4;\n"
1436 " mov.f32 %f1, 0f00000000; // 0\n"
1437 "$Lt_13_15362:\n"
1438 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1439 " .loc 3 188 0\n"
1440 " ld.global.f32 %f2, [%rd5+0];\n"
1441 " add.f32 %f1, %f2, %f1;\n"
1442 " .loc 3 181 0\n"
1443 " ld.param.u32 %r5, [__cudaparm_reduce_float_2_false_n];\n"
1444 " .loc 3 188 0\n"
1445 " setp.ge.u32 %p2, %r6, %r5;\n"
1446 " @%p2 bra $Lt_13_15618;\n"
1447 " //<loop> Part of loop body line 181, head labeled $Lt_13_15362\n"
1448 " .loc 3 191 0\n"
1449 " ld.global.f32 %f3, [%rd5+8];\n"
1450 " add.f32 %f1, %f3, %f1;\n"
1451 "$Lt_13_15618:\n"
1452 " //<loop> Part of loop body line 181, head labeled $Lt_13_15362\n"
1453 " add.u32 %r6, %r6, %r8;\n"
1454 " add.u64 %rd5, %rd5, %rd6;\n"
1455 " setp.lt.u32 %p3, %r6, %r7;\n"
1456 " @%p3 bra $Lt_13_15362;\n"
1457 " bra.uni $Lt_13_14850;\n"
1458 "$Lt_13_17410:\n"
1459 " mov.f32 %f1, 0f00000000; // 0\n"
1460 "$Lt_13_14850:\n"
1461 " .loc 3 71 0\n"
1462 " mov.u64 %rd7, __smem;\n"
1463 " cvt.u64.u32 %rd8, %r3;\n"
1464 " mul.wide.u32 %rd9, %r3, 4;\n"
1465 " add.u64 %rd10, %rd7, %rd9;\n"
1466 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1467 " .loc 3 72 0\n"
1468 " bar.sync 0;\n"
1469 " mov.u32 %r9, 31;\n"
1470 " setp.gt.u32 %p4, %r3, %r9;\n"
1471 " @%p4 bra $Lt_13_16386;\n"
1472 " .loc 3 88 0\n"
1473 " ld.volatile.shared.f32 %f4, [%rd10+4];\n"
1474 " add.f32 %f5, %f4, %f1;\n"
1475 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1476 "$Lt_13_16386:\n"
1477 " .loc 3 195 0\n"
1478 " mov.u32 %r10, 0;\n"
1479 " setp.ne.u32 %p5, %r3, %r10;\n"
1480 " @%p5 bra $Lt_13_16898;\n"
1481 " .loc 3 199 0\n"
1482 " ld.shared.f32 %f6, [__smem+0];\n"
1483 " ld.param.u64 %rd11, [__cudaparm_reduce_float_2_false_g_odata];\n"
1484 " cvt.u64.u32 %rd12, %r1;\n"
1485 " mul.wide.u32 %rd13, %r1, 4;\n"
1486 " add.u64 %rd14, %rd11, %rd13;\n"
1487 " st.global.f32 [%rd14+0], %f6;\n"
1488 "$Lt_13_16898:\n"
1489 " .loc 3 431 0\n"
1490 " exit;\n"
1491 "$LDWend_reduce_float_2_false:\n"
1492 " } // reduce_float_2_false\n"
1493 "\n"
1494 " .entry reduce_float_4_false (\n"
1495 " .param .u64 __cudaparm_reduce_float_4_false_g_idata,\n"
1496 " .param .u64 __cudaparm_reduce_float_4_false_g_odata,\n"
1497 " .param .u32 __cudaparm_reduce_float_4_false_n)\n"
1498 " {\n"
1499 " .reg .u16 %rh<3>;\n"
1500 " .reg .u32 %r<12>;\n"
1501 " .reg .u64 %rd<16>;\n"
1502 " .reg .f32 %f<10>;\n"
1503 " .reg .pred %p<7>;\n"
1504 " .loc 3 433 0\n"
1505 "$LDWbegin_reduce_float_4_false:\n"
1506 " .loc 3 181 0\n"
1507 " cvt.u32.u16 %r1, %ctaid.x;\n"
1508 " mul24.lo.u32 %r2, %r1, 8;\n"
1509 " cvt.u32.u16 %r3, %tid.x;\n"
1510 " add.u32 %r4, %r2, %r3;\n"
1511 " ld.param.u32 %r5, [__cudaparm_reduce_float_4_false_n];\n"
1512 " setp.ge.u32 %p1, %r4, %r5;\n"
1513 " @%p1 bra $Lt_14_17154;\n"
1514 " add.u32 %r6, %r4, 4;\n"
1515 " ld.param.u32 %r5, [__cudaparm_reduce_float_4_false_n];\n"
1516 " add.u32 %r7, %r5, 4;\n"
1517 " mov.u16 %rh1, %nctaid.x;\n"
1518 " mul.wide.u16 %r8, %rh1, 8;\n"
1519 " cvt.s64.u32 %rd1, %r8;\n"
1520 " ld.param.u64 %rd2, [__cudaparm_reduce_float_4_false_g_idata];\n"
1521 " cvt.u64.u32 %rd3, %r4;\n"
1522 " mul.wide.u32 %rd4, %r4, 4;\n"
1523 " add.u64 %rd5, %rd2, %rd4;\n"
1524 " mul.wide.u32 %rd6, %r8, 4;\n"
1525 " mov.f32 %f1, 0f00000000; // 0\n"
1526 "$Lt_14_15106:\n"
1527 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1528 " .loc 3 188 0\n"
1529 " ld.global.f32 %f2, [%rd5+0];\n"
1530 " add.f32 %f1, %f2, %f1;\n"
1531 " .loc 3 181 0\n"
1532 " ld.param.u32 %r5, [__cudaparm_reduce_float_4_false_n];\n"
1533 " .loc 3 188 0\n"
1534 " setp.ge.u32 %p2, %r6, %r5;\n"
1535 " @%p2 bra $Lt_14_15362;\n"
1536 " //<loop> Part of loop body line 181, head labeled $Lt_14_15106\n"
1537 " .loc 3 191 0\n"
1538 " ld.global.f32 %f3, [%rd5+16];\n"
1539 " add.f32 %f1, %f3, %f1;\n"
1540 "$Lt_14_15362:\n"
1541 " //<loop> Part of loop body line 181, head labeled $Lt_14_15106\n"
1542 " add.u32 %r6, %r6, %r8;\n"
1543 " add.u64 %rd5, %rd5, %rd6;\n"
1544 " setp.lt.u32 %p3, %r6, %r7;\n"
1545 " @%p3 bra $Lt_14_15106;\n"
1546 " bra.uni $Lt_14_14594;\n"
1547 "$Lt_14_17154:\n"
1548 " mov.f32 %f1, 0f00000000; // 0\n"
1549 "$Lt_14_14594:\n"
1550 " .loc 3 71 0\n"
1551 " mov.u64 %rd7, __smem;\n"
1552 " cvt.u64.u32 %rd8, %r3;\n"
1553 " mul.wide.u32 %rd9, %r3, 4;\n"
1554 " add.u64 %rd10, %rd7, %rd9;\n"
1555 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1556 " .loc 3 72 0\n"
1557 " bar.sync 0;\n"
1558 " mov.u32 %r9, 31;\n"
1559 " setp.gt.u32 %p4, %r3, %r9;\n"
1560 " @%p4 bra $Lt_14_16130;\n"
1561 " .loc 3 87 0\n"
1562 " ld.volatile.shared.f32 %f4, [%rd10+8];\n"
1563 " add.f32 %f5, %f4, %f1;\n"
1564 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1565 " .loc 3 88 0\n"
1566 " ld.volatile.shared.f32 %f6, [%rd10+4];\n"
1567 " add.f32 %f7, %f6, %f5;\n"
1568 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1569 "$Lt_14_16130:\n"
1570 " .loc 3 195 0\n"
1571 " mov.u32 %r10, 0;\n"
1572 " setp.ne.u32 %p5, %r3, %r10;\n"
1573 " @%p5 bra $Lt_14_16642;\n"
1574 " .loc 3 199 0\n"
1575 " ld.shared.f32 %f8, [__smem+0];\n"
1576 " ld.param.u64 %rd11, [__cudaparm_reduce_float_4_false_g_odata];\n"
1577 " cvt.u64.u32 %rd12, %r1;\n"
1578 " mul.wide.u32 %rd13, %r1, 4;\n"
1579 " add.u64 %rd14, %rd11, %rd13;\n"
1580 " st.global.f32 [%rd14+0], %f8;\n"
1581 "$Lt_14_16642:\n"
1582 " .loc 3 436 0\n"
1583 " exit;\n"
1584 "$LDWend_reduce_float_4_false:\n"
1585 " } // reduce_float_4_false\n"
1586 "\n"
1587 " .entry reduce_float_8_false (\n"
1588 " .param .u64 __cudaparm_reduce_float_8_false_g_idata,\n"
1589 " .param .u64 __cudaparm_reduce_float_8_false_g_odata,\n"
1590 " .param .u32 __cudaparm_reduce_float_8_false_n)\n"
1591 " {\n"
1592 " .reg .u16 %rh<3>;\n"
1593 " .reg .u32 %r<12>;\n"
1594 " .reg .u64 %rd<16>;\n"
1595 " .reg .f32 %f<12>;\n"
1596 " .reg .pred %p<7>;\n"
1597 " .loc 3 438 0\n"
1598 "$LDWbegin_reduce_float_8_false:\n"
1599 " .loc 3 181 0\n"
1600 " cvt.u32.u16 %r1, %ctaid.x;\n"
1601 " mul24.lo.u32 %r2, %r1, 16;\n"
1602 " cvt.u32.u16 %r3, %tid.x;\n"
1603 " add.u32 %r4, %r2, %r3;\n"
1604 " ld.param.u32 %r5, [__cudaparm_reduce_float_8_false_n];\n"
1605 " setp.ge.u32 %p1, %r4, %r5;\n"
1606 " @%p1 bra $Lt_15_16898;\n"
1607 " add.u32 %r6, %r4, 8;\n"
1608 " ld.param.u32 %r5, [__cudaparm_reduce_float_8_false_n];\n"
1609 " add.u32 %r7, %r5, 8;\n"
1610 " mov.u16 %rh1, %nctaid.x;\n"
1611 " mul.wide.u16 %r8, %rh1, 16;\n"
1612 " cvt.s64.u32 %rd1, %r8;\n"
1613 " ld.param.u64 %rd2, [__cudaparm_reduce_float_8_false_g_idata];\n"
1614 " cvt.u64.u32 %rd3, %r4;\n"
1615 " mul.wide.u32 %rd4, %r4, 4;\n"
1616 " add.u64 %rd5, %rd2, %rd4;\n"
1617 " mul.wide.u32 %rd6, %r8, 4;\n"
1618 " mov.f32 %f1, 0f00000000; // 0\n"
1619 "$Lt_15_14850:\n"
1620 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1621 " .loc 3 188 0\n"
1622 " ld.global.f32 %f2, [%rd5+0];\n"
1623 " add.f32 %f1, %f2, %f1;\n"
1624 " .loc 3 181 0\n"
1625 " ld.param.u32 %r5, [__cudaparm_reduce_float_8_false_n];\n"
1626 " .loc 3 188 0\n"
1627 " setp.ge.u32 %p2, %r6, %r5;\n"
1628 " @%p2 bra $Lt_15_15106;\n"
1629 " //<loop> Part of loop body line 181, head labeled $Lt_15_14850\n"
1630 " .loc 3 191 0\n"
1631 " ld.global.f32 %f3, [%rd5+32];\n"
1632 " add.f32 %f1, %f3, %f1;\n"
1633 "$Lt_15_15106:\n"
1634 " //<loop> Part of loop body line 181, head labeled $Lt_15_14850\n"
1635 " add.u32 %r6, %r6, %r8;\n"
1636 " add.u64 %rd5, %rd5, %rd6;\n"
1637 " setp.lt.u32 %p3, %r6, %r7;\n"
1638 " @%p3 bra $Lt_15_14850;\n"
1639 " bra.uni $Lt_15_14338;\n"
1640 "$Lt_15_16898:\n"
1641 " mov.f32 %f1, 0f00000000; // 0\n"
1642 "$Lt_15_14338:\n"
1643 " .loc 3 71 0\n"
1644 " mov.u64 %rd7, __smem;\n"
1645 " cvt.u64.u32 %rd8, %r3;\n"
1646 " mul.wide.u32 %rd9, %r3, 4;\n"
1647 " add.u64 %rd10, %rd7, %rd9;\n"
1648 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1649 " .loc 3 72 0\n"
1650 " bar.sync 0;\n"
1651 " mov.u32 %r9, 31;\n"
1652 " setp.gt.u32 %p4, %r3, %r9;\n"
1653 " @%p4 bra $Lt_15_15874;\n"
1654 " .loc 3 86 0\n"
1655 " ld.volatile.shared.f32 %f4, [%rd10+16];\n"
1656 " add.f32 %f5, %f4, %f1;\n"
1657 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1658 " .loc 3 87 0\n"
1659 " ld.volatile.shared.f32 %f6, [%rd10+8];\n"
1660 " add.f32 %f7, %f6, %f5;\n"
1661 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1662 " .loc 3 88 0\n"
1663 " ld.volatile.shared.f32 %f8, [%rd10+4];\n"
1664 " add.f32 %f9, %f8, %f7;\n"
1665 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1666 "$Lt_15_15874:\n"
1667 " .loc 3 195 0\n"
1668 " mov.u32 %r10, 0;\n"
1669 " setp.ne.u32 %p5, %r3, %r10;\n"
1670 " @%p5 bra $Lt_15_16386;\n"
1671 " .loc 3 199 0\n"
1672 " ld.shared.f32 %f10, [__smem+0];\n"
1673 " ld.param.u64 %rd11, [__cudaparm_reduce_float_8_false_g_odata];\n"
1674 " cvt.u64.u32 %rd12, %r1;\n"
1675 " mul.wide.u32 %rd13, %r1, 4;\n"
1676 " add.u64 %rd14, %rd11, %rd13;\n"
1677 " st.global.f32 [%rd14+0], %f10;\n"
1678 "$Lt_15_16386:\n"
1679 " .loc 3 441 0\n"
1680 " exit;\n"
1681 "$LDWend_reduce_float_8_false:\n"
1682 " } // reduce_float_8_false\n"
1683 "\n"
1684 " .entry reduce_float_16_false (\n"
1685 " .param .u64 __cudaparm_reduce_float_16_false_g_idata,\n"
1686 " .param .u64 __cudaparm_reduce_float_16_false_g_odata,\n"
1687 " .param .u32 __cudaparm_reduce_float_16_false_n)\n"
1688 " {\n"
1689 " .reg .u16 %rh<3>;\n"
1690 " .reg .u32 %r<12>;\n"
1691 " .reg .u64 %rd<16>;\n"
1692 " .reg .f32 %f<14>;\n"
1693 " .reg .pred %p<7>;\n"
1694 " .loc 3 443 0\n"
1695 "$LDWbegin_reduce_float_16_false:\n"
1696 " .loc 3 181 0\n"
1697 " cvt.u32.u16 %r1, %ctaid.x;\n"
1698 " mul24.lo.u32 %r2, %r1, 32;\n"
1699 " cvt.u32.u16 %r3, %tid.x;\n"
1700 " add.u32 %r4, %r2, %r3;\n"
1701 " ld.param.u32 %r5, [__cudaparm_reduce_float_16_false_n];\n"
1702 " setp.ge.u32 %p1, %r4, %r5;\n"
1703 " @%p1 bra $Lt_16_16642;\n"
1704 " add.u32 %r6, %r4, 16;\n"
1705 " ld.param.u32 %r5, [__cudaparm_reduce_float_16_false_n];\n"
1706 " add.u32 %r7, %r5, 16;\n"
1707 " mov.u16 %rh1, %nctaid.x;\n"
1708 " mul.wide.u16 %r8, %rh1, 32;\n"
1709 " cvt.s64.u32 %rd1, %r8;\n"
1710 " ld.param.u64 %rd2, [__cudaparm_reduce_float_16_false_g_idata];\n"
1711 " cvt.u64.u32 %rd3, %r4;\n"
1712 " mul.wide.u32 %rd4, %r4, 4;\n"
1713 " add.u64 %rd5, %rd2, %rd4;\n"
1714 " mul.wide.u32 %rd6, %r8, 4;\n"
1715 " mov.f32 %f1, 0f00000000; // 0\n"
1716 "$Lt_16_14594:\n"
1717 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1718 " .loc 3 188 0\n"
1719 " ld.global.f32 %f2, [%rd5+0];\n"
1720 " add.f32 %f1, %f2, %f1;\n"
1721 " .loc 3 181 0\n"
1722 " ld.param.u32 %r5, [__cudaparm_reduce_float_16_false_n];\n"
1723 " .loc 3 188 0\n"
1724 " setp.ge.u32 %p2, %r6, %r5;\n"
1725 " @%p2 bra $Lt_16_14850;\n"
1726 " //<loop> Part of loop body line 181, head labeled $Lt_16_14594\n"
1727 " .loc 3 191 0\n"
1728 " ld.global.f32 %f3, [%rd5+64];\n"
1729 " add.f32 %f1, %f3, %f1;\n"
1730 "$Lt_16_14850:\n"
1731 " //<loop> Part of loop body line 181, head labeled $Lt_16_14594\n"
1732 " add.u32 %r6, %r6, %r8;\n"
1733 " add.u64 %rd5, %rd5, %rd6;\n"
1734 " setp.lt.u32 %p3, %r6, %r7;\n"
1735 " @%p3 bra $Lt_16_14594;\n"
1736 " bra.uni $Lt_16_14082;\n"
1737 "$Lt_16_16642:\n"
1738 " mov.f32 %f1, 0f00000000; // 0\n"
1739 "$Lt_16_14082:\n"
1740 " .loc 3 71 0\n"
1741 " mov.u64 %rd7, __smem;\n"
1742 " cvt.u64.u32 %rd8, %r3;\n"
1743 " mul.wide.u32 %rd9, %r3, 4;\n"
1744 " add.u64 %rd10, %rd7, %rd9;\n"
1745 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1746 " .loc 3 72 0\n"
1747 " bar.sync 0;\n"
1748 " mov.u32 %r9, 31;\n"
1749 " setp.gt.u32 %p4, %r3, %r9;\n"
1750 " @%p4 bra $Lt_16_15618;\n"
1751 " .loc 3 85 0\n"
1752 " ld.volatile.shared.f32 %f4, [%rd10+32];\n"
1753 " add.f32 %f5, %f4, %f1;\n"
1754 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1755 " .loc 3 86 0\n"
1756 " ld.volatile.shared.f32 %f6, [%rd10+16];\n"
1757 " add.f32 %f7, %f6, %f5;\n"
1758 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1759 " .loc 3 87 0\n"
1760 " ld.volatile.shared.f32 %f8, [%rd10+8];\n"
1761 " add.f32 %f9, %f8, %f7;\n"
1762 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1763 " .loc 3 88 0\n"
1764 " ld.volatile.shared.f32 %f10, [%rd10+4];\n"
1765 " add.f32 %f11, %f10, %f9;\n"
1766 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1767 "$Lt_16_15618:\n"
1768 " .loc 3 195 0\n"
1769 " mov.u32 %r10, 0;\n"
1770 " setp.ne.u32 %p5, %r3, %r10;\n"
1771 " @%p5 bra $Lt_16_16130;\n"
1772 " .loc 3 199 0\n"
1773 " ld.shared.f32 %f12, [__smem+0];\n"
1774 " ld.param.u64 %rd11, [__cudaparm_reduce_float_16_false_g_odata];\n"
1775 " cvt.u64.u32 %rd12, %r1;\n"
1776 " mul.wide.u32 %rd13, %r1, 4;\n"
1777 " add.u64 %rd14, %rd11, %rd13;\n"
1778 " st.global.f32 [%rd14+0], %f12;\n"
1779 "$Lt_16_16130:\n"
1780 " .loc 3 446 0\n"
1781 " exit;\n"
1782 "$LDWend_reduce_float_16_false:\n"
1783 " } // reduce_float_16_false\n"
1784 "\n"
1785 " .entry reduce_float_32_false (\n"
1786 " .param .u64 __cudaparm_reduce_float_32_false_g_idata,\n"
1787 " .param .u64 __cudaparm_reduce_float_32_false_g_odata,\n"
1788 " .param .u32 __cudaparm_reduce_float_32_false_n)\n"
1789 " {\n"
1790 " .reg .u16 %rh<3>;\n"
1791 " .reg .u32 %r<12>;\n"
1792 " .reg .u64 %rd<16>;\n"
1793 " .reg .f32 %f<16>;\n"
1794 " .reg .pred %p<7>;\n"
1795 " .loc 3 448 0\n"
1796 "$LDWbegin_reduce_float_32_false:\n"
1797 " .loc 3 181 0\n"
1798 " cvt.u32.u16 %r1, %ctaid.x;\n"
1799 " mul24.lo.u32 %r2, %r1, 64;\n"
1800 " cvt.u32.u16 %r3, %tid.x;\n"
1801 " add.u32 %r4, %r2, %r3;\n"
1802 " ld.param.u32 %r5, [__cudaparm_reduce_float_32_false_n];\n"
1803 " setp.ge.u32 %p1, %r4, %r5;\n"
1804 " @%p1 bra $Lt_17_16386;\n"
1805 " add.u32 %r6, %r4, 32;\n"
1806 " ld.param.u32 %r5, [__cudaparm_reduce_float_32_false_n];\n"
1807 " add.u32 %r7, %r5, 32;\n"
1808 " mov.u16 %rh1, %nctaid.x;\n"
1809 " mul.wide.u16 %r8, %rh1, 64;\n"
1810 " cvt.s64.u32 %rd1, %r8;\n"
1811 " ld.param.u64 %rd2, [__cudaparm_reduce_float_32_false_g_idata];\n"
1812 " cvt.u64.u32 %rd3, %r4;\n"
1813 " mul.wide.u32 %rd4, %r4, 4;\n"
1814 " add.u64 %rd5, %rd2, %rd4;\n"
1815 " mul.wide.u32 %rd6, %r8, 4;\n"
1816 " mov.f32 %f1, 0f00000000; // 0\n"
1817 "$Lt_17_14338:\n"
1818 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1819 " .loc 3 188 0\n"
1820 " ld.global.f32 %f2, [%rd5+0];\n"
1821 " add.f32 %f1, %f2, %f1;\n"
1822 " .loc 3 181 0\n"
1823 " ld.param.u32 %r5, [__cudaparm_reduce_float_32_false_n];\n"
1824 " .loc 3 188 0\n"
1825 " setp.ge.u32 %p2, %r6, %r5;\n"
1826 " @%p2 bra $Lt_17_14594;\n"
1827 " //<loop> Part of loop body line 181, head labeled $Lt_17_14338\n"
1828 " .loc 3 191 0\n"
1829 " ld.global.f32 %f3, [%rd5+128];\n"
1830 " add.f32 %f1, %f3, %f1;\n"
1831 "$Lt_17_14594:\n"
1832 " //<loop> Part of loop body line 181, head labeled $Lt_17_14338\n"
1833 " add.u32 %r6, %r6, %r8;\n"
1834 " add.u64 %rd5, %rd5, %rd6;\n"
1835 " setp.lt.u32 %p3, %r6, %r7;\n"
1836 " @%p3 bra $Lt_17_14338;\n"
1837 " bra.uni $Lt_17_13826;\n"
1838 "$Lt_17_16386:\n"
1839 " mov.f32 %f1, 0f00000000; // 0\n"
1840 "$Lt_17_13826:\n"
1841 " .loc 3 71 0\n"
1842 " mov.u64 %rd7, __smem;\n"
1843 " cvt.u64.u32 %rd8, %r3;\n"
1844 " mul.wide.u32 %rd9, %r3, 4;\n"
1845 " add.u64 %rd10, %rd7, %rd9;\n"
1846 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1847 " .loc 3 72 0\n"
1848 " bar.sync 0;\n"
1849 " mov.u32 %r9, 31;\n"
1850 " setp.gt.u32 %p4, %r3, %r9;\n"
1851 " @%p4 bra $Lt_17_15362;\n"
1852 " .loc 3 84 0\n"
1853 " ld.volatile.shared.f32 %f4, [%rd10+64];\n"
1854 " add.f32 %f5, %f4, %f1;\n"
1855 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1856 " .loc 3 85 0\n"
1857 " ld.volatile.shared.f32 %f6, [%rd10+32];\n"
1858 " add.f32 %f7, %f6, %f5;\n"
1859 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1860 " .loc 3 86 0\n"
1861 " ld.volatile.shared.f32 %f8, [%rd10+16];\n"
1862 " add.f32 %f9, %f8, %f7;\n"
1863 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1864 " .loc 3 87 0\n"
1865 " ld.volatile.shared.f32 %f10, [%rd10+8];\n"
1866 " add.f32 %f11, %f10, %f9;\n"
1867 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1868 " .loc 3 88 0\n"
1869 " ld.volatile.shared.f32 %f12, [%rd10+4];\n"
1870 " add.f32 %f13, %f12, %f11;\n"
1871 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1872 "$Lt_17_15362:\n"
1873 " .loc 3 195 0\n"
1874 " mov.u32 %r10, 0;\n"
1875 " setp.ne.u32 %p5, %r3, %r10;\n"
1876 " @%p5 bra $Lt_17_15874;\n"
1877 " .loc 3 199 0\n"
1878 " ld.shared.f32 %f14, [__smem+0];\n"
1879 " ld.param.u64 %rd11, [__cudaparm_reduce_float_32_false_g_odata];\n"
1880 " cvt.u64.u32 %rd12, %r1;\n"
1881 " mul.wide.u32 %rd13, %r1, 4;\n"
1882 " add.u64 %rd14, %rd11, %rd13;\n"
1883 " st.global.f32 [%rd14+0], %f14;\n"
1884 "$Lt_17_15874:\n"
1885 " .loc 3 451 0\n"
1886 " exit;\n"
1887 "$LDWend_reduce_float_32_false:\n"
1888 " } // reduce_float_32_false\n"
1889 "\n"
1890 " .entry reduce_float_64_false (\n"
1891 " .param .u64 __cudaparm_reduce_float_64_false_g_idata,\n"
1892 " .param .u64 __cudaparm_reduce_float_64_false_g_odata,\n"
1893 " .param .u32 __cudaparm_reduce_float_64_false_n)\n"
1894 " {\n"
1895 " .reg .u16 %rh<3>;\n"
1896 " .reg .u32 %r<12>;\n"
1897 " .reg .u64 %rd<16>;\n"
1898 " .reg .f32 %f<18>;\n"
1899 " .reg .pred %p<7>;\n"
1900 " .loc 3 453 0\n"
1901 "$LDWbegin_reduce_float_64_false:\n"
1902 " .loc 3 181 0\n"
1903 " cvt.u32.u16 %r1, %ctaid.x;\n"
1904 " mul24.lo.u32 %r2, %r1, 128;\n"
1905 " cvt.u32.u16 %r3, %tid.x;\n"
1906 " add.u32 %r4, %r2, %r3;\n"
1907 " ld.param.u32 %r5, [__cudaparm_reduce_float_64_false_n];\n"
1908 " setp.ge.u32 %p1, %r4, %r5;\n"
1909 " @%p1 bra $Lt_18_16130;\n"
1910 " add.u32 %r6, %r4, 64;\n"
1911 " ld.param.u32 %r5, [__cudaparm_reduce_float_64_false_n];\n"
1912 " add.u32 %r7, %r5, 64;\n"
1913 " mov.u16 %rh1, %nctaid.x;\n"
1914 " mul.wide.u16 %r8, %rh1, 128;\n"
1915 " cvt.s64.u32 %rd1, %r8;\n"
1916 " ld.param.u64 %rd2, [__cudaparm_reduce_float_64_false_g_idata];\n"
1917 " cvt.u64.u32 %rd3, %r4;\n"
1918 " mul.wide.u32 %rd4, %r4, 4;\n"
1919 " add.u64 %rd5, %rd2, %rd4;\n"
1920 " mul.wide.u32 %rd6, %r8, 4;\n"
1921 " mov.f32 %f1, 0f00000000; // 0\n"
1922 "$Lt_18_14082:\n"
1923 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
1924 " .loc 3 188 0\n"
1925 " ld.global.f32 %f2, [%rd5+0];\n"
1926 " add.f32 %f1, %f2, %f1;\n"
1927 " .loc 3 181 0\n"
1928 " ld.param.u32 %r5, [__cudaparm_reduce_float_64_false_n];\n"
1929 " .loc 3 188 0\n"
1930 " setp.ge.u32 %p2, %r6, %r5;\n"
1931 " @%p2 bra $Lt_18_14338;\n"
1932 " //<loop> Part of loop body line 181, head labeled $Lt_18_14082\n"
1933 " .loc 3 191 0\n"
1934 " ld.global.f32 %f3, [%rd5+256];\n"
1935 " add.f32 %f1, %f3, %f1;\n"
1936 "$Lt_18_14338:\n"
1937 " //<loop> Part of loop body line 181, head labeled $Lt_18_14082\n"
1938 " add.u32 %r6, %r6, %r8;\n"
1939 " add.u64 %rd5, %rd5, %rd6;\n"
1940 " setp.lt.u32 %p3, %r6, %r7;\n"
1941 " @%p3 bra $Lt_18_14082;\n"
1942 " bra.uni $Lt_18_13570;\n"
1943 "$Lt_18_16130:\n"
1944 " mov.f32 %f1, 0f00000000; // 0\n"
1945 "$Lt_18_13570:\n"
1946 " .loc 3 71 0\n"
1947 " mov.u64 %rd7, __smem;\n"
1948 " cvt.u64.u32 %rd8, %r3;\n"
1949 " mul.wide.u32 %rd9, %r3, 4;\n"
1950 " add.u64 %rd10, %rd7, %rd9;\n"
1951 " st.volatile.shared.f32 [%rd10+0], %f1;\n"
1952 " .loc 3 72 0\n"
1953 " bar.sync 0;\n"
1954 " mov.u32 %r9, 31;\n"
1955 " setp.gt.u32 %p4, %r3, %r9;\n"
1956 " @%p4 bra $Lt_18_15106;\n"
1957 " .loc 3 83 0\n"
1958 " ld.volatile.shared.f32 %f4, [%rd10+128];\n"
1959 " add.f32 %f5, %f4, %f1;\n"
1960 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
1961 " .loc 3 84 0\n"
1962 " ld.volatile.shared.f32 %f6, [%rd10+64];\n"
1963 " add.f32 %f7, %f6, %f5;\n"
1964 " st.volatile.shared.f32 [%rd10+0], %f7;\n"
1965 " .loc 3 85 0\n"
1966 " ld.volatile.shared.f32 %f8, [%rd10+32];\n"
1967 " add.f32 %f9, %f8, %f7;\n"
1968 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
1969 " .loc 3 86 0\n"
1970 " ld.volatile.shared.f32 %f10, [%rd10+16];\n"
1971 " add.f32 %f11, %f10, %f9;\n"
1972 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
1973 " .loc 3 87 0\n"
1974 " ld.volatile.shared.f32 %f12, [%rd10+8];\n"
1975 " add.f32 %f13, %f12, %f11;\n"
1976 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
1977 " .loc 3 88 0\n"
1978 " ld.volatile.shared.f32 %f14, [%rd10+4];\n"
1979 " add.f32 %f15, %f14, %f13;\n"
1980 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
1981 "$Lt_18_15106:\n"
1982 " .loc 3 195 0\n"
1983 " mov.u32 %r10, 0;\n"
1984 " setp.ne.u32 %p5, %r3, %r10;\n"
1985 " @%p5 bra $Lt_18_15618;\n"
1986 " .loc 3 199 0\n"
1987 " ld.shared.f32 %f16, [__smem+0];\n"
1988 " ld.param.u64 %rd11, [__cudaparm_reduce_float_64_false_g_odata];\n"
1989 " cvt.u64.u32 %rd12, %r1;\n"
1990 " mul.wide.u32 %rd13, %r1, 4;\n"
1991 " add.u64 %rd14, %rd11, %rd13;\n"
1992 " st.global.f32 [%rd14+0], %f16;\n"
1993 "$Lt_18_15618:\n"
1994 " .loc 3 456 0\n"
1995 " exit;\n"
1996 "$LDWend_reduce_float_64_false:\n"
1997 " } // reduce_float_64_false\n"
1998 "\n"
1999 " .entry reduce_float_128_false (\n"
2000 " .param .u64 __cudaparm_reduce_float_128_false_g_idata,\n"
2001 " .param .u64 __cudaparm_reduce_float_128_false_g_odata,\n"
2002 " .param .u32 __cudaparm_reduce_float_128_false_n)\n"
2003 " {\n"
2004 " .reg .u16 %rh<3>;\n"
2005 " .reg .u32 %r<13>;\n"
2006 " .reg .u64 %rd<16>;\n"
2007 " .reg .f32 %f<20>;\n"
2008 " .reg .pred %p<8>;\n"
2009 " .loc 3 458 0\n"
2010 "$LDWbegin_reduce_float_128_false:\n"
2011 " .loc 3 181 0\n"
2012 " cvt.u32.u16 %r1, %ctaid.x;\n"
2013 " mul.lo.u32 %r2, %r1, 256;\n"
2014 " cvt.u32.u16 %r3, %tid.x;\n"
2015 " add.u32 %r4, %r2, %r3;\n"
2016 " ld.param.u32 %r5, [__cudaparm_reduce_float_128_false_n];\n"
2017 " setp.ge.u32 %p1, %r4, %r5;\n"
2018 " @%p1 bra $Lt_19_16386;\n"
2019 " add.u32 %r6, %r4, 128;\n"
2020 " ld.param.u32 %r5, [__cudaparm_reduce_float_128_false_n];\n"
2021 " add.u32 %r7, %r5, 128;\n"
2022 " mov.u16 %rh1, %nctaid.x;\n"
2023 " mul.wide.u16 %r8, %rh1, 256;\n"
2024 " cvt.s64.u32 %rd1, %r8;\n"
2025 " ld.param.u64 %rd2, [__cudaparm_reduce_float_128_false_g_idata];\n"
2026 " cvt.u64.u32 %rd3, %r4;\n"
2027 " mul.wide.u32 %rd4, %r4, 4;\n"
2028 " add.u64 %rd5, %rd2, %rd4;\n"
2029 " mul.wide.u32 %rd6, %r8, 4;\n"
2030 " mov.f32 %f1, 0f00000000; // 0\n"
2031 "$Lt_19_13826:\n"
2032 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2033 " .loc 3 188 0\n"
2034 " ld.global.f32 %f2, [%rd5+0];\n"
2035 " add.f32 %f1, %f2, %f1;\n"
2036 " .loc 3 181 0\n"
2037 " ld.param.u32 %r5, [__cudaparm_reduce_float_128_false_n];\n"
2038 " .loc 3 188 0\n"
2039 " setp.ge.u32 %p2, %r6, %r5;\n"
2040 " @%p2 bra $Lt_19_14082;\n"
2041 " //<loop> Part of loop body line 181, head labeled $Lt_19_13826\n"
2042 " .loc 3 191 0\n"
2043 " ld.global.f32 %f3, [%rd5+512];\n"
2044 " add.f32 %f1, %f3, %f1;\n"
2045 "$Lt_19_14082:\n"
2046 " //<loop> Part of loop body line 181, head labeled $Lt_19_13826\n"
2047 " add.u32 %r6, %r6, %r8;\n"
2048 " add.u64 %rd5, %rd5, %rd6;\n"
2049 " setp.lt.u32 %p3, %r6, %r7;\n"
2050 " @%p3 bra $Lt_19_13826;\n"
2051 " bra.uni $Lt_19_13314;\n"
2052 "$Lt_19_16386:\n"
2053 " mov.f32 %f1, 0f00000000; // 0\n"
2054 "$Lt_19_13314:\n"
2055 " .loc 3 195 0\n"
2056 " mov.f32 %f4, %f1;\n"
2057 " mov.f32 %f5, %f4;\n"
2058 " .loc 3 71 0\n"
2059 " mov.u64 %rd7, __smem;\n"
2060 " cvt.u64.u32 %rd8, %r3;\n"
2061 " mul.wide.u32 %rd9, %r3, 4;\n"
2062 " add.u64 %rd10, %rd7, %rd9;\n"
2063 " st.volatile.shared.f32 [%rd10+0], %f4;\n"
2064 " .loc 3 72 0\n"
2065 " bar.sync 0;\n"
2066 " mov.u32 %r9, 63;\n"
2067 " setp.gt.u32 %p4, %r3, %r9;\n"
2068 " @%p4 bra $Lt_19_14850;\n"
2069 " .loc 3 77 0\n"
2070 " ld.volatile.shared.f32 %f6, [%rd10+256];\n"
2071 " add.f32 %f5, %f6, %f4;\n"
2072 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2073 "$Lt_19_14850:\n"
2074 " bar.sync 0;\n"
2075 " mov.u32 %r10, 31;\n"
2076 " setp.gt.u32 %p5, %r3, %r10;\n"
2077 " @%p5 bra $Lt_19_15362;\n"
2078 " .loc 3 83 0\n"
2079 " ld.volatile.shared.f32 %f7, [%rd10+128];\n"
2080 " add.f32 %f8, %f7, %f5;\n"
2081 " st.volatile.shared.f32 [%rd10+0], %f8;\n"
2082 " .loc 3 84 0\n"
2083 " ld.volatile.shared.f32 %f9, [%rd10+64];\n"
2084 " add.f32 %f10, %f9, %f8;\n"
2085 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
2086 " .loc 3 85 0\n"
2087 " ld.volatile.shared.f32 %f11, [%rd10+32];\n"
2088 " add.f32 %f12, %f11, %f10;\n"
2089 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
2090 " .loc 3 86 0\n"
2091 " ld.volatile.shared.f32 %f13, [%rd10+16];\n"
2092 " add.f32 %f14, %f13, %f12;\n"
2093 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
2094 " .loc 3 87 0\n"
2095 " ld.volatile.shared.f32 %f15, [%rd10+8];\n"
2096 " add.f32 %f16, %f15, %f14;\n"
2097 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
2098 " .loc 3 88 0\n"
2099 " ld.volatile.shared.f32 %f17, [%rd10+4];\n"
2100 " add.f32 %f5, %f17, %f16;\n"
2101 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2102 "$Lt_19_15362:\n"
2103 " .loc 3 195 0\n"
2104 " mov.u32 %r11, 0;\n"
2105 " setp.ne.u32 %p6, %r3, %r11;\n"
2106 " @%p6 bra $Lt_19_15874;\n"
2107 " .loc 3 199 0\n"
2108 " ld.shared.f32 %f18, [__smem+0];\n"
2109 " ld.param.u64 %rd11, [__cudaparm_reduce_float_128_false_g_odata];\n"
2110 " cvt.u64.u32 %rd12, %r1;\n"
2111 " mul.wide.u32 %rd13, %r1, 4;\n"
2112 " add.u64 %rd14, %rd11, %rd13;\n"
2113 " st.global.f32 [%rd14+0], %f18;\n"
2114 "$Lt_19_15874:\n"
2115 " .loc 3 461 0\n"
2116 " exit;\n"
2117 "$LDWend_reduce_float_128_false:\n"
2118 " } // reduce_float_128_false\n"
2119 "\n"
2120 " .entry reduce_float_256_false (\n"
2121 " .param .u64 __cudaparm_reduce_float_256_false_g_idata,\n"
2122 " .param .u64 __cudaparm_reduce_float_256_false_g_odata,\n"
2123 " .param .u32 __cudaparm_reduce_float_256_false_n)\n"
2124 " {\n"
2125 " .reg .u16 %rh<3>;\n"
2126 " .reg .u32 %r<14>;\n"
2127 " .reg .u64 %rd<16>;\n"
2128 " .reg .f32 %f<21>;\n"
2129 " .reg .pred %p<9>;\n"
2130 " .loc 3 463 0\n"
2131 "$LDWbegin_reduce_float_256_false:\n"
2132 " .loc 3 181 0\n"
2133 " cvt.u32.u16 %r1, %ctaid.x;\n"
2134 " mul.lo.u32 %r2, %r1, 512;\n"
2135 " cvt.u32.u16 %r3, %tid.x;\n"
2136 " add.u32 %r4, %r2, %r3;\n"
2137 " ld.param.u32 %r5, [__cudaparm_reduce_float_256_false_n];\n"
2138 " setp.ge.u32 %p1, %r4, %r5;\n"
2139 " @%p1 bra $Lt_20_16642;\n"
2140 " add.u32 %r6, %r4, 256;\n"
2141 " ld.param.u32 %r5, [__cudaparm_reduce_float_256_false_n];\n"
2142 " add.u32 %r7, %r5, 256;\n"
2143 " mov.u16 %rh1, %nctaid.x;\n"
2144 " mul.wide.u16 %r8, %rh1, 512;\n"
2145 " cvt.s64.u32 %rd1, %r8;\n"
2146 " ld.param.u64 %rd2, [__cudaparm_reduce_float_256_false_g_idata];\n"
2147 " cvt.u64.u32 %rd3, %r4;\n"
2148 " mul.wide.u32 %rd4, %r4, 4;\n"
2149 " add.u64 %rd5, %rd2, %rd4;\n"
2150 " mul.wide.u32 %rd6, %r8, 4;\n"
2151 " mov.f32 %f1, 0f00000000; // 0\n"
2152 "$Lt_20_13570:\n"
2153 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2154 " .loc 3 188 0\n"
2155 " ld.global.f32 %f2, [%rd5+0];\n"
2156 " add.f32 %f1, %f2, %f1;\n"
2157 " .loc 3 181 0\n"
2158 " ld.param.u32 %r5, [__cudaparm_reduce_float_256_false_n];\n"
2159 " .loc 3 188 0\n"
2160 " setp.ge.u32 %p2, %r6, %r5;\n"
2161 " @%p2 bra $Lt_20_13826;\n"
2162 " //<loop> Part of loop body line 181, head labeled $Lt_20_13570\n"
2163 " .loc 3 191 0\n"
2164 " ld.global.f32 %f3, [%rd5+1024];\n"
2165 " add.f32 %f1, %f3, %f1;\n"
2166 "$Lt_20_13826:\n"
2167 " //<loop> Part of loop body line 181, head labeled $Lt_20_13570\n"
2168 " add.u32 %r6, %r6, %r8;\n"
2169 " add.u64 %rd5, %rd5, %rd6;\n"
2170 " setp.lt.u32 %p3, %r6, %r7;\n"
2171 " @%p3 bra $Lt_20_13570;\n"
2172 " bra.uni $Lt_20_13058;\n"
2173 "$Lt_20_16642:\n"
2174 " mov.f32 %f1, 0f00000000; // 0\n"
2175 "$Lt_20_13058:\n"
2176 " .loc 3 195 0\n"
2177 " mov.f32 %f4, %f1;\n"
2178 " mov.f32 %f5, %f4;\n"
2179 " .loc 3 71 0\n"
2180 " mov.u64 %rd7, __smem;\n"
2181 " cvt.u64.u32 %rd8, %r3;\n"
2182 " mul.wide.u32 %rd9, %r3, 4;\n"
2183 " add.u64 %rd10, %rd7, %rd9;\n"
2184 " st.volatile.shared.f32 [%rd10+0], %f4;\n"
2185 " .loc 3 72 0\n"
2186 " bar.sync 0;\n"
2187 " mov.u32 %r9, 127;\n"
2188 " setp.gt.u32 %p4, %r3, %r9;\n"
2189 " @%p4 bra $Lt_20_14594;\n"
2190 " .loc 3 76 0\n"
2191 " ld.volatile.shared.f32 %f6, [%rd10+512];\n"
2192 " add.f32 %f5, %f6, %f4;\n"
2193 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2194 "$Lt_20_14594:\n"
2195 " bar.sync 0;\n"
2196 " mov.u32 %r10, 63;\n"
2197 " setp.gt.u32 %p5, %r3, %r10;\n"
2198 " @%p5 bra $Lt_20_15106;\n"
2199 " .loc 3 77 0\n"
2200 " ld.volatile.shared.f32 %f7, [%rd10+256];\n"
2201 " add.f32 %f5, %f7, %f5;\n"
2202 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2203 "$Lt_20_15106:\n"
2204 " bar.sync 0;\n"
2205 " mov.u32 %r11, 31;\n"
2206 " setp.gt.u32 %p6, %r3, %r11;\n"
2207 " @%p6 bra $Lt_20_15618;\n"
2208 " .loc 3 83 0\n"
2209 " ld.volatile.shared.f32 %f8, [%rd10+128];\n"
2210 " add.f32 %f9, %f8, %f5;\n"
2211 " st.volatile.shared.f32 [%rd10+0], %f9;\n"
2212 " .loc 3 84 0\n"
2213 " ld.volatile.shared.f32 %f10, [%rd10+64];\n"
2214 " add.f32 %f11, %f10, %f9;\n"
2215 " st.volatile.shared.f32 [%rd10+0], %f11;\n"
2216 " .loc 3 85 0\n"
2217 " ld.volatile.shared.f32 %f12, [%rd10+32];\n"
2218 " add.f32 %f13, %f12, %f11;\n"
2219 " st.volatile.shared.f32 [%rd10+0], %f13;\n"
2220 " .loc 3 86 0\n"
2221 " ld.volatile.shared.f32 %f14, [%rd10+16];\n"
2222 " add.f32 %f15, %f14, %f13;\n"
2223 " st.volatile.shared.f32 [%rd10+0], %f15;\n"
2224 " .loc 3 87 0\n"
2225 " ld.volatile.shared.f32 %f16, [%rd10+8];\n"
2226 " add.f32 %f17, %f16, %f15;\n"
2227 " st.volatile.shared.f32 [%rd10+0], %f17;\n"
2228 " .loc 3 88 0\n"
2229 " ld.volatile.shared.f32 %f18, [%rd10+4];\n"
2230 " add.f32 %f5, %f18, %f17;\n"
2231 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2232 "$Lt_20_15618:\n"
2233 " .loc 3 195 0\n"
2234 " mov.u32 %r12, 0;\n"
2235 " setp.ne.u32 %p7, %r3, %r12;\n"
2236 " @%p7 bra $Lt_20_16130;\n"
2237 " .loc 3 199 0\n"
2238 " ld.shared.f32 %f19, [__smem+0];\n"
2239 " ld.param.u64 %rd11, [__cudaparm_reduce_float_256_false_g_odata];\n"
2240 " cvt.u64.u32 %rd12, %r1;\n"
2241 " mul.wide.u32 %rd13, %r1, 4;\n"
2242 " add.u64 %rd14, %rd11, %rd13;\n"
2243 " st.global.f32 [%rd14+0], %f19;\n"
2244 "$Lt_20_16130:\n"
2245 " .loc 3 466 0\n"
2246 " exit;\n"
2247 "$LDWend_reduce_float_256_false:\n"
2248 " } // reduce_float_256_false\n"
2249 "\n"
2250 " .entry reduce_float_512_false (\n"
2251 " .param .u64 __cudaparm_reduce_float_512_false_g_idata,\n"
2252 " .param .u64 __cudaparm_reduce_float_512_false_g_odata,\n"
2253 " .param .u32 __cudaparm_reduce_float_512_false_n)\n"
2254 " {\n"
2255 " .reg .u16 %rh<3>;\n"
2256 " .reg .u32 %r<15>;\n"
2257 " .reg .u64 %rd<16>;\n"
2258 " .reg .f32 %f<22>;\n"
2259 " .reg .pred %p<10>;\n"
2260 " .loc 3 468 0\n"
2261 "$LDWbegin_reduce_float_512_false:\n"
2262 " .loc 3 181 0\n"
2263 " cvt.u32.u16 %r1, %ctaid.x;\n"
2264 " mul.lo.u32 %r2, %r1, 1024;\n"
2265 " cvt.u32.u16 %r3, %tid.x;\n"
2266 " add.u32 %r4, %r2, %r3;\n"
2267 " ld.param.u32 %r5, [__cudaparm_reduce_float_512_false_n];\n"
2268 " setp.ge.u32 %p1, %r4, %r5;\n"
2269 " @%p1 bra $Lt_21_16898;\n"
2270 " add.u32 %r6, %r4, 512;\n"
2271 " ld.param.u32 %r5, [__cudaparm_reduce_float_512_false_n];\n"
2272 " add.u32 %r7, %r5, 512;\n"
2273 " mov.u16 %rh1, %nctaid.x;\n"
2274 " mul.wide.u16 %r8, %rh1, 1024;\n"
2275 " cvt.s64.u32 %rd1, %r8;\n"
2276 " ld.param.u64 %rd2, [__cudaparm_reduce_float_512_false_g_idata];\n"
2277 " cvt.u64.u32 %rd3, %r4;\n"
2278 " mul.wide.u32 %rd4, %r4, 4;\n"
2279 " add.u64 %rd5, %rd2, %rd4;\n"
2280 " mul.wide.u32 %rd6, %r8, 4;\n"
2281 " mov.f32 %f1, 0f00000000; // 0\n"
2282 "$Lt_21_13314:\n"
2283 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2284 " .loc 3 188 0\n"
2285 " ld.global.f32 %f2, [%rd5+0];\n"
2286 " add.f32 %f1, %f2, %f1;\n"
2287 " .loc 3 181 0\n"
2288 " ld.param.u32 %r5, [__cudaparm_reduce_float_512_false_n];\n"
2289 " .loc 3 188 0\n"
2290 " setp.ge.u32 %p2, %r6, %r5;\n"
2291 " @%p2 bra $Lt_21_13570;\n"
2292 " //<loop> Part of loop body line 181, head labeled $Lt_21_13314\n"
2293 " .loc 3 191 0\n"
2294 " ld.global.f32 %f3, [%rd5+2048];\n"
2295 " add.f32 %f1, %f3, %f1;\n"
2296 "$Lt_21_13570:\n"
2297 " //<loop> Part of loop body line 181, head labeled $Lt_21_13314\n"
2298 " add.u32 %r6, %r6, %r8;\n"
2299 " add.u64 %rd5, %rd5, %rd6;\n"
2300 " setp.lt.u32 %p3, %r6, %r7;\n"
2301 " @%p3 bra $Lt_21_13314;\n"
2302 " bra.uni $Lt_21_12802;\n"
2303 "$Lt_21_16898:\n"
2304 " mov.f32 %f1, 0f00000000; // 0\n"
2305 "$Lt_21_12802:\n"
2306 " .loc 3 195 0\n"
2307 " mov.f32 %f4, %f1;\n"
2308 " mov.f32 %f5, %f4;\n"
2309 " .loc 3 71 0\n"
2310 " mov.u64 %rd7, __smem;\n"
2311 " cvt.u64.u32 %rd8, %r3;\n"
2312 " mul.wide.u32 %rd9, %r3, 4;\n"
2313 " add.u64 %rd10, %rd7, %rd9;\n"
2314 " st.volatile.shared.f32 [%rd10+0], %f4;\n"
2315 " .loc 3 72 0\n"
2316 " bar.sync 0;\n"
2317 " mov.u32 %r9, 255;\n"
2318 " setp.gt.u32 %p4, %r3, %r9;\n"
2319 " @%p4 bra $Lt_21_14338;\n"
2320 " .loc 3 75 0\n"
2321 " ld.volatile.shared.f32 %f6, [%rd10+1024];\n"
2322 " add.f32 %f5, %f6, %f4;\n"
2323 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2324 "$Lt_21_14338:\n"
2325 " bar.sync 0;\n"
2326 " mov.u32 %r10, 127;\n"
2327 " setp.gt.u32 %p5, %r3, %r10;\n"
2328 " @%p5 bra $Lt_21_14850;\n"
2329 " .loc 3 76 0\n"
2330 " ld.volatile.shared.f32 %f7, [%rd10+512];\n"
2331 " add.f32 %f5, %f7, %f5;\n"
2332 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2333 "$Lt_21_14850:\n"
2334 " bar.sync 0;\n"
2335 " mov.u32 %r11, 63;\n"
2336 " setp.gt.u32 %p6, %r3, %r11;\n"
2337 " @%p6 bra $Lt_21_15362;\n"
2338 " .loc 3 77 0\n"
2339 " ld.volatile.shared.f32 %f8, [%rd10+256];\n"
2340 " add.f32 %f5, %f8, %f5;\n"
2341 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2342 "$Lt_21_15362:\n"
2343 " bar.sync 0;\n"
2344 " mov.u32 %r12, 31;\n"
2345 " setp.gt.u32 %p7, %r3, %r12;\n"
2346 " @%p7 bra $Lt_21_15874;\n"
2347 " .loc 3 83 0\n"
2348 " ld.volatile.shared.f32 %f9, [%rd10+128];\n"
2349 " add.f32 %f10, %f9, %f5;\n"
2350 " st.volatile.shared.f32 [%rd10+0], %f10;\n"
2351 " .loc 3 84 0\n"
2352 " ld.volatile.shared.f32 %f11, [%rd10+64];\n"
2353 " add.f32 %f12, %f11, %f10;\n"
2354 " st.volatile.shared.f32 [%rd10+0], %f12;\n"
2355 " .loc 3 85 0\n"
2356 " ld.volatile.shared.f32 %f13, [%rd10+32];\n"
2357 " add.f32 %f14, %f13, %f12;\n"
2358 " st.volatile.shared.f32 [%rd10+0], %f14;\n"
2359 " .loc 3 86 0\n"
2360 " ld.volatile.shared.f32 %f15, [%rd10+16];\n"
2361 " add.f32 %f16, %f15, %f14;\n"
2362 " st.volatile.shared.f32 [%rd10+0], %f16;\n"
2363 " .loc 3 87 0\n"
2364 " ld.volatile.shared.f32 %f17, [%rd10+8];\n"
2365 " add.f32 %f18, %f17, %f16;\n"
2366 " st.volatile.shared.f32 [%rd10+0], %f18;\n"
2367 " .loc 3 88 0\n"
2368 " ld.volatile.shared.f32 %f19, [%rd10+4];\n"
2369 " add.f32 %f5, %f19, %f18;\n"
2370 " st.volatile.shared.f32 [%rd10+0], %f5;\n"
2371 "$Lt_21_15874:\n"
2372 " .loc 3 195 0\n"
2373 " mov.u32 %r13, 0;\n"
2374 " setp.ne.u32 %p8, %r3, %r13;\n"
2375 " @%p8 bra $Lt_21_16386;\n"
2376 " .loc 3 199 0\n"
2377 " ld.shared.f32 %f20, [__smem+0];\n"
2378 " ld.param.u64 %rd11, [__cudaparm_reduce_float_512_false_g_odata];\n"
2379 " cvt.u64.u32 %rd12, %r1;\n"
2380 " mul.wide.u32 %rd13, %r1, 4;\n"
2381 " add.u64 %rd14, %rd11, %rd13;\n"
2382 " st.global.f32 [%rd14+0], %f20;\n"
2383 "$Lt_21_16386:\n"
2384 " .loc 3 471 0\n"
2385 " exit;\n"
2386 "$LDWend_reduce_float_512_false:\n"
2387 " } // reduce_float_512_false\n"
2388 "\n"
2389 " .entry tex_reduce_256_false (\n"
2390 " .param .u64 __cudaparm_tex_reduce_256_false_g_odata,\n"
2391 " .param .u32 __cudaparm_tex_reduce_256_false_n,\n"
2392 " .param .u32 __cudaparm_tex_reduce_256_false_stride)\n"
2393 " {\n"
2394 " .reg .u16 %rh<3>;\n"
2395 " .reg .u32 %r<19>;\n"
2396 " .reg .u64 %rd<10>;\n"
2397 " .reg .f32 %f<37>;\n"
2398 " .reg .pred %p<9>;\n"
2399 " .loc 3 477 0\n"
2400 "$LDWbegin_tex_reduce_256_false:\n"
2401 " .loc 3 240 0\n"
2402 " cvt.u32.u16 %r1, %ctaid.x;\n"
2403 " mul.lo.u32 %r2, %r1, 512;\n"
2404 " cvt.u32.u16 %r3, %tid.x;\n"
2405 " add.u32 %r4, %r2, %r3;\n"
2406 " mov.s32 %r5, %r4;\n"
2407 " ld.param.u32 %r6, [__cudaparm_tex_reduce_256_false_n];\n"
2408 " setp.ge.u32 %p1, %r4, %r6;\n"
2409 " @%p1 bra $Lt_22_16642;\n"
2410 " mov.u16 %rh1, %nctaid.x;\n"
2411 " mul.wide.u16 %r7, %rh1, 512;\n"
2412 " ld.param.u32 %r8, [__cudaparm_tex_reduce_256_false_stride];\n"
2413 " mov.f32 %f1, 0f00000000; // 0\n"
2414 "$Lt_22_13570:\n"
2415 " //<loop> Loop body line 240, nesting depth: 1, estimated iterations: unknown\n"
2416 " ld.param.u32 %r8, [__cudaparm_tex_reduce_256_false_stride];\n"
2417 " rem.u32 %r9, %r5, %r8;\n"
2418 " cvt.rn.f32.u32 %f2, %r9;\n"
2419 " div.u32 %r10, %r5, %r8;\n"
2420 " cvt.rn.f32.u32 %f3, %r10;\n"
2421 " mov.f32 %f4, 0f00000000; // 0\n"
2422 " mov.f32 %f5, 0f00000000; // 0\n"
2423 " tex.2d.v4.f32.f32 {%f6,%f7,%f8,%f9},[tex_ref_1,{%f2,%f3,%f4,%f5}];\n"
2424 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2425 " .loc 3 248 0\n"
2426 " mov.f32 %f10, %f6;\n"
2427 " add.f32 %f1, %f10, %f1;\n"
2428 " add.u32 %r11, %r5, 256;\n"
2429 " .loc 3 240 0\n"
2430 " ld.param.u32 %r6, [__cudaparm_tex_reduce_256_false_n];\n"
2431 " .loc 3 248 0\n"
2432 " setp.ge.u32 %p2, %r11, %r6;\n"
2433 " @%p2 bra $Lt_22_13826;\n"
2434 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2435 " .loc 3 240 0\n"
2436 " ld.param.u32 %r8, [__cudaparm_tex_reduce_256_false_stride];\n"
2437 " .loc 3 248 0\n"
2438 " rem.u32 %r12, %r11, %r8;\n"
2439 " cvt.rn.f32.u32 %f11, %r12;\n"
2440 " div.u32 %r13, %r11, %r8;\n"
2441 " cvt.rn.f32.u32 %f12, %r13;\n"
2442 " mov.f32 %f13, 0f00000000; // 0\n"
2443 " mov.f32 %f14, 0f00000000; // 0\n"
2444 " tex.2d.v4.f32.f32 {%f15,%f16,%f17,%f18},[tex_ref_1,{%f11,%f12,%f13,%f14}];\n"
2445 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2446 " .loc 3 253 0\n"
2447 " mov.f32 %f19, %f15;\n"
2448 " add.f32 %f1, %f19, %f1;\n"
2449 "$Lt_22_13826:\n"
2450 " //<loop> Part of loop body line 240, head labeled $Lt_22_13570\n"
2451 " add.u32 %r5, %r7, %r5;\n"
2452 " .loc 3 240 0\n"
2453 " ld.param.u32 %r6, [__cudaparm_tex_reduce_256_false_n];\n"
2454 " .loc 3 253 0\n"
2455 " setp.lt.u32 %p3, %r5, %r6;\n"
2456 " @%p3 bra $Lt_22_13570;\n"
2457 " bra.uni $Lt_22_13058;\n"
2458 "$Lt_22_16642:\n"
2459 " mov.f32 %f1, 0f00000000; // 0\n"
2460 "$Lt_22_13058:\n"
2461 " .loc 3 258 0\n"
2462 " mov.f32 %f20, %f1;\n"
2463 " mov.f32 %f21, %f20;\n"
2464 " .loc 3 71 0\n"
2465 " mov.u64 %rd1, __smem;\n"
2466 " cvt.u64.u32 %rd2, %r3;\n"
2467 " mul.wide.u32 %rd3, %r3, 4;\n"
2468 " add.u64 %rd4, %rd1, %rd3;\n"
2469 " st.volatile.shared.f32 [%rd4+0], %f20;\n"
2470 " .loc 3 72 0\n"
2471 " bar.sync 0;\n"
2472 " mov.u32 %r14, 127;\n"
2473 " setp.gt.u32 %p4, %r3, %r14;\n"
2474 " @%p4 bra $Lt_22_14594;\n"
2475 " .loc 3 76 0\n"
2476 " ld.volatile.shared.f32 %f22, [%rd4+512];\n"
2477 " add.f32 %f21, %f22, %f20;\n"
2478 " st.volatile.shared.f32 [%rd4+0], %f21;\n"
2479 "$Lt_22_14594:\n"
2480 " bar.sync 0;\n"
2481 " mov.u32 %r15, 63;\n"
2482 " setp.gt.u32 %p5, %r3, %r15;\n"
2483 " @%p5 bra $Lt_22_15106;\n"
2484 " .loc 3 77 0\n"
2485 " ld.volatile.shared.f32 %f23, [%rd4+256];\n"
2486 " add.f32 %f21, %f23, %f21;\n"
2487 " st.volatile.shared.f32 [%rd4+0], %f21;\n"
2488 "$Lt_22_15106:\n"
2489 " bar.sync 0;\n"
2490 " mov.u32 %r16, 31;\n"
2491 " setp.gt.u32 %p6, %r3, %r16;\n"
2492 " @%p6 bra $Lt_22_15618;\n"
2493 " .loc 3 83 0\n"
2494 " ld.volatile.shared.f32 %f24, [%rd4+128];\n"
2495 " add.f32 %f25, %f24, %f21;\n"
2496 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2497 " .loc 3 84 0\n"
2498 " ld.volatile.shared.f32 %f26, [%rd4+64];\n"
2499 " add.f32 %f27, %f26, %f25;\n"
2500 " st.volatile.shared.f32 [%rd4+0], %f27;\n"
2501 " .loc 3 85 0\n"
2502 " ld.volatile.shared.f32 %f28, [%rd4+32];\n"
2503 " add.f32 %f29, %f28, %f27;\n"
2504 " st.volatile.shared.f32 [%rd4+0], %f29;\n"
2505 " .loc 3 86 0\n"
2506 " ld.volatile.shared.f32 %f30, [%rd4+16];\n"
2507 " add.f32 %f31, %f30, %f29;\n"
2508 " st.volatile.shared.f32 [%rd4+0], %f31;\n"
2509 " .loc 3 87 0\n"
2510 " ld.volatile.shared.f32 %f32, [%rd4+8];\n"
2511 " add.f32 %f33, %f32, %f31;\n"
2512 " st.volatile.shared.f32 [%rd4+0], %f33;\n"
2513 " .loc 3 88 0\n"
2514 " ld.volatile.shared.f32 %f34, [%rd4+4];\n"
2515 " add.f32 %f21, %f34, %f33;\n"
2516 " st.volatile.shared.f32 [%rd4+0], %f21;\n"
2517 "$Lt_22_15618:\n"
2518 " .loc 3 258 0\n"
2519 " mov.u32 %r17, 0;\n"
2520 " setp.ne.u32 %p7, %r3, %r17;\n"
2521 " @%p7 bra $Lt_22_16130;\n"
2522 " .loc 3 262 0\n"
2523 " ld.shared.f32 %f35, [__smem+0];\n"
2524 " ld.param.u64 %rd5, [__cudaparm_tex_reduce_256_false_g_odata];\n"
2525 " cvt.u64.u32 %rd6, %r1;\n"
2526 " mul.wide.u32 %rd7, %r1, 4;\n"
2527 " add.u64 %rd8, %rd5, %rd7;\n"
2528 " st.global.f32 [%rd8+0], %f35;\n"
2529 "$Lt_22_16130:\n"
2530 " .loc 3 480 0\n"
2531 " exit;\n"
2532 "$LDWend_tex_reduce_256_false:\n"
2533 " } // tex_reduce_256_false\n"
2534 "\n"
2535 " .entry tex_count_256_false (\n"
2536 " .param .u64 __cudaparm_tex_count_256_false_g_odata,\n"
2537 " .param .u32 __cudaparm_tex_count_256_false_n,\n"
2538 " .param .u32 __cudaparm_tex_count_256_false_stride)\n"
2539 " {\n"
2540 " .reg .u16 %rh<3>;\n"
2541 " .reg .u32 %r<19>;\n"
2542 " .reg .u64 %rd<10>;\n"
2543 " .reg .f32 %f<41>;\n"
2544 " .reg .f64 %fd<6>;\n"
2545 " .reg .pred %p<11>;\n"
2546 " .loc 3 482 0\n"
2547 "$LDWbegin_tex_count_256_false:\n"
2548 " .loc 3 333 0\n"
2549 " cvt.u32.u16 %r1, %ctaid.x;\n"
2550 " mul.lo.u32 %r2, %r1, 512;\n"
2551 " cvt.u32.u16 %r3, %tid.x;\n"
2552 " add.u32 %r4, %r2, %r3;\n"
2553 " mov.s32 %r5, %r4;\n"
2554 " ld.param.u32 %r6, [__cudaparm_tex_count_256_false_n];\n"
2555 " setp.ge.u32 %p1, %r4, %r6;\n"
2556 " @%p1 bra $Lt_23_18178;\n"
2557 " mov.u16 %rh1, %nctaid.x;\n"
2558 " mul.wide.u16 %r7, %rh1, 512;\n"
2559 " ld.param.u32 %r8, [__cudaparm_tex_count_256_false_stride];\n"
2560 " mov.f32 %f1, 0f00000000; // 0\n"
2561 "$Lt_23_15106:\n"
2562 " //<loop> Loop body line 333, nesting depth: 1, estimated iterations: unknown\n"
2563 " ld.param.u32 %r8, [__cudaparm_tex_count_256_false_stride];\n"
2564 " rem.u32 %r9, %r5, %r8;\n"
2565 " cvt.rn.f32.u32 %f2, %r9;\n"
2566 " div.u32 %r10, %r5, %r8;\n"
2567 " cvt.rn.f32.u32 %f3, %r10;\n"
2568 " mov.f32 %f4, 0f00000000; // 0\n"
2569 " mov.f32 %f5, 0f00000000; // 0\n"
2570 " tex.2d.v4.f32.f32 {%f6,%f7,%f8,%f9},[tex_ref_1,{%f2,%f3,%f4,%f5}];\n"
2571 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2572 " .loc 3 341 0\n"
2573 " mov.f32 %f10, %f6;\n"
2574 " .loc 3 340 0\n"
2575 " mov.f32 %f11, 0f3f800000; // 1\n"
2576 " add.f32 %f12, %f1, %f11;\n"
2577 " cvt.f64.f32 %fd1, %f10;\n"
2578 " mov.f64 %fd2, 0d0000000000000000; // 0\n"
2579 " setp.ne.f64 %p2, %fd1, %fd2;\n"
2580 " selp.f32 %f1, %f12, %f1, %p2;\n"
2581 " add.u32 %r11, %r5, 256;\n"
2582 " .loc 3 333 0\n"
2583 " ld.param.u32 %r6, [__cudaparm_tex_count_256_false_n];\n"
2584 " .loc 3 340 0\n"
2585 " setp.ge.u32 %p3, %r11, %r6;\n"
2586 " @%p3 bra $Lt_23_15362;\n"
2587 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2588 " .loc 3 333 0\n"
2589 " ld.param.u32 %r8, [__cudaparm_tex_count_256_false_stride];\n"
2590 " .loc 3 340 0\n"
2591 " rem.u32 %r12, %r11, %r8;\n"
2592 " cvt.rn.f32.u32 %f13, %r12;\n"
2593 " div.u32 %r13, %r11, %r8;\n"
2594 " cvt.rn.f32.u32 %f14, %r13;\n"
2595 " mov.f32 %f15, 0f00000000; // 0\n"
2596 " mov.f32 %f16, 0f00000000; // 0\n"
2597 " tex.2d.v4.f32.f32 {%f17,%f18,%f19,%f20},[tex_ref_1,{%f13,%f14,%f15,%f16}];\n"
2598 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2599 " .loc 3 348 0\n"
2600 " mov.f32 %f21, %f17;\n"
2601 " .loc 3 347 0\n"
2602 " mov.f32 %f22, 0f3f800000; // 1\n"
2603 " add.f32 %f23, %f1, %f22;\n"
2604 " cvt.f64.f32 %fd3, %f21;\n"
2605 " mov.f64 %fd4, 0d0000000000000000; // 0\n"
2606 " setp.ne.f64 %p4, %fd3, %fd4;\n"
2607 " selp.f32 %f1, %f23, %f1, %p4;\n"
2608 "$Lt_23_15362:\n"
2609 " //<loop> Part of loop body line 333, head labeled $Lt_23_15106\n"
2610 " add.u32 %r5, %r7, %r5;\n"
2611 " .loc 3 333 0\n"
2612 " ld.param.u32 %r6, [__cudaparm_tex_count_256_false_n];\n"
2613 " .loc 3 347 0\n"
2614 " setp.lt.u32 %p5, %r5, %r6;\n"
2615 " @%p5 bra $Lt_23_15106;\n"
2616 " bra.uni $Lt_23_14594;\n"
2617 "$Lt_23_18178:\n"
2618 " mov.f32 %f1, 0f00000000; // 0\n"
2619 "$Lt_23_14594:\n"
2620 " .loc 3 355 0\n"
2621 " mov.f32 %f24, %f1;\n"
2622 " mov.f32 %f25, %f24;\n"
2623 " .loc 3 71 0\n"
2624 " mov.u64 %rd1, __smem;\n"
2625 " cvt.u64.u32 %rd2, %r3;\n"
2626 " mul.wide.u32 %rd3, %r3, 4;\n"
2627 " add.u64 %rd4, %rd1, %rd3;\n"
2628 " st.volatile.shared.f32 [%rd4+0], %f24;\n"
2629 " .loc 3 72 0\n"
2630 " bar.sync 0;\n"
2631 " mov.u32 %r14, 127;\n"
2632 " setp.gt.u32 %p6, %r3, %r14;\n"
2633 " @%p6 bra $Lt_23_16130;\n"
2634 " .loc 3 76 0\n"
2635 " ld.volatile.shared.f32 %f26, [%rd4+512];\n"
2636 " add.f32 %f25, %f26, %f24;\n"
2637 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2638 "$Lt_23_16130:\n"
2639 " bar.sync 0;\n"
2640 " mov.u32 %r15, 63;\n"
2641 " setp.gt.u32 %p7, %r3, %r15;\n"
2642 " @%p7 bra $Lt_23_16642;\n"
2643 " .loc 3 77 0\n"
2644 " ld.volatile.shared.f32 %f27, [%rd4+256];\n"
2645 " add.f32 %f25, %f27, %f25;\n"
2646 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2647 "$Lt_23_16642:\n"
2648 " bar.sync 0;\n"
2649 " mov.u32 %r16, 31;\n"
2650 " setp.gt.u32 %p8, %r3, %r16;\n"
2651 " @%p8 bra $Lt_23_17154;\n"
2652 " .loc 3 83 0\n"
2653 " ld.volatile.shared.f32 %f28, [%rd4+128];\n"
2654 " add.f32 %f29, %f28, %f25;\n"
2655 " st.volatile.shared.f32 [%rd4+0], %f29;\n"
2656 " .loc 3 84 0\n"
2657 " ld.volatile.shared.f32 %f30, [%rd4+64];\n"
2658 " add.f32 %f31, %f30, %f29;\n"
2659 " st.volatile.shared.f32 [%rd4+0], %f31;\n"
2660 " .loc 3 85 0\n"
2661 " ld.volatile.shared.f32 %f32, [%rd4+32];\n"
2662 " add.f32 %f33, %f32, %f31;\n"
2663 " st.volatile.shared.f32 [%rd4+0], %f33;\n"
2664 " .loc 3 86 0\n"
2665 " ld.volatile.shared.f32 %f34, [%rd4+16];\n"
2666 " add.f32 %f35, %f34, %f33;\n"
2667 " st.volatile.shared.f32 [%rd4+0], %f35;\n"
2668 " .loc 3 87 0\n"
2669 " ld.volatile.shared.f32 %f36, [%rd4+8];\n"
2670 " add.f32 %f37, %f36, %f35;\n"
2671 " st.volatile.shared.f32 [%rd4+0], %f37;\n"
2672 " .loc 3 88 0\n"
2673 " ld.volatile.shared.f32 %f38, [%rd4+4];\n"
2674 " add.f32 %f25, %f38, %f37;\n"
2675 " st.volatile.shared.f32 [%rd4+0], %f25;\n"
2676 "$Lt_23_17154:\n"
2677 " .loc 3 355 0\n"
2678 " mov.u32 %r17, 0;\n"
2679 " setp.ne.u32 %p9, %r3, %r17;\n"
2680 " @%p9 bra $Lt_23_17666;\n"
2681 " .loc 3 359 0\n"
2682 " ld.shared.f32 %f39, [__smem+0];\n"
2683 " ld.param.u64 %rd5, [__cudaparm_tex_count_256_false_g_odata];\n"
2684 " cvt.u64.u32 %rd6, %r1;\n"
2685 " mul.wide.u32 %rd7, %r1, 4;\n"
2686 " add.u64 %rd8, %rd5, %rd7;\n"
2687 " st.global.f32 [%rd8+0], %f39;\n"
2688 "$Lt_23_17666:\n"
2689 " .loc 3 485 0\n"
2690 " exit;\n"
2691 "$LDWend_tex_count_256_false:\n"
2692 " } // tex_count_256_false\n"
2693 "\n"
/*
 * PTX kernel chamfer_reduce_256_false (the .loc directives refer to file 3,
 * reduce.cu).
 *
 * Parameters:
 *   g_odata  u64 address; one f32 is written at g_odata + 4 * %ctaid.x.
 *   n        u32 upper bound (exclusive) on the linear index i.
 *   stride   u32 divisor used to split i into two texture coordinates.
 *
 * Per thread:
 *   i = %ctaid.x * 512 + %tid.x. If i >= n the partial sum is 0. Otherwise,
 *   per iteration: x = i % stride, y = i / stride (converted to f32), and
 *   sum += (first component of tex_ref_1 at (x, y))
 *        * (first component of tex_ref_2 at (x, y)).
 *   The same is done for index i + 256, but only when i + 256 < n.
 *   Then i += %nctaid.x * 512, repeating while i < n.
 *
 * Per block:
 *   Each thread stores its sum at __smem + 4 * tid. With bar.sync between the
 *   steps, threads with tid <= 127 add the element 128 slots ahead, then
 *   threads with tid <= 63 add the element 64 slots ahead. Threads with
 *   tid <= 31 then add the elements 32, 16, 8, 4, 2 and 1 slots ahead with no
 *   barrier in between. Thread 0 finally copies __smem[0] to g_odata.
 *
 * NOTE(review): stride == 0 is not guarded before rem.u32/div.u32; presumably
 *   the host never passes 0 -- confirm.
 * NOTE(review): the barrier-free tail presumably relies on threads 0..31
 *   running in lockstep together with the volatile accesses -- confirm for the
 *   target hardware.
 * NOTE(review): assumes a 256-thread block and at least 256 floats of __smem
 *   -- TODO confirm at the launch site.
 */
2694 " .entry chamfer_reduce_256_false (\n"
2695 " .param .u64 __cudaparm_chamfer_reduce_256_false_g_odata,\n"
2696 " .param .u32 __cudaparm_chamfer_reduce_256_false_n,\n"
2697 " .param .u32 __cudaparm_chamfer_reduce_256_false_stride)\n"
2698 " {\n"
2699 " .reg .u16 %rh<3>;\n"
2700 " .reg .u32 %r<19>;\n"
2701 " .reg .u64 %rd<10>;\n"
2702 " .reg .f32 %f<59>;\n"
2703 " .reg .pred %p<9>;\n"
2704 " .loc 3 487 0\n"
2705 "$LDWbegin_chamfer_reduce_256_false:\n"
2706 " .loc 3 287 0\n"
/* i (%r5) = %ctaid.x * 512 + %tid.x; branch past the loop when i >= n. */
2707 " cvt.u32.u16 %r1, %ctaid.x;\n"
2708 " mul.lo.u32 %r2, %r1, 512;\n"
2709 " cvt.u32.u16 %r3, %tid.x;\n"
2710 " add.u32 %r4, %r2, %r3;\n"
2711 " mov.s32 %r5, %r4;\n"
2712 " ld.param.u32 %r6, [__cudaparm_chamfer_reduce_256_false_n];\n"
2713 " setp.ge.u32 %p1, %r4, %r6;\n"
2714 " @%p1 bra $Lt_24_16642;\n"
/* %r7 = %nctaid.x * 512 is the per-iteration index step; %f1 is the running sum. */
2715 " mov.u16 %rh1, %nctaid.x;\n"
2716 " mul.wide.u16 %r7, %rh1, 512;\n"
2717 " ld.param.u32 %r8, [__cudaparm_chamfer_reduce_256_false_stride];\n"
2718 " mov.f32 %f1, 0f00000000; // 0\n"
2719 "$Lt_24_13570:\n"
2720 " //<loop> Loop body line 287, nesting depth: 1, estimated iterations: unknown\n"
2721 " ld.param.u32 %r8, [__cudaparm_chamfer_reduce_256_false_stride];\n"
/* Texture coordinates for index i: (i % stride, i / stride) as f32. */
2722 " rem.u32 %r9, %r5, %r8;\n"
2723 " cvt.rn.f32.u32 %f2, %r9;\n"
2724 " div.u32 %r10, %r5, %r8;\n"
2725 " cvt.rn.f32.u32 %f3, %r10;\n"
2726 " mov.f32 %f4, %f2;\n"
2727 " mov.f32 %f5, %f3;\n"
2728 " mov.f32 %f6, 0f00000000; // 0\n"
2729 " mov.f32 %f7, 0f00000000; // 0\n"
2730 " tex.2d.v4.f32.f32 {%f8,%f9,%f10,%f11},[tex_ref_1,{%f4,%f5,%f6,%f7}];\n"
2731 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2732 " .loc 3 295 0\n"
2733 " mov.f32 %f12, %f8;\n"
2734 " mov.f32 %f13, %f2;\n"
2735 " mov.f32 %f14, %f3;\n"
2736 " mov.f32 %f15, 0f00000000; // 0\n"
2737 " mov.f32 %f16, 0f00000000; // 0\n"
2738 " tex.2d.v4.f32.f32 {%f17,%f18,%f19,%f20},[tex_ref_2,{%f13,%f14,%f15,%f16}];\n"
2739 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
/* sum += tex_ref_1 value * tex_ref_2 value (first components only). */
2740 " mov.f32 %f21, %f17;\n"
2741 " mad.f32 %f1, %f12, %f21, %f1;\n"
/* Second element at index i + 256; skipped when i + 256 >= n. */
2742 " add.u32 %r11, %r5, 256;\n"
2743 " .loc 3 287 0\n"
2744 " ld.param.u32 %r6, [__cudaparm_chamfer_reduce_256_false_n];\n"
2745 " .loc 3 295 0\n"
2746 " setp.ge.u32 %p2, %r11, %r6;\n"
2747 " @%p2 bra $Lt_24_13826;\n"
2748 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2749 " .loc 3 287 0\n"
2750 " ld.param.u32 %r8, [__cudaparm_chamfer_reduce_256_false_stride];\n"
2751 " .loc 3 295 0\n"
2752 " rem.u32 %r12, %r11, %r8;\n"
2753 " cvt.rn.f32.u32 %f22, %r12;\n"
2754 " div.u32 %r13, %r11, %r8;\n"
2755 " cvt.rn.f32.u32 %f23, %r13;\n"
2756 " mov.f32 %f24, %f22;\n"
2757 " mov.f32 %f25, %f23;\n"
2758 " mov.f32 %f26, 0f00000000; // 0\n"
2759 " mov.f32 %f27, 0f00000000; // 0\n"
2760 " tex.2d.v4.f32.f32 {%f28,%f29,%f30,%f31},[tex_ref_1,{%f24,%f25,%f26,%f27}];\n"
2761 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2762 " .loc 3 300 0\n"
2763 " mov.f32 %f32, %f28;\n"
2764 " mov.f32 %f33, %f22;\n"
2765 " mov.f32 %f34, %f23;\n"
2766 " mov.f32 %f35, 0f00000000; // 0\n"
2767 " mov.f32 %f36, 0f00000000; // 0\n"
2768 " tex.2d.v4.f32.f32 {%f37,%f38,%f39,%f40},[tex_ref_2,{%f33,%f34,%f35,%f36}];\n"
2769 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
2770 " mov.f32 %f41, %f37;\n"
2771 " mad.f32 %f1, %f32, %f41, %f1;\n"
2772 "$Lt_24_13826:\n"
2773 " //<loop> Part of loop body line 287, head labeled $Lt_24_13570\n"
/* i += %nctaid.x * 512; loop again while i < n. */
2774 " add.u32 %r5, %r7, %r5;\n"
2775 " .loc 3 287 0\n"
2776 " ld.param.u32 %r6, [__cudaparm_chamfer_reduce_256_false_n];\n"
2777 " .loc 3 300 0\n"
2778 " setp.lt.u32 %p3, %r5, %r6;\n"
2779 " @%p3 bra $Lt_24_13570;\n"
2780 " bra.uni $Lt_24_13058;\n"
2781 "$Lt_24_16642:\n"
2782 " mov.f32 %f1, 0f00000000; // 0\n"
2783 "$Lt_24_13058:\n"
2784 " .loc 3 305 0\n"
2785 " mov.f32 %f42, %f1;\n"
2786 " mov.f32 %f43, %f42;\n"
2787 " .loc 3 71 0\n"
/* %rd4 = __smem + 4 * tid; publish this thread's sum there, then barrier. */
2788 " mov.u64 %rd1, __smem;\n"
2789 " cvt.u64.u32 %rd2, %r3;\n"
2790 " mul.wide.u32 %rd3, %r3, 4;\n"
2791 " add.u64 %rd4, %rd1, %rd3;\n"
2792 " st.volatile.shared.f32 [%rd4+0], %f42;\n"
2793 " .loc 3 72 0\n"
2794 " bar.sync 0;\n"
/* tid <= 127: add the element 128 slots ahead (byte offset 512). */
2795 " mov.u32 %r14, 127;\n"
2796 " setp.gt.u32 %p4, %r3, %r14;\n"
2797 " @%p4 bra $Lt_24_14594;\n"
2798 " .loc 3 76 0\n"
2799 " ld.volatile.shared.f32 %f44, [%rd4+512];\n"
2800 " add.f32 %f43, %f44, %f42;\n"
2801 " st.volatile.shared.f32 [%rd4+0], %f43;\n"
2802 "$Lt_24_14594:\n"
2803 " bar.sync 0;\n"
/* tid <= 63: add the element 64 slots ahead (byte offset 256). */
2804 " mov.u32 %r15, 63;\n"
2805 " setp.gt.u32 %p5, %r3, %r15;\n"
2806 " @%p5 bra $Lt_24_15106;\n"
2807 " .loc 3 77 0\n"
2808 " ld.volatile.shared.f32 %f45, [%rd4+256];\n"
2809 " add.f32 %f43, %f45, %f43;\n"
2810 " st.volatile.shared.f32 [%rd4+0], %f43;\n"
2811 "$Lt_24_15106:\n"
2812 " bar.sync 0;\n"
/* tid <= 31: add elements 32, 16, 8, 4, 2, 1 slots ahead; no bar.sync between steps. */
2813 " mov.u32 %r16, 31;\n"
2814 " setp.gt.u32 %p6, %r3, %r16;\n"
2815 " @%p6 bra $Lt_24_15618;\n"
2816 " .loc 3 83 0\n"
2817 " ld.volatile.shared.f32 %f46, [%rd4+128];\n"
2818 " add.f32 %f47, %f46, %f43;\n"
2819 " st.volatile.shared.f32 [%rd4+0], %f47;\n"
2820 " .loc 3 84 0\n"
2821 " ld.volatile.shared.f32 %f48, [%rd4+64];\n"
2822 " add.f32 %f49, %f48, %f47;\n"
2823 " st.volatile.shared.f32 [%rd4+0], %f49;\n"
2824 " .loc 3 85 0\n"
2825 " ld.volatile.shared.f32 %f50, [%rd4+32];\n"
2826 " add.f32 %f51, %f50, %f49;\n"
2827 " st.volatile.shared.f32 [%rd4+0], %f51;\n"
2828 " .loc 3 86 0\n"
2829 " ld.volatile.shared.f32 %f52, [%rd4+16];\n"
2830 " add.f32 %f53, %f52, %f51;\n"
2831 " st.volatile.shared.f32 [%rd4+0], %f53;\n"
2832 " .loc 3 87 0\n"
2833 " ld.volatile.shared.f32 %f54, [%rd4+8];\n"
2834 " add.f32 %f55, %f54, %f53;\n"
2835 " st.volatile.shared.f32 [%rd4+0], %f55;\n"
2836 " .loc 3 88 0\n"
2837 " ld.volatile.shared.f32 %f56, [%rd4+4];\n"
2838 " add.f32 %f43, %f56, %f55;\n"
2839 " st.volatile.shared.f32 [%rd4+0], %f43;\n"
2840 "$Lt_24_15618:\n"
2841 " .loc 3 305 0\n"
/* Only thread 0 writes the block result: g_odata + 4 * %ctaid.x = __smem[0]. */
2842 " mov.u32 %r17, 0;\n"
2843 " setp.ne.u32 %p7, %r3, %r17;\n"
2844 " @%p7 bra $Lt_24_16130;\n"
2845 " .loc 3 309 0\n"
2846 " ld.shared.f32 %f57, [__smem+0];\n"
2847 " ld.param.u64 %rd5, [__cudaparm_chamfer_reduce_256_false_g_odata];\n"
2848 " cvt.u64.u32 %rd6, %r1;\n"
2849 " mul.wide.u32 %rd7, %r1, 4;\n"
2850 " add.u64 %rd8, %rd5, %rd7;\n"
2851 " st.global.f32 [%rd8+0], %f57;\n"
2852 "$Lt_24_16130:\n"
2853 " .loc 3 490 0\n"
2854 " exit;\n"
2855 "$LDWend_chamfer_reduce_256_false:\n"
2856 " } // chamfer_reduce_256_false\n"
2857 "\n"
/*
 * PTX kernel reduce_uchar_1_true (the .loc directives refer to file 3,
 * reduce.cu).
 *
 * Parameters:
 *   g_idata  u64 address, read one byte at a time (ld.global.u8).
 *   g_odata  u64 address; one f32 is written at g_odata + 4 * %ctaid.x.
 *   n        u32 upper bound (exclusive) on the linear index i.
 *
 * Per thread: i = %ctaid.x * 2 + %tid.x. If i >= n the partial sum is 0.
 * Otherwise each iteration adds the bytes at g_idata + i and g_idata + i + 1
 * (converted to f32), then advances i by %nctaid.x * 2 while i < n.
 *
 * Per block: the sum is stored at __smem + 4 * tid. After one bar.sync there
 * are no combining steps in this variant; thread 0 copies __smem[0] to g_odata.
 *
 * NOTE(review): the second load (offset +1) is not compared against n;
 *   presumably the host guarantees it is in range for this "_true" variant
 *   -- confirm.
 */
2858 " .entry reduce_uchar_1_true (\n"
2859 " .param .u64 __cudaparm_reduce_uchar_1_true_g_idata,\n"
2860 " .param .u64 __cudaparm_reduce_uchar_1_true_g_odata,\n"
2861 " .param .u32 __cudaparm_reduce_uchar_1_true_n)\n"
2862 " {\n"
2863 " .reg .u16 %rh<3>;\n"
2864 " .reg .u32 %r<12>;\n"
2865 " .reg .u64 %rd<14>;\n"
2866 " .reg .f32 %f<7>;\n"
2867 " .reg .pred %p<5>;\n"
2868 " .loc 3 500 0\n"
2869 "$LDWbegin_reduce_uchar_1_true:\n"
2870 " .loc 3 181 0\n"
/* i (%r5) = %ctaid.x * 2 + %tid.x; branch past the loop when i >= n. */
2871 " cvt.u32.u16 %r1, %ctaid.x;\n"
2872 " mul24.lo.u32 %r2, %r1, 2;\n"
2873 " cvt.u32.u16 %r3, %tid.x;\n"
2874 " add.u32 %r4, %r2, %r3;\n"
2875 " mov.s32 %r5, %r4;\n"
2876 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_1_true_n];\n"
2877 " setp.ge.u32 %p1, %r4, %r6;\n"
2878 " @%p1 bra $Lt_25_16642;\n"
/* %r7 = %nctaid.x * 2 is the index step; %rd3 = g_idata + i is the read pointer. */
2879 " mov.u16 %rh1, %nctaid.x;\n"
2880 " mul.wide.u16 %r7, %rh1, 2;\n"
2881 " cvt.u64.u32 %rd1, %r4;\n"
2882 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_1_true_g_idata];\n"
2883 " add.u64 %rd3, %rd1, %rd2;\n"
2884 " cvt.s64.u32 %rd4, %r7;\n"
2885 " mov.f32 %f1, 0f00000000; // 0\n"
2886 "$Lt_25_15618:\n"
2887 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2888 " .loc 3 188 0\n"
/* sum += byte at i, then byte at i + 1; advance index and pointer by the step. */
2889 " ld.global.u8 %r8, [%rd3+0];\n"
2890 " cvt.rn.f32.u32 %f2, %r8;\n"
2891 " add.f32 %f3, %f2, %f1;\n"
2892 " .loc 3 191 0\n"
2893 " ld.global.u8 %r9, [%rd3+1];\n"
2894 " cvt.rn.f32.u32 %f4, %r9;\n"
2895 " add.f32 %f1, %f4, %f3;\n"
2896 " add.u32 %r5, %r7, %r5;\n"
2897 " add.u64 %rd3, %rd4, %rd3;\n"
2898 " .loc 3 181 0\n"
2899 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_1_true_n];\n"
2900 " .loc 3 191 0\n"
2901 " setp.lt.u32 %p2, %r5, %r6;\n"
2902 " @%p2 bra $Lt_25_15618;\n"
2903 " bra.uni $Lt_25_15106;\n"
2904 "$Lt_25_16642:\n"
2905 " mov.f32 %f1, 0f00000000; // 0\n"
2906 "$Lt_25_15106:\n"
2907 " .loc 3 71 0\n"
/* Publish the sum at __smem + 4 * tid, then barrier. */
2908 " mov.u64 %rd5, __smem;\n"
2909 " cvt.u64.u32 %rd6, %r3;\n"
2910 " mul.wide.u32 %rd7, %r3, 4;\n"
2911 " add.u64 %rd8, %rd5, %rd7;\n"
2912 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
2913 " .loc 3 72 0\n"
2914 " bar.sync 0;\n"
2915 " .loc 3 195 0\n"
/* Only thread 0 writes the block result: g_odata + 4 * %ctaid.x = __smem[0]. */
2916 " mov.u32 %r10, 0;\n"
2917 " setp.ne.u32 %p3, %r3, %r10;\n"
2918 " @%p3 bra $Lt_25_16130;\n"
2919 " .loc 3 199 0\n"
2920 " ld.shared.f32 %f5, [__smem+0];\n"
2921 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_1_true_g_odata];\n"
2922 " cvt.u64.u32 %rd10, %r1;\n"
2923 " mul.wide.u32 %rd11, %r1, 4;\n"
2924 " add.u64 %rd12, %rd9, %rd11;\n"
2925 " st.global.f32 [%rd12+0], %f5;\n"
2926 "$Lt_25_16130:\n"
2927 " .loc 3 503 0\n"
2928 " exit;\n"
2929 "$LDWend_reduce_uchar_1_true:\n"
2930 " } // reduce_uchar_1_true\n"
2931 "\n"
/*
 * PTX kernel reduce_uchar_2_true (the .loc directives refer to file 3,
 * reduce.cu).
 *
 * Parameters:
 *   g_idata  u64 address, read one byte at a time (ld.global.u8).
 *   g_odata  u64 address; one f32 is written at g_odata + 4 * %ctaid.x.
 *   n        u32 upper bound (exclusive) on the linear index i.
 *
 * Per thread: i = %ctaid.x * 4 + %tid.x. If i >= n the partial sum is 0.
 * Otherwise each iteration adds the bytes at g_idata + i and g_idata + i + 2
 * (converted to f32), then advances i by %nctaid.x * 4 while i < n.
 *
 * Per block: the sum is stored at __smem + 4 * tid. After bar.sync, threads
 * with tid <= 31 add the element 1 slot ahead. Thread 0 then copies __smem[0]
 * to g_odata.
 *
 * NOTE(review): the second load (offset +2) is not compared against n;
 *   presumably the host guarantees it is in range for this "_true" variant
 *   -- confirm.
 */
2932 " .entry reduce_uchar_2_true (\n"
2933 " .param .u64 __cudaparm_reduce_uchar_2_true_g_idata,\n"
2934 " .param .u64 __cudaparm_reduce_uchar_2_true_g_odata,\n"
2935 " .param .u32 __cudaparm_reduce_uchar_2_true_n)\n"
2936 " {\n"
2937 " .reg .u16 %rh<3>;\n"
2938 " .reg .u32 %r<13>;\n"
2939 " .reg .u64 %rd<14>;\n"
2940 " .reg .f32 %f<9>;\n"
2941 " .reg .pred %p<6>;\n"
2942 " .loc 3 505 0\n"
2943 "$LDWbegin_reduce_uchar_2_true:\n"
2944 " .loc 3 181 0\n"
/* i (%r5) = %ctaid.x * 4 + %tid.x; branch past the loop when i >= n. */
2945 " cvt.u32.u16 %r1, %ctaid.x;\n"
2946 " mul24.lo.u32 %r2, %r1, 4;\n"
2947 " cvt.u32.u16 %r3, %tid.x;\n"
2948 " add.u32 %r4, %r2, %r3;\n"
2949 " mov.s32 %r5, %r4;\n"
2950 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_2_true_n];\n"
2951 " setp.ge.u32 %p1, %r4, %r6;\n"
2952 " @%p1 bra $Lt_26_16898;\n"
/* %r7 = %nctaid.x * 4 is the index step; %rd3 = g_idata + i is the read pointer. */
2953 " mov.u16 %rh1, %nctaid.x;\n"
2954 " mul.wide.u16 %r7, %rh1, 4;\n"
2955 " cvt.u64.u32 %rd1, %r4;\n"
2956 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_2_true_g_idata];\n"
2957 " add.u64 %rd3, %rd1, %rd2;\n"
2958 " cvt.s64.u32 %rd4, %r7;\n"
2959 " mov.f32 %f1, 0f00000000; // 0\n"
2960 "$Lt_26_15362:\n"
2961 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
2962 " .loc 3 188 0\n"
/* sum += byte at i, then byte at i + 2; advance index and pointer by the step. */
2963 " ld.global.u8 %r8, [%rd3+0];\n"
2964 " cvt.rn.f32.u32 %f2, %r8;\n"
2965 " add.f32 %f3, %f2, %f1;\n"
2966 " .loc 3 191 0\n"
2967 " ld.global.u8 %r9, [%rd3+2];\n"
2968 " cvt.rn.f32.u32 %f4, %r9;\n"
2969 " add.f32 %f1, %f4, %f3;\n"
2970 " add.u32 %r5, %r7, %r5;\n"
2971 " add.u64 %rd3, %rd4, %rd3;\n"
2972 " .loc 3 181 0\n"
2973 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_2_true_n];\n"
2974 " .loc 3 191 0\n"
2975 " setp.lt.u32 %p2, %r5, %r6;\n"
2976 " @%p2 bra $Lt_26_15362;\n"
2977 " bra.uni $Lt_26_14850;\n"
2978 "$Lt_26_16898:\n"
2979 " mov.f32 %f1, 0f00000000; // 0\n"
2980 "$Lt_26_14850:\n"
2981 " .loc 3 71 0\n"
/* Publish the sum at __smem + 4 * tid, then barrier. */
2982 " mov.u64 %rd5, __smem;\n"
2983 " cvt.u64.u32 %rd6, %r3;\n"
2984 " mul.wide.u32 %rd7, %r3, 4;\n"
2985 " add.u64 %rd8, %rd5, %rd7;\n"
2986 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
2987 " .loc 3 72 0\n"
2988 " bar.sync 0;\n"
/* tid <= 31: add the element 1 slot ahead (byte offset 4). */
2989 " mov.u32 %r10, 31;\n"
2990 " setp.gt.u32 %p3, %r3, %r10;\n"
2991 " @%p3 bra $Lt_26_15874;\n"
2992 " .loc 3 88 0\n"
2993 " ld.volatile.shared.f32 %f5, [%rd8+4];\n"
2994 " add.f32 %f6, %f5, %f1;\n"
2995 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
2996 "$Lt_26_15874:\n"
2997 " .loc 3 195 0\n"
/* Only thread 0 writes the block result: g_odata + 4 * %ctaid.x = __smem[0]. */
2998 " mov.u32 %r11, 0;\n"
2999 " setp.ne.u32 %p4, %r3, %r11;\n"
3000 " @%p4 bra $Lt_26_16386;\n"
3001 " .loc 3 199 0\n"
3002 " ld.shared.f32 %f7, [__smem+0];\n"
3003 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_2_true_g_odata];\n"
3004 " cvt.u64.u32 %rd10, %r1;\n"
3005 " mul.wide.u32 %rd11, %r1, 4;\n"
3006 " add.u64 %rd12, %rd9, %rd11;\n"
3007 " st.global.f32 [%rd12+0], %f7;\n"
3008 "$Lt_26_16386:\n"
3009 " .loc 3 508 0\n"
3010 " exit;\n"
3011 "$LDWend_reduce_uchar_2_true:\n"
3012 " } // reduce_uchar_2_true\n"
3013 "\n"
/*
 * PTX kernel reduce_uchar_4_true (the .loc directives refer to file 3,
 * reduce.cu).
 *
 * Parameters:
 *   g_idata  u64 address, read one byte at a time (ld.global.u8).
 *   g_odata  u64 address; one f32 is written at g_odata + 4 * %ctaid.x.
 *   n        u32 upper bound (exclusive) on the linear index i.
 *
 * Per thread: i = %ctaid.x * 8 + %tid.x. If i >= n the partial sum is 0.
 * Otherwise each iteration adds the bytes at g_idata + i and g_idata + i + 4
 * (converted to f32), then advances i by %nctaid.x * 8 while i < n.
 *
 * Per block: the sum is stored at __smem + 4 * tid. After bar.sync, threads
 * with tid <= 31 add the elements 2 and 1 slots ahead, with no barrier in
 * between. Thread 0 then copies __smem[0] to g_odata.
 *
 * NOTE(review): the second load (offset +4) is not compared against n;
 *   presumably the host guarantees it is in range for this "_true" variant
 *   -- confirm.
 */
3014 " .entry reduce_uchar_4_true (\n"
3015 " .param .u64 __cudaparm_reduce_uchar_4_true_g_idata,\n"
3016 " .param .u64 __cudaparm_reduce_uchar_4_true_g_odata,\n"
3017 " .param .u32 __cudaparm_reduce_uchar_4_true_n)\n"
3018 " {\n"
3019 " .reg .u16 %rh<3>;\n"
3020 " .reg .u32 %r<13>;\n"
3021 " .reg .u64 %rd<14>;\n"
3022 " .reg .f32 %f<11>;\n"
3023 " .reg .pred %p<6>;\n"
3024 " .loc 3 510 0\n"
3025 "$LDWbegin_reduce_uchar_4_true:\n"
3026 " .loc 3 181 0\n"
/* i (%r5) = %ctaid.x * 8 + %tid.x; branch past the loop when i >= n. */
3027 " cvt.u32.u16 %r1, %ctaid.x;\n"
3028 " mul24.lo.u32 %r2, %r1, 8;\n"
3029 " cvt.u32.u16 %r3, %tid.x;\n"
3030 " add.u32 %r4, %r2, %r3;\n"
3031 " mov.s32 %r5, %r4;\n"
3032 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_4_true_n];\n"
3033 " setp.ge.u32 %p1, %r4, %r6;\n"
3034 " @%p1 bra $Lt_27_16642;\n"
/* %r7 = %nctaid.x * 8 is the index step; %rd3 = g_idata + i is the read pointer. */
3035 " mov.u16 %rh1, %nctaid.x;\n"
3036 " mul.wide.u16 %r7, %rh1, 8;\n"
3037 " cvt.u64.u32 %rd1, %r4;\n"
3038 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_4_true_g_idata];\n"
3039 " add.u64 %rd3, %rd1, %rd2;\n"
3040 " cvt.s64.u32 %rd4, %r7;\n"
3041 " mov.f32 %f1, 0f00000000; // 0\n"
3042 "$Lt_27_15106:\n"
3043 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3044 " .loc 3 188 0\n"
/* sum += byte at i, then byte at i + 4; advance index and pointer by the step. */
3045 " ld.global.u8 %r8, [%rd3+0];\n"
3046 " cvt.rn.f32.u32 %f2, %r8;\n"
3047 " add.f32 %f3, %f2, %f1;\n"
3048 " .loc 3 191 0\n"
3049 " ld.global.u8 %r9, [%rd3+4];\n"
3050 " cvt.rn.f32.u32 %f4, %r9;\n"
3051 " add.f32 %f1, %f4, %f3;\n"
3052 " add.u32 %r5, %r7, %r5;\n"
3053 " add.u64 %rd3, %rd4, %rd3;\n"
3054 " .loc 3 181 0\n"
3055 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_4_true_n];\n"
3056 " .loc 3 191 0\n"
3057 " setp.lt.u32 %p2, %r5, %r6;\n"
3058 " @%p2 bra $Lt_27_15106;\n"
3059 " bra.uni $Lt_27_14594;\n"
3060 "$Lt_27_16642:\n"
3061 " mov.f32 %f1, 0f00000000; // 0\n"
3062 "$Lt_27_14594:\n"
3063 " .loc 3 71 0\n"
/* Publish the sum at __smem + 4 * tid, then barrier. */
3064 " mov.u64 %rd5, __smem;\n"
3065 " cvt.u64.u32 %rd6, %r3;\n"
3066 " mul.wide.u32 %rd7, %r3, 4;\n"
3067 " add.u64 %rd8, %rd5, %rd7;\n"
3068 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3069 " .loc 3 72 0\n"
3070 " bar.sync 0;\n"
/* tid <= 31: add elements 2 and 1 slots ahead; no bar.sync between steps. */
3071 " mov.u32 %r10, 31;\n"
3072 " setp.gt.u32 %p3, %r3, %r10;\n"
3073 " @%p3 bra $Lt_27_15618;\n"
3074 " .loc 3 87 0\n"
3075 " ld.volatile.shared.f32 %f5, [%rd8+8];\n"
3076 " add.f32 %f6, %f5, %f1;\n"
3077 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3078 " .loc 3 88 0\n"
3079 " ld.volatile.shared.f32 %f7, [%rd8+4];\n"
3080 " add.f32 %f8, %f7, %f6;\n"
3081 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3082 "$Lt_27_15618:\n"
3083 " .loc 3 195 0\n"
/* Only thread 0 writes the block result: g_odata + 4 * %ctaid.x = __smem[0]. */
3084 " mov.u32 %r11, 0;\n"
3085 " setp.ne.u32 %p4, %r3, %r11;\n"
3086 " @%p4 bra $Lt_27_16130;\n"
3087 " .loc 3 199 0\n"
3088 " ld.shared.f32 %f9, [__smem+0];\n"
3089 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_4_true_g_odata];\n"
3090 " cvt.u64.u32 %rd10, %r1;\n"
3091 " mul.wide.u32 %rd11, %r1, 4;\n"
3092 " add.u64 %rd12, %rd9, %rd11;\n"
3093 " st.global.f32 [%rd12+0], %f9;\n"
3094 "$Lt_27_16130:\n"
3095 " .loc 3 513 0\n"
3096 " exit;\n"
3097 "$LDWend_reduce_uchar_4_true:\n"
3098 " } // reduce_uchar_4_true\n"
3099 "\n"
/*
 * PTX kernel reduce_uchar_8_true (the .loc directives refer to file 3,
 * reduce.cu).
 *
 * Parameters:
 *   g_idata  u64 address, read one byte at a time (ld.global.u8).
 *   g_odata  u64 address; one f32 is written at g_odata + 4 * %ctaid.x.
 *   n        u32 upper bound (exclusive) on the linear index i.
 *
 * Per thread: i = %ctaid.x * 16 + %tid.x. If i >= n the partial sum is 0.
 * Otherwise each iteration adds the bytes at g_idata + i and g_idata + i + 8
 * (converted to f32), then advances i by %nctaid.x * 16 while i < n.
 *
 * Per block: the sum is stored at __smem + 4 * tid. After bar.sync, threads
 * with tid <= 31 add the elements 4, 2 and 1 slots ahead, with no barrier in
 * between. Thread 0 then copies __smem[0] to g_odata.
 *
 * NOTE(review): the second load (offset +8) is not compared against n;
 *   presumably the host guarantees it is in range for this "_true" variant
 *   -- confirm.
 */
3100 " .entry reduce_uchar_8_true (\n"
3101 " .param .u64 __cudaparm_reduce_uchar_8_true_g_idata,\n"
3102 " .param .u64 __cudaparm_reduce_uchar_8_true_g_odata,\n"
3103 " .param .u32 __cudaparm_reduce_uchar_8_true_n)\n"
3104 " {\n"
3105 " .reg .u16 %rh<3>;\n"
3106 " .reg .u32 %r<13>;\n"
3107 " .reg .u64 %rd<14>;\n"
3108 " .reg .f32 %f<13>;\n"
3109 " .reg .pred %p<6>;\n"
3110 " .loc 3 515 0\n"
3111 "$LDWbegin_reduce_uchar_8_true:\n"
3112 " .loc 3 181 0\n"
/* i (%r5) = %ctaid.x * 16 + %tid.x; branch past the loop when i >= n. */
3113 " cvt.u32.u16 %r1, %ctaid.x;\n"
3114 " mul24.lo.u32 %r2, %r1, 16;\n"
3115 " cvt.u32.u16 %r3, %tid.x;\n"
3116 " add.u32 %r4, %r2, %r3;\n"
3117 " mov.s32 %r5, %r4;\n"
3118 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_8_true_n];\n"
3119 " setp.ge.u32 %p1, %r4, %r6;\n"
3120 " @%p1 bra $Lt_28_16386;\n"
/* %r7 = %nctaid.x * 16 is the index step; %rd3 = g_idata + i is the read pointer. */
3121 " mov.u16 %rh1, %nctaid.x;\n"
3122 " mul.wide.u16 %r7, %rh1, 16;\n"
3123 " cvt.u64.u32 %rd1, %r4;\n"
3124 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_8_true_g_idata];\n"
3125 " add.u64 %rd3, %rd1, %rd2;\n"
3126 " cvt.s64.u32 %rd4, %r7;\n"
3127 " mov.f32 %f1, 0f00000000; // 0\n"
3128 "$Lt_28_14850:\n"
3129 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3130 " .loc 3 188 0\n"
/* sum += byte at i, then byte at i + 8; advance index and pointer by the step. */
3131 " ld.global.u8 %r8, [%rd3+0];\n"
3132 " cvt.rn.f32.u32 %f2, %r8;\n"
3133 " add.f32 %f3, %f2, %f1;\n"
3134 " .loc 3 191 0\n"
3135 " ld.global.u8 %r9, [%rd3+8];\n"
3136 " cvt.rn.f32.u32 %f4, %r9;\n"
3137 " add.f32 %f1, %f4, %f3;\n"
3138 " add.u32 %r5, %r7, %r5;\n"
3139 " add.u64 %rd3, %rd4, %rd3;\n"
3140 " .loc 3 181 0\n"
3141 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_8_true_n];\n"
3142 " .loc 3 191 0\n"
3143 " setp.lt.u32 %p2, %r5, %r6;\n"
3144 " @%p2 bra $Lt_28_14850;\n"
3145 " bra.uni $Lt_28_14338;\n"
3146 "$Lt_28_16386:\n"
3147 " mov.f32 %f1, 0f00000000; // 0\n"
3148 "$Lt_28_14338:\n"
3149 " .loc 3 71 0\n"
/* Publish the sum at __smem + 4 * tid, then barrier. */
3150 " mov.u64 %rd5, __smem;\n"
3151 " cvt.u64.u32 %rd6, %r3;\n"
3152 " mul.wide.u32 %rd7, %r3, 4;\n"
3153 " add.u64 %rd8, %rd5, %rd7;\n"
3154 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3155 " .loc 3 72 0\n"
3156 " bar.sync 0;\n"
/* tid <= 31: add elements 4, 2, 1 slots ahead; no bar.sync between steps. */
3157 " mov.u32 %r10, 31;\n"
3158 " setp.gt.u32 %p3, %r3, %r10;\n"
3159 " @%p3 bra $Lt_28_15362;\n"
3160 " .loc 3 86 0\n"
3161 " ld.volatile.shared.f32 %f5, [%rd8+16];\n"
3162 " add.f32 %f6, %f5, %f1;\n"
3163 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3164 " .loc 3 87 0\n"
3165 " ld.volatile.shared.f32 %f7, [%rd8+8];\n"
3166 " add.f32 %f8, %f7, %f6;\n"
3167 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3168 " .loc 3 88 0\n"
3169 " ld.volatile.shared.f32 %f9, [%rd8+4];\n"
3170 " add.f32 %f10, %f9, %f8;\n"
3171 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3172 "$Lt_28_15362:\n"
3173 " .loc 3 195 0\n"
/* Only thread 0 writes the block result: g_odata + 4 * %ctaid.x = __smem[0]. */
3174 " mov.u32 %r11, 0;\n"
3175 " setp.ne.u32 %p4, %r3, %r11;\n"
3176 " @%p4 bra $Lt_28_15874;\n"
3177 " .loc 3 199 0\n"
3178 " ld.shared.f32 %f11, [__smem+0];\n"
3179 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_8_true_g_odata];\n"
3180 " cvt.u64.u32 %rd10, %r1;\n"
3181 " mul.wide.u32 %rd11, %r1, 4;\n"
3182 " add.u64 %rd12, %rd9, %rd11;\n"
3183 " st.global.f32 [%rd12+0], %f11;\n"
3184 "$Lt_28_15874:\n"
3185 " .loc 3 518 0\n"
3186 " exit;\n"
3187 "$LDWend_reduce_uchar_8_true:\n"
3188 " } // reduce_uchar_8_true\n"
3189 "\n"
/*
 * PTX kernel reduce_uchar_16_true (the .loc directives refer to file 3,
 * reduce.cu).
 *
 * Parameters:
 *   g_idata  u64 address, read one byte at a time (ld.global.u8).
 *   g_odata  u64 address; one f32 is written at g_odata + 4 * %ctaid.x.
 *   n        u32 upper bound (exclusive) on the linear index i.
 *
 * Per thread: i = %ctaid.x * 32 + %tid.x. If i >= n the partial sum is 0.
 * Otherwise each iteration adds the bytes at g_idata + i and g_idata + i + 16
 * (converted to f32), then advances i by %nctaid.x * 32 while i < n.
 *
 * Per block: the sum is stored at __smem + 4 * tid. After bar.sync, threads
 * with tid <= 31 add the elements 8, 4, 2 and 1 slots ahead, with no barrier
 * in between. Thread 0 then copies __smem[0] to g_odata.
 *
 * NOTE(review): the second load (offset +16) is not compared against n;
 *   presumably the host guarantees it is in range for this "_true" variant
 *   -- confirm.
 */
3190 " .entry reduce_uchar_16_true (\n"
3191 " .param .u64 __cudaparm_reduce_uchar_16_true_g_idata,\n"
3192 " .param .u64 __cudaparm_reduce_uchar_16_true_g_odata,\n"
3193 " .param .u32 __cudaparm_reduce_uchar_16_true_n)\n"
3194 " {\n"
3195 " .reg .u16 %rh<3>;\n"
3196 " .reg .u32 %r<13>;\n"
3197 " .reg .u64 %rd<14>;\n"
3198 " .reg .f32 %f<15>;\n"
3199 " .reg .pred %p<6>;\n"
3200 " .loc 3 520 0\n"
3201 "$LDWbegin_reduce_uchar_16_true:\n"
3202 " .loc 3 181 0\n"
/* i (%r5) = %ctaid.x * 32 + %tid.x; branch past the loop when i >= n. */
3203 " cvt.u32.u16 %r1, %ctaid.x;\n"
3204 " mul24.lo.u32 %r2, %r1, 32;\n"
3205 " cvt.u32.u16 %r3, %tid.x;\n"
3206 " add.u32 %r4, %r2, %r3;\n"
3207 " mov.s32 %r5, %r4;\n"
3208 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_16_true_n];\n"
3209 " setp.ge.u32 %p1, %r4, %r6;\n"
3210 " @%p1 bra $Lt_29_16130;\n"
/* %r7 = %nctaid.x * 32 is the index step; %rd3 = g_idata + i is the read pointer. */
3211 " mov.u16 %rh1, %nctaid.x;\n"
3212 " mul.wide.u16 %r7, %rh1, 32;\n"
3213 " cvt.u64.u32 %rd1, %r4;\n"
3214 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_16_true_g_idata];\n"
3215 " add.u64 %rd3, %rd1, %rd2;\n"
3216 " cvt.s64.u32 %rd4, %r7;\n"
3217 " mov.f32 %f1, 0f00000000; // 0\n"
3218 "$Lt_29_14594:\n"
3219 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3220 " .loc 3 188 0\n"
/* sum += byte at i, then byte at i + 16; advance index and pointer by the step. */
3221 " ld.global.u8 %r8, [%rd3+0];\n"
3222 " cvt.rn.f32.u32 %f2, %r8;\n"
3223 " add.f32 %f3, %f2, %f1;\n"
3224 " .loc 3 191 0\n"
3225 " ld.global.u8 %r9, [%rd3+16];\n"
3226 " cvt.rn.f32.u32 %f4, %r9;\n"
3227 " add.f32 %f1, %f4, %f3;\n"
3228 " add.u32 %r5, %r7, %r5;\n"
3229 " add.u64 %rd3, %rd4, %rd3;\n"
3230 " .loc 3 181 0\n"
3231 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_16_true_n];\n"
3232 " .loc 3 191 0\n"
3233 " setp.lt.u32 %p2, %r5, %r6;\n"
3234 " @%p2 bra $Lt_29_14594;\n"
3235 " bra.uni $Lt_29_14082;\n"
3236 "$Lt_29_16130:\n"
3237 " mov.f32 %f1, 0f00000000; // 0\n"
3238 "$Lt_29_14082:\n"
3239 " .loc 3 71 0\n"
/* Publish the sum at __smem + 4 * tid, then barrier. */
3240 " mov.u64 %rd5, __smem;\n"
3241 " cvt.u64.u32 %rd6, %r3;\n"
3242 " mul.wide.u32 %rd7, %r3, 4;\n"
3243 " add.u64 %rd8, %rd5, %rd7;\n"
3244 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3245 " .loc 3 72 0\n"
3246 " bar.sync 0;\n"
/* tid <= 31: add elements 8, 4, 2, 1 slots ahead; no bar.sync between steps. */
3247 " mov.u32 %r10, 31;\n"
3248 " setp.gt.u32 %p3, %r3, %r10;\n"
3249 " @%p3 bra $Lt_29_15106;\n"
3250 " .loc 3 85 0\n"
3251 " ld.volatile.shared.f32 %f5, [%rd8+32];\n"
3252 " add.f32 %f6, %f5, %f1;\n"
3253 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3254 " .loc 3 86 0\n"
3255 " ld.volatile.shared.f32 %f7, [%rd8+16];\n"
3256 " add.f32 %f8, %f7, %f6;\n"
3257 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3258 " .loc 3 87 0\n"
3259 " ld.volatile.shared.f32 %f9, [%rd8+8];\n"
3260 " add.f32 %f10, %f9, %f8;\n"
3261 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3262 " .loc 3 88 0\n"
3263 " ld.volatile.shared.f32 %f11, [%rd8+4];\n"
3264 " add.f32 %f12, %f11, %f10;\n"
3265 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3266 "$Lt_29_15106:\n"
3267 " .loc 3 195 0\n"
/* Only thread 0 writes the block result: g_odata + 4 * %ctaid.x = __smem[0]. */
3268 " mov.u32 %r11, 0;\n"
3269 " setp.ne.u32 %p4, %r3, %r11;\n"
3270 " @%p4 bra $Lt_29_15618;\n"
3271 " .loc 3 199 0\n"
3272 " ld.shared.f32 %f13, [__smem+0];\n"
3273 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_16_true_g_odata];\n"
3274 " cvt.u64.u32 %rd10, %r1;\n"
3275 " mul.wide.u32 %rd11, %r1, 4;\n"
3276 " add.u64 %rd12, %rd9, %rd11;\n"
3277 " st.global.f32 [%rd12+0], %f13;\n"
3278 "$Lt_29_15618:\n"
3279 " .loc 3 523 0\n"
3280 " exit;\n"
3281 "$LDWend_reduce_uchar_16_true:\n"
3282 " } // reduce_uchar_16_true\n"
3283 "\n"
/*
 * PTX kernel reduce_uchar_32_true (the .loc directives refer to file 3,
 * reduce.cu).
 *
 * Parameters:
 *   g_idata  u64 address, read one byte at a time (ld.global.u8).
 *   g_odata  u64 address; one f32 is written at g_odata + 4 * %ctaid.x.
 *   n        u32 upper bound (exclusive) on the linear index i.
 *
 * Per thread: i = %ctaid.x * 64 + %tid.x. If i >= n the partial sum is 0.
 * Otherwise each iteration adds the bytes at g_idata + i and g_idata + i + 32
 * (converted to f32), then advances i by %nctaid.x * 64 while i < n.
 *
 * Per block: the sum is stored at __smem + 4 * tid. After bar.sync, threads
 * with tid <= 31 add the elements 16, 8, 4, 2 and 1 slots ahead, with no
 * barrier in between. Thread 0 then copies __smem[0] to g_odata.
 *
 * NOTE(review): the second load (offset +32) is not compared against n;
 *   presumably the host guarantees it is in range for this "_true" variant
 *   -- confirm.
 */
3284 " .entry reduce_uchar_32_true (\n"
3285 " .param .u64 __cudaparm_reduce_uchar_32_true_g_idata,\n"
3286 " .param .u64 __cudaparm_reduce_uchar_32_true_g_odata,\n"
3287 " .param .u32 __cudaparm_reduce_uchar_32_true_n)\n"
3288 " {\n"
3289 " .reg .u16 %rh<3>;\n"
3290 " .reg .u32 %r<13>;\n"
3291 " .reg .u64 %rd<14>;\n"
3292 " .reg .f32 %f<17>;\n"
3293 " .reg .pred %p<6>;\n"
3294 " .loc 3 525 0\n"
3295 "$LDWbegin_reduce_uchar_32_true:\n"
3296 " .loc 3 181 0\n"
/* i (%r5) = %ctaid.x * 64 + %tid.x; branch past the loop when i >= n. */
3297 " cvt.u32.u16 %r1, %ctaid.x;\n"
3298 " mul24.lo.u32 %r2, %r1, 64;\n"
3299 " cvt.u32.u16 %r3, %tid.x;\n"
3300 " add.u32 %r4, %r2, %r3;\n"
3301 " mov.s32 %r5, %r4;\n"
3302 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_32_true_n];\n"
3303 " setp.ge.u32 %p1, %r4, %r6;\n"
3304 " @%p1 bra $Lt_30_15874;\n"
/* %r7 = %nctaid.x * 64 is the index step; %rd3 = g_idata + i is the read pointer. */
3305 " mov.u16 %rh1, %nctaid.x;\n"
3306 " mul.wide.u16 %r7, %rh1, 64;\n"
3307 " cvt.u64.u32 %rd1, %r4;\n"
3308 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_32_true_g_idata];\n"
3309 " add.u64 %rd3, %rd1, %rd2;\n"
3310 " cvt.s64.u32 %rd4, %r7;\n"
3311 " mov.f32 %f1, 0f00000000; // 0\n"
3312 "$Lt_30_14338:\n"
3313 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3314 " .loc 3 188 0\n"
/* sum += byte at i, then byte at i + 32; advance index and pointer by the step. */
3315 " ld.global.u8 %r8, [%rd3+0];\n"
3316 " cvt.rn.f32.u32 %f2, %r8;\n"
3317 " add.f32 %f3, %f2, %f1;\n"
3318 " .loc 3 191 0\n"
3319 " ld.global.u8 %r9, [%rd3+32];\n"
3320 " cvt.rn.f32.u32 %f4, %r9;\n"
3321 " add.f32 %f1, %f4, %f3;\n"
3322 " add.u32 %r5, %r7, %r5;\n"
3323 " add.u64 %rd3, %rd4, %rd3;\n"
3324 " .loc 3 181 0\n"
3325 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_32_true_n];\n"
3326 " .loc 3 191 0\n"
3327 " setp.lt.u32 %p2, %r5, %r6;\n"
3328 " @%p2 bra $Lt_30_14338;\n"
3329 " bra.uni $Lt_30_13826;\n"
3330 "$Lt_30_15874:\n"
3331 " mov.f32 %f1, 0f00000000; // 0\n"
3332 "$Lt_30_13826:\n"
3333 " .loc 3 71 0\n"
/* Publish the sum at __smem + 4 * tid, then barrier. */
3334 " mov.u64 %rd5, __smem;\n"
3335 " cvt.u64.u32 %rd6, %r3;\n"
3336 " mul.wide.u32 %rd7, %r3, 4;\n"
3337 " add.u64 %rd8, %rd5, %rd7;\n"
3338 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3339 " .loc 3 72 0\n"
3340 " bar.sync 0;\n"
/* tid <= 31: add elements 16, 8, 4, 2, 1 slots ahead; no bar.sync between steps. */
3341 " mov.u32 %r10, 31;\n"
3342 " setp.gt.u32 %p3, %r3, %r10;\n"
3343 " @%p3 bra $Lt_30_14850;\n"
3344 " .loc 3 84 0\n"
3345 " ld.volatile.shared.f32 %f5, [%rd8+64];\n"
3346 " add.f32 %f6, %f5, %f1;\n"
3347 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3348 " .loc 3 85 0\n"
3349 " ld.volatile.shared.f32 %f7, [%rd8+32];\n"
3350 " add.f32 %f8, %f7, %f6;\n"
3351 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3352 " .loc 3 86 0\n"
3353 " ld.volatile.shared.f32 %f9, [%rd8+16];\n"
3354 " add.f32 %f10, %f9, %f8;\n"
3355 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3356 " .loc 3 87 0\n"
3357 " ld.volatile.shared.f32 %f11, [%rd8+8];\n"
3358 " add.f32 %f12, %f11, %f10;\n"
3359 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3360 " .loc 3 88 0\n"
3361 " ld.volatile.shared.f32 %f13, [%rd8+4];\n"
3362 " add.f32 %f14, %f13, %f12;\n"
3363 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
3364 "$Lt_30_14850:\n"
3365 " .loc 3 195 0\n"
/* Only thread 0 writes the block result: g_odata + 4 * %ctaid.x = __smem[0]. */
3366 " mov.u32 %r11, 0;\n"
3367 " setp.ne.u32 %p4, %r3, %r11;\n"
3368 " @%p4 bra $Lt_30_15362;\n"
3369 " .loc 3 199 0\n"
3370 " ld.shared.f32 %f15, [__smem+0];\n"
3371 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_32_true_g_odata];\n"
3372 " cvt.u64.u32 %rd10, %r1;\n"
3373 " mul.wide.u32 %rd11, %r1, 4;\n"
3374 " add.u64 %rd12, %rd9, %rd11;\n"
3375 " st.global.f32 [%rd12+0], %f15;\n"
3376 "$Lt_30_15362:\n"
3377 " .loc 3 528 0\n"
3378 " exit;\n"
3379 "$LDWend_reduce_uchar_32_true:\n"
3380 " } // reduce_uchar_32_true\n"
3381 "\n"
/*
 * PTX kernel reduce_uchar_64_true (the .loc directives refer to file 3,
 * reduce.cu).
 *
 * Parameters:
 *   g_idata  u64 address, read one byte at a time (ld.global.u8).
 *   g_odata  u64 address; one f32 is written at g_odata + 4 * %ctaid.x.
 *   n        u32 upper bound (exclusive) on the linear index i.
 *
 * Per thread: i = %ctaid.x * 128 + %tid.x. If i >= n the partial sum is 0.
 * Otherwise each iteration adds the bytes at g_idata + i and g_idata + i + 64
 * (converted to f32), then advances i by %nctaid.x * 128 while i < n.
 *
 * Per block: the sum is stored at __smem + 4 * tid. After bar.sync, threads
 * with tid <= 31 add the elements 32, 16, 8, 4, 2 and 1 slots ahead, with no
 * barrier in between. Thread 0 then copies __smem[0] to g_odata.
 *
 * NOTE(review): the second load (offset +64) is not compared against n;
 *   presumably the host guarantees it is in range for this "_true" variant
 *   -- confirm.
 */
3382 " .entry reduce_uchar_64_true (\n"
3383 " .param .u64 __cudaparm_reduce_uchar_64_true_g_idata,\n"
3384 " .param .u64 __cudaparm_reduce_uchar_64_true_g_odata,\n"
3385 " .param .u32 __cudaparm_reduce_uchar_64_true_n)\n"
3386 " {\n"
3387 " .reg .u16 %rh<3>;\n"
3388 " .reg .u32 %r<13>;\n"
3389 " .reg .u64 %rd<14>;\n"
3390 " .reg .f32 %f<19>;\n"
3391 " .reg .pred %p<6>;\n"
3392 " .loc 3 530 0\n"
3393 "$LDWbegin_reduce_uchar_64_true:\n"
3394 " .loc 3 181 0\n"
/* i (%r5) = %ctaid.x * 128 + %tid.x; branch past the loop when i >= n. */
3395 " cvt.u32.u16 %r1, %ctaid.x;\n"
3396 " mul24.lo.u32 %r2, %r1, 128;\n"
3397 " cvt.u32.u16 %r3, %tid.x;\n"
3398 " add.u32 %r4, %r2, %r3;\n"
3399 " mov.s32 %r5, %r4;\n"
3400 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_64_true_n];\n"
3401 " setp.ge.u32 %p1, %r4, %r6;\n"
3402 " @%p1 bra $Lt_31_15618;\n"
/* %r7 = %nctaid.x * 128 is the index step; %rd3 = g_idata + i is the read pointer. */
3403 " mov.u16 %rh1, %nctaid.x;\n"
3404 " mul.wide.u16 %r7, %rh1, 128;\n"
3405 " cvt.u64.u32 %rd1, %r4;\n"
3406 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_64_true_g_idata];\n"
3407 " add.u64 %rd3, %rd1, %rd2;\n"
3408 " cvt.s64.u32 %rd4, %r7;\n"
3409 " mov.f32 %f1, 0f00000000; // 0\n"
3410 "$Lt_31_14082:\n"
3411 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3412 " .loc 3 188 0\n"
/* sum += byte at i, then byte at i + 64; advance index and pointer by the step. */
3413 " ld.global.u8 %r8, [%rd3+0];\n"
3414 " cvt.rn.f32.u32 %f2, %r8;\n"
3415 " add.f32 %f3, %f2, %f1;\n"
3416 " .loc 3 191 0\n"
3417 " ld.global.u8 %r9, [%rd3+64];\n"
3418 " cvt.rn.f32.u32 %f4, %r9;\n"
3419 " add.f32 %f1, %f4, %f3;\n"
3420 " add.u32 %r5, %r7, %r5;\n"
3421 " add.u64 %rd3, %rd4, %rd3;\n"
3422 " .loc 3 181 0\n"
3423 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_64_true_n];\n"
3424 " .loc 3 191 0\n"
3425 " setp.lt.u32 %p2, %r5, %r6;\n"
3426 " @%p2 bra $Lt_31_14082;\n"
3427 " bra.uni $Lt_31_13570;\n"
3428 "$Lt_31_15618:\n"
3429 " mov.f32 %f1, 0f00000000; // 0\n"
3430 "$Lt_31_13570:\n"
3431 " .loc 3 71 0\n"
/* Publish the sum at __smem + 4 * tid, then barrier. */
3432 " mov.u64 %rd5, __smem;\n"
3433 " cvt.u64.u32 %rd6, %r3;\n"
3434 " mul.wide.u32 %rd7, %r3, 4;\n"
3435 " add.u64 %rd8, %rd5, %rd7;\n"
3436 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3437 " .loc 3 72 0\n"
3438 " bar.sync 0;\n"
/* tid <= 31: add elements 32, 16, 8, 4, 2, 1 slots ahead; no bar.sync between steps. */
3439 " mov.u32 %r10, 31;\n"
3440 " setp.gt.u32 %p3, %r3, %r10;\n"
3441 " @%p3 bra $Lt_31_14594;\n"
3442 " .loc 3 83 0\n"
3443 " ld.volatile.shared.f32 %f5, [%rd8+128];\n"
3444 " add.f32 %f6, %f5, %f1;\n"
3445 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3446 " .loc 3 84 0\n"
3447 " ld.volatile.shared.f32 %f7, [%rd8+64];\n"
3448 " add.f32 %f8, %f7, %f6;\n"
3449 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
3450 " .loc 3 85 0\n"
3451 " ld.volatile.shared.f32 %f9, [%rd8+32];\n"
3452 " add.f32 %f10, %f9, %f8;\n"
3453 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3454 " .loc 3 86 0\n"
3455 " ld.volatile.shared.f32 %f11, [%rd8+16];\n"
3456 " add.f32 %f12, %f11, %f10;\n"
3457 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3458 " .loc 3 87 0\n"
3459 " ld.volatile.shared.f32 %f13, [%rd8+8];\n"
3460 " add.f32 %f14, %f13, %f12;\n"
3461 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
3462 " .loc 3 88 0\n"
3463 " ld.volatile.shared.f32 %f15, [%rd8+4];\n"
3464 " add.f32 %f16, %f15, %f14;\n"
3465 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
3466 "$Lt_31_14594:\n"
3467 " .loc 3 195 0\n"
/* Only thread 0 writes the block result: g_odata + 4 * %ctaid.x = __smem[0]. */
3468 " mov.u32 %r11, 0;\n"
3469 " setp.ne.u32 %p4, %r3, %r11;\n"
3470 " @%p4 bra $Lt_31_15106;\n"
3471 " .loc 3 199 0\n"
3472 " ld.shared.f32 %f17, [__smem+0];\n"
3473 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_64_true_g_odata];\n"
3474 " cvt.u64.u32 %rd10, %r1;\n"
3475 " mul.wide.u32 %rd11, %r1, 4;\n"
3476 " add.u64 %rd12, %rd9, %rd11;\n"
3477 " st.global.f32 [%rd12+0], %f17;\n"
3478 "$Lt_31_15106:\n"
3479 " .loc 3 533 0\n"
3480 " exit;\n"
3481 "$LDWend_reduce_uchar_64_true:\n"
3482 " } // reduce_uchar_64_true\n"
3483 "\n"
/*
 * PTX kernel reduce_uchar_128_true (the .loc directives refer to file 3,
 * reduce.cu).
 *
 * Parameters:
 *   g_idata  u64 address, read one byte at a time (ld.global.u8).
 *   g_odata  u64 address; one f32 is written at g_odata + 4 * %ctaid.x.
 *   n        u32 upper bound (exclusive) on the linear index i.
 *
 * Per thread: i = %ctaid.x * 256 + %tid.x. If i >= n the partial sum is 0.
 * Otherwise each iteration adds the bytes at g_idata + i and g_idata + i + 128
 * (converted to f32), then advances i by %nctaid.x * 256 while i < n.
 *
 * Per block: the sum is stored at __smem + 4 * tid. After bar.sync, threads
 * with tid <= 63 add the element 64 slots ahead. After a second bar.sync,
 * threads with tid <= 31 add the elements 32, 16, 8, 4, 2 and 1 slots ahead,
 * with no barrier in between. Thread 0 then copies __smem[0] to g_odata.
 *
 * NOTE(review): the second load (offset +128) is not compared against n;
 *   presumably the host guarantees it is in range for this "_true" variant
 *   -- confirm.
 * NOTE(review): the barrier-free tail presumably relies on threads 0..31
 *   running in lockstep together with the volatile accesses -- confirm for the
 *   target hardware.
 */
3484 " .entry reduce_uchar_128_true (\n"
3485 " .param .u64 __cudaparm_reduce_uchar_128_true_g_idata,\n"
3486 " .param .u64 __cudaparm_reduce_uchar_128_true_g_odata,\n"
3487 " .param .u32 __cudaparm_reduce_uchar_128_true_n)\n"
3488 " {\n"
3489 " .reg .u16 %rh<3>;\n"
3490 " .reg .u32 %r<14>;\n"
3491 " .reg .u64 %rd<14>;\n"
3492 " .reg .f32 %f<21>;\n"
3493 " .reg .pred %p<7>;\n"
3494 " .loc 3 535 0\n"
3495 "$LDWbegin_reduce_uchar_128_true:\n"
3496 " .loc 3 181 0\n"
/* i (%r5) = %ctaid.x * 256 + %tid.x; branch past the loop when i >= n. */
3497 " cvt.u32.u16 %r1, %ctaid.x;\n"
3498 " mul.lo.u32 %r2, %r1, 256;\n"
3499 " cvt.u32.u16 %r3, %tid.x;\n"
3500 " add.u32 %r4, %r2, %r3;\n"
3501 " mov.s32 %r5, %r4;\n"
3502 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_128_true_n];\n"
3503 " setp.ge.u32 %p1, %r4, %r6;\n"
3504 " @%p1 bra $Lt_32_15874;\n"
/* %r7 = %nctaid.x * 256 is the index step; %rd3 = g_idata + i is the read pointer. */
3505 " mov.u16 %rh1, %nctaid.x;\n"
3506 " mul.wide.u16 %r7, %rh1, 256;\n"
3507 " cvt.u64.u32 %rd1, %r4;\n"
3508 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_128_true_g_idata];\n"
3509 " add.u64 %rd3, %rd1, %rd2;\n"
3510 " cvt.s64.u32 %rd4, %r7;\n"
3511 " mov.f32 %f1, 0f00000000; // 0\n"
3512 "$Lt_32_13826:\n"
3513 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3514 " .loc 3 188 0\n"
/* sum += byte at i, then byte at i + 128; advance index and pointer by the step. */
3515 " ld.global.u8 %r8, [%rd3+0];\n"
3516 " cvt.rn.f32.u32 %f2, %r8;\n"
3517 " add.f32 %f3, %f2, %f1;\n"
3518 " .loc 3 191 0\n"
3519 " ld.global.u8 %r9, [%rd3+128];\n"
3520 " cvt.rn.f32.u32 %f4, %r9;\n"
3521 " add.f32 %f1, %f4, %f3;\n"
3522 " add.u32 %r5, %r7, %r5;\n"
3523 " add.u64 %rd3, %rd4, %rd3;\n"
3524 " .loc 3 181 0\n"
3525 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_128_true_n];\n"
3526 " .loc 3 191 0\n"
3527 " setp.lt.u32 %p2, %r5, %r6;\n"
3528 " @%p2 bra $Lt_32_13826;\n"
3529 " bra.uni $Lt_32_13314;\n"
3530 "$Lt_32_15874:\n"
3531 " mov.f32 %f1, 0f00000000; // 0\n"
3532 "$Lt_32_13314:\n"
3533 " .loc 3 195 0\n"
3534 " mov.f32 %f5, %f1;\n"
3535 " mov.f32 %f6, %f5;\n"
3536 " .loc 3 71 0\n"
/* Publish the sum at __smem + 4 * tid, then barrier. */
3537 " mov.u64 %rd5, __smem;\n"
3538 " cvt.u64.u32 %rd6, %r3;\n"
3539 " mul.wide.u32 %rd7, %r3, 4;\n"
3540 " add.u64 %rd8, %rd5, %rd7;\n"
3541 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
3542 " .loc 3 72 0\n"
3543 " bar.sync 0;\n"
/* tid <= 63: add the element 64 slots ahead (byte offset 256). */
3544 " mov.u32 %r10, 63;\n"
3545 " setp.gt.u32 %p3, %r3, %r10;\n"
3546 " @%p3 bra $Lt_32_14338;\n"
3547 " .loc 3 77 0\n"
3548 " ld.volatile.shared.f32 %f7, [%rd8+256];\n"
3549 " add.f32 %f6, %f7, %f5;\n"
3550 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3551 "$Lt_32_14338:\n"
3552 " bar.sync 0;\n"
/* tid <= 31: add elements 32, 16, 8, 4, 2, 1 slots ahead; no bar.sync between steps. */
3553 " mov.u32 %r11, 31;\n"
3554 " setp.gt.u32 %p4, %r3, %r11;\n"
3555 " @%p4 bra $Lt_32_14850;\n"
3556 " .loc 3 83 0\n"
3557 " ld.volatile.shared.f32 %f8, [%rd8+128];\n"
3558 " add.f32 %f9, %f8, %f6;\n"
3559 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
3560 " .loc 3 84 0\n"
3561 " ld.volatile.shared.f32 %f10, [%rd8+64];\n"
3562 " add.f32 %f11, %f10, %f9;\n"
3563 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
3564 " .loc 3 85 0\n"
3565 " ld.volatile.shared.f32 %f12, [%rd8+32];\n"
3566 " add.f32 %f13, %f12, %f11;\n"
3567 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
3568 " .loc 3 86 0\n"
3569 " ld.volatile.shared.f32 %f14, [%rd8+16];\n"
3570 " add.f32 %f15, %f14, %f13;\n"
3571 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
3572 " .loc 3 87 0\n"
3573 " ld.volatile.shared.f32 %f16, [%rd8+8];\n"
3574 " add.f32 %f17, %f16, %f15;\n"
3575 " st.volatile.shared.f32 [%rd8+0], %f17;\n"
3576 " .loc 3 88 0\n"
3577 " ld.volatile.shared.f32 %f18, [%rd8+4];\n"
3578 " add.f32 %f6, %f18, %f17;\n"
3579 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3580 "$Lt_32_14850:\n"
3581 " .loc 3 195 0\n"
/* Only thread 0 writes the block result: g_odata + 4 * %ctaid.x = __smem[0]. */
3582 " mov.u32 %r12, 0;\n"
3583 " setp.ne.u32 %p5, %r3, %r12;\n"
3584 " @%p5 bra $Lt_32_15362;\n"
3585 " .loc 3 199 0\n"
3586 " ld.shared.f32 %f19, [__smem+0];\n"
3587 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_128_true_g_odata];\n"
3588 " cvt.u64.u32 %rd10, %r1;\n"
3589 " mul.wide.u32 %rd11, %r1, 4;\n"
3590 " add.u64 %rd12, %rd9, %rd11;\n"
3591 " st.global.f32 [%rd12+0], %f19;\n"
3592 "$Lt_32_15362:\n"
3593 " .loc 3 538 0\n"
3594 " exit;\n"
3595 "$LDWend_reduce_uchar_128_true:\n"
3596 " } // reduce_uchar_128_true\n"
3597 "\n"
3598 " .entry reduce_uchar_256_true (\n"
3599 " .param .u64 __cudaparm_reduce_uchar_256_true_g_idata,\n"
3600 " .param .u64 __cudaparm_reduce_uchar_256_true_g_odata,\n"
3601 " .param .u32 __cudaparm_reduce_uchar_256_true_n)\n"
3602 " {\n"
3603 " .reg .u16 %rh<3>;\n"
3604 " .reg .u32 %r<15>;\n"
3605 " .reg .u64 %rd<14>;\n"
3606 " .reg .f32 %f<22>;\n"
3607 " .reg .pred %p<8>;\n"
3608 " .loc 3 540 0\n"
3609 "$LDWbegin_reduce_uchar_256_true:\n"
3610 " .loc 3 181 0\n"
3611 " cvt.u32.u16 %r1, %ctaid.x;\n"
3612 " mul.lo.u32 %r2, %r1, 512;\n"
3613 " cvt.u32.u16 %r3, %tid.x;\n"
3614 " add.u32 %r4, %r2, %r3;\n"
3615 " mov.s32 %r5, %r4;\n"
3616 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_256_true_n];\n"
3617 " setp.ge.u32 %p1, %r4, %r6;\n"
3618 " @%p1 bra $Lt_33_16130;\n"
3619 " mov.u16 %rh1, %nctaid.x;\n"
3620 " mul.wide.u16 %r7, %rh1, 512;\n"
3621 " cvt.u64.u32 %rd1, %r4;\n"
3622 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_256_true_g_idata];\n"
3623 " add.u64 %rd3, %rd1, %rd2;\n"
3624 " cvt.s64.u32 %rd4, %r7;\n"
3625 " mov.f32 %f1, 0f00000000; // 0\n"
3626 "$Lt_33_13570:\n"
3627 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3628 " .loc 3 188 0\n"
3629 " ld.global.u8 %r8, [%rd3+0];\n"
3630 " cvt.rn.f32.u32 %f2, %r8;\n"
3631 " add.f32 %f3, %f2, %f1;\n"
3632 " .loc 3 191 0\n"
3633 " ld.global.u8 %r9, [%rd3+256];\n"
3634 " cvt.rn.f32.u32 %f4, %r9;\n"
3635 " add.f32 %f1, %f4, %f3;\n"
3636 " add.u32 %r5, %r7, %r5;\n"
3637 " add.u64 %rd3, %rd4, %rd3;\n"
3638 " .loc 3 181 0\n"
3639 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_256_true_n];\n"
3640 " .loc 3 191 0\n"
3641 " setp.lt.u32 %p2, %r5, %r6;\n"
3642 " @%p2 bra $Lt_33_13570;\n"
3643 " bra.uni $Lt_33_13058;\n"
3644 "$Lt_33_16130:\n"
3645 " mov.f32 %f1, 0f00000000; // 0\n"
3646 "$Lt_33_13058:\n"
3647 " .loc 3 195 0\n"
3648 " mov.f32 %f5, %f1;\n"
3649 " mov.f32 %f6, %f5;\n"
3650 " .loc 3 71 0\n"
3651 " mov.u64 %rd5, __smem;\n"
3652 " cvt.u64.u32 %rd6, %r3;\n"
3653 " mul.wide.u32 %rd7, %r3, 4;\n"
3654 " add.u64 %rd8, %rd5, %rd7;\n"
3655 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
3656 " .loc 3 72 0\n"
3657 " bar.sync 0;\n"
3658 " mov.u32 %r10, 127;\n"
3659 " setp.gt.u32 %p3, %r3, %r10;\n"
3660 " @%p3 bra $Lt_33_14082;\n"
3661 " .loc 3 76 0\n"
3662 " ld.volatile.shared.f32 %f7, [%rd8+512];\n"
3663 " add.f32 %f6, %f7, %f5;\n"
3664 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3665 "$Lt_33_14082:\n"
3666 " bar.sync 0;\n"
3667 " mov.u32 %r11, 63;\n"
3668 " setp.gt.u32 %p4, %r3, %r11;\n"
3669 " @%p4 bra $Lt_33_14594;\n"
3670 " .loc 3 77 0\n"
3671 " ld.volatile.shared.f32 %f8, [%rd8+256];\n"
3672 " add.f32 %f6, %f8, %f6;\n"
3673 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3674 "$Lt_33_14594:\n"
3675 " bar.sync 0;\n"
3676 " mov.u32 %r12, 31;\n"
3677 " setp.gt.u32 %p5, %r3, %r12;\n"
3678 " @%p5 bra $Lt_33_15106;\n"
3679 " .loc 3 83 0\n"
3680 " ld.volatile.shared.f32 %f9, [%rd8+128];\n"
3681 " add.f32 %f10, %f9, %f6;\n"
3682 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
3683 " .loc 3 84 0\n"
3684 " ld.volatile.shared.f32 %f11, [%rd8+64];\n"
3685 " add.f32 %f12, %f11, %f10;\n"
3686 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
3687 " .loc 3 85 0\n"
3688 " ld.volatile.shared.f32 %f13, [%rd8+32];\n"
3689 " add.f32 %f14, %f13, %f12;\n"
3690 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
3691 " .loc 3 86 0\n"
3692 " ld.volatile.shared.f32 %f15, [%rd8+16];\n"
3693 " add.f32 %f16, %f15, %f14;\n"
3694 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
3695 " .loc 3 87 0\n"
3696 " ld.volatile.shared.f32 %f17, [%rd8+8];\n"
3697 " add.f32 %f18, %f17, %f16;\n"
3698 " st.volatile.shared.f32 [%rd8+0], %f18;\n"
3699 " .loc 3 88 0\n"
3700 " ld.volatile.shared.f32 %f19, [%rd8+4];\n"
3701 " add.f32 %f6, %f19, %f18;\n"
3702 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3703 "$Lt_33_15106:\n"
3704 " .loc 3 195 0\n"
3705 " mov.u32 %r13, 0;\n"
3706 " setp.ne.u32 %p6, %r3, %r13;\n"
3707 " @%p6 bra $Lt_33_15618;\n"
3708 " .loc 3 199 0\n"
3709 " ld.shared.f32 %f20, [__smem+0];\n"
3710 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_256_true_g_odata];\n"
3711 " cvt.u64.u32 %rd10, %r1;\n"
3712 " mul.wide.u32 %rd11, %r1, 4;\n"
3713 " add.u64 %rd12, %rd9, %rd11;\n"
3714 " st.global.f32 [%rd12+0], %f20;\n"
3715 "$Lt_33_15618:\n"
3716 " .loc 3 543 0\n"
3717 " exit;\n"
3718 "$LDWend_reduce_uchar_256_true:\n"
3719 " } // reduce_uchar_256_true\n"
3720 "\n"
3721 " .entry reduce_uchar_512_true (\n"
3722 " .param .u64 __cudaparm_reduce_uchar_512_true_g_idata,\n"
3723 " .param .u64 __cudaparm_reduce_uchar_512_true_g_odata,\n"
3724 " .param .u32 __cudaparm_reduce_uchar_512_true_n)\n"
3725 " {\n"
3726 " .reg .u16 %rh<3>;\n"
3727 " .reg .u32 %r<16>;\n"
3728 " .reg .u64 %rd<14>;\n"
3729 " .reg .f32 %f<23>;\n"
3730 " .reg .pred %p<9>;\n"
3731 " .loc 3 545 0\n"
3732 "$LDWbegin_reduce_uchar_512_true:\n"
3733 " .loc 3 181 0\n"
3734 " cvt.u32.u16 %r1, %ctaid.x;\n"
3735 " mul.lo.u32 %r2, %r1, 1024;\n"
3736 " cvt.u32.u16 %r3, %tid.x;\n"
3737 " add.u32 %r4, %r2, %r3;\n"
3738 " mov.s32 %r5, %r4;\n"
3739 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_512_true_n];\n"
3740 " setp.ge.u32 %p1, %r4, %r6;\n"
3741 " @%p1 bra $Lt_34_16386;\n"
3742 " mov.u16 %rh1, %nctaid.x;\n"
3743 " mul.wide.u16 %r7, %rh1, 1024;\n"
3744 " cvt.u64.u32 %rd1, %r4;\n"
3745 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_512_true_g_idata];\n"
3746 " add.u64 %rd3, %rd1, %rd2;\n"
3747 " cvt.s64.u32 %rd4, %r7;\n"
3748 " mov.f32 %f1, 0f00000000; // 0\n"
3749 "$Lt_34_13314:\n"
3750 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3751 " .loc 3 188 0\n"
3752 " ld.global.u8 %r8, [%rd3+0];\n"
3753 " cvt.rn.f32.u32 %f2, %r8;\n"
3754 " add.f32 %f3, %f2, %f1;\n"
3755 " .loc 3 191 0\n"
3756 " ld.global.u8 %r9, [%rd3+512];\n"
3757 " cvt.rn.f32.u32 %f4, %r9;\n"
3758 " add.f32 %f1, %f4, %f3;\n"
3759 " add.u32 %r5, %r7, %r5;\n"
3760 " add.u64 %rd3, %rd4, %rd3;\n"
3761 " .loc 3 181 0\n"
3762 " ld.param.u32 %r6, [__cudaparm_reduce_uchar_512_true_n];\n"
3763 " .loc 3 191 0\n"
3764 " setp.lt.u32 %p2, %r5, %r6;\n"
3765 " @%p2 bra $Lt_34_13314;\n"
3766 " bra.uni $Lt_34_12802;\n"
3767 "$Lt_34_16386:\n"
3768 " mov.f32 %f1, 0f00000000; // 0\n"
3769 "$Lt_34_12802:\n"
3770 " .loc 3 195 0\n"
3771 " mov.f32 %f5, %f1;\n"
3772 " mov.f32 %f6, %f5;\n"
3773 " .loc 3 71 0\n"
3774 " mov.u64 %rd5, __smem;\n"
3775 " cvt.u64.u32 %rd6, %r3;\n"
3776 " mul.wide.u32 %rd7, %r3, 4;\n"
3777 " add.u64 %rd8, %rd5, %rd7;\n"
3778 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
3779 " .loc 3 72 0\n"
3780 " bar.sync 0;\n"
3781 " mov.u32 %r10, 255;\n"
3782 " setp.gt.u32 %p3, %r3, %r10;\n"
3783 " @%p3 bra $Lt_34_13826;\n"
3784 " .loc 3 75 0\n"
3785 " ld.volatile.shared.f32 %f7, [%rd8+1024];\n"
3786 " add.f32 %f6, %f7, %f5;\n"
3787 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3788 "$Lt_34_13826:\n"
3789 " bar.sync 0;\n"
3790 " mov.u32 %r11, 127;\n"
3791 " setp.gt.u32 %p4, %r3, %r11;\n"
3792 " @%p4 bra $Lt_34_14338;\n"
3793 " .loc 3 76 0\n"
3794 " ld.volatile.shared.f32 %f8, [%rd8+512];\n"
3795 " add.f32 %f6, %f8, %f6;\n"
3796 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3797 "$Lt_34_14338:\n"
3798 " bar.sync 0;\n"
3799 " mov.u32 %r12, 63;\n"
3800 " setp.gt.u32 %p5, %r3, %r12;\n"
3801 " @%p5 bra $Lt_34_14850;\n"
3802 " .loc 3 77 0\n"
3803 " ld.volatile.shared.f32 %f9, [%rd8+256];\n"
3804 " add.f32 %f6, %f9, %f6;\n"
3805 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3806 "$Lt_34_14850:\n"
3807 " bar.sync 0;\n"
3808 " mov.u32 %r13, 31;\n"
3809 " setp.gt.u32 %p6, %r3, %r13;\n"
3810 " @%p6 bra $Lt_34_15362;\n"
3811 " .loc 3 83 0\n"
3812 " ld.volatile.shared.f32 %f10, [%rd8+128];\n"
3813 " add.f32 %f11, %f10, %f6;\n"
3814 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
3815 " .loc 3 84 0\n"
3816 " ld.volatile.shared.f32 %f12, [%rd8+64];\n"
3817 " add.f32 %f13, %f12, %f11;\n"
3818 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
3819 " .loc 3 85 0\n"
3820 " ld.volatile.shared.f32 %f14, [%rd8+32];\n"
3821 " add.f32 %f15, %f14, %f13;\n"
3822 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
3823 " .loc 3 86 0\n"
3824 " ld.volatile.shared.f32 %f16, [%rd8+16];\n"
3825 " add.f32 %f17, %f16, %f15;\n"
3826 " st.volatile.shared.f32 [%rd8+0], %f17;\n"
3827 " .loc 3 87 0\n"
3828 " ld.volatile.shared.f32 %f18, [%rd8+8];\n"
3829 " add.f32 %f19, %f18, %f17;\n"
3830 " st.volatile.shared.f32 [%rd8+0], %f19;\n"
3831 " .loc 3 88 0\n"
3832 " ld.volatile.shared.f32 %f20, [%rd8+4];\n"
3833 " add.f32 %f6, %f20, %f19;\n"
3834 " st.volatile.shared.f32 [%rd8+0], %f6;\n"
3835 "$Lt_34_15362:\n"
3836 " .loc 3 195 0\n"
3837 " mov.u32 %r14, 0;\n"
3838 " setp.ne.u32 %p7, %r3, %r14;\n"
3839 " @%p7 bra $Lt_34_15874;\n"
3840 " .loc 3 199 0\n"
3841 " ld.shared.f32 %f21, [__smem+0];\n"
3842 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_512_true_g_odata];\n"
3843 " cvt.u64.u32 %rd10, %r1;\n"
3844 " mul.wide.u32 %rd11, %r1, 4;\n"
3845 " add.u64 %rd12, %rd9, %rd11;\n"
3846 " st.global.f32 [%rd12+0], %f21;\n"
3847 "$Lt_34_15874:\n"
3848 " .loc 3 548 0\n"
3849 " exit;\n"
3850 "$LDWend_reduce_uchar_512_true:\n"
3851 " } // reduce_uchar_512_true\n"
3852 "\n"
3853 " .entry reduce_uchar_1_false (\n"
3854 " .param .u64 __cudaparm_reduce_uchar_1_false_g_idata,\n"
3855 " .param .u64 __cudaparm_reduce_uchar_1_false_g_odata,\n"
3856 " .param .u32 __cudaparm_reduce_uchar_1_false_n)\n"
3857 " {\n"
3858 " .reg .u16 %rh<3>;\n"
3859 " .reg .u32 %r<13>;\n"
3860 " .reg .u64 %rd<14>;\n"
3861 " .reg .f32 %f<6>;\n"
3862 " .reg .pred %p<6>;\n"
3863 " .loc 3 551 0\n"
3864 "$LDWbegin_reduce_uchar_1_false:\n"
3865 " .loc 3 181 0\n"
3866 " cvt.u32.u16 %r1, %ctaid.x;\n"
3867 " mul24.lo.u32 %r2, %r1, 2;\n"
3868 " cvt.u32.u16 %r3, %tid.x;\n"
3869 " add.u32 %r4, %r2, %r3;\n"
3870 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_1_false_n];\n"
3871 " setp.ge.u32 %p1, %r4, %r5;\n"
3872 " @%p1 bra $Lt_35_17154;\n"
3873 " add.u32 %r6, %r4, 1;\n"
3874 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_1_false_n];\n"
3875 " add.u32 %r7, %r5, 1;\n"
3876 " mov.u16 %rh1, %nctaid.x;\n"
3877 " mul.wide.u16 %r8, %rh1, 2;\n"
3878 " cvt.u64.u32 %rd1, %r4;\n"
3879 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_1_false_g_idata];\n"
3880 " add.u64 %rd3, %rd1, %rd2;\n"
3881 " cvt.s64.u32 %rd4, %r8;\n"
3882 " mov.f32 %f1, 0f00000000; // 0\n"
3883 "$Lt_35_15618:\n"
3884 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3885 " .loc 3 188 0\n"
3886 " ld.global.u8 %r9, [%rd3+0];\n"
3887 " cvt.rn.f32.u32 %f2, %r9;\n"
3888 " add.f32 %f1, %f2, %f1;\n"
3889 " .loc 3 181 0\n"
3890 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_1_false_n];\n"
3891 " .loc 3 188 0\n"
3892 " setp.ge.u32 %p2, %r6, %r5;\n"
3893 " @%p2 bra $Lt_35_15874;\n"
3894 " //<loop> Part of loop body line 181, head labeled $Lt_35_15618\n"
3895 " .loc 3 191 0\n"
3896 " ld.global.u8 %r10, [%rd3+1];\n"
3897 " cvt.rn.f32.u32 %f3, %r10;\n"
3898 " add.f32 %f1, %f3, %f1;\n"
3899 "$Lt_35_15874:\n"
3900 " //<loop> Part of loop body line 181, head labeled $Lt_35_15618\n"
3901 " add.u32 %r6, %r6, %r8;\n"
3902 " add.u64 %rd3, %rd4, %rd3;\n"
3903 " setp.lt.u32 %p3, %r6, %r7;\n"
3904 " @%p3 bra $Lt_35_15618;\n"
3905 " bra.uni $Lt_35_15106;\n"
3906 "$Lt_35_17154:\n"
3907 " mov.f32 %f1, 0f00000000; // 0\n"
3908 "$Lt_35_15106:\n"
3909 " .loc 3 71 0\n"
3910 " mov.u64 %rd5, __smem;\n"
3911 " cvt.u64.u32 %rd6, %r3;\n"
3912 " mul.wide.u32 %rd7, %r3, 4;\n"
3913 " add.u64 %rd8, %rd5, %rd7;\n"
3914 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3915 " .loc 3 72 0\n"
3916 " bar.sync 0;\n"
3917 " .loc 3 195 0\n"
3918 " mov.u32 %r11, 0;\n"
3919 " setp.ne.u32 %p4, %r3, %r11;\n"
3920 " @%p4 bra $Lt_35_16642;\n"
3921 " .loc 3 199 0\n"
3922 " ld.shared.f32 %f4, [__smem+0];\n"
3923 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_1_false_g_odata];\n"
3924 " cvt.u64.u32 %rd10, %r1;\n"
3925 " mul.wide.u32 %rd11, %r1, 4;\n"
3926 " add.u64 %rd12, %rd9, %rd11;\n"
3927 " st.global.f32 [%rd12+0], %f4;\n"
3928 "$Lt_35_16642:\n"
3929 " .loc 3 554 0\n"
3930 " exit;\n"
3931 "$LDWend_reduce_uchar_1_false:\n"
3932 " } // reduce_uchar_1_false\n"
3933 "\n"
3934 " .entry reduce_uchar_2_false (\n"
3935 " .param .u64 __cudaparm_reduce_uchar_2_false_g_idata,\n"
3936 " .param .u64 __cudaparm_reduce_uchar_2_false_g_odata,\n"
3937 " .param .u32 __cudaparm_reduce_uchar_2_false_n)\n"
3938 " {\n"
3939 " .reg .u16 %rh<3>;\n"
3940 " .reg .u32 %r<14>;\n"
3941 " .reg .u64 %rd<14>;\n"
3942 " .reg .f32 %f<8>;\n"
3943 " .reg .pred %p<7>;\n"
3944 " .loc 3 556 0\n"
3945 "$LDWbegin_reduce_uchar_2_false:\n"
3946 " .loc 3 181 0\n"
3947 " cvt.u32.u16 %r1, %ctaid.x;\n"
3948 " mul24.lo.u32 %r2, %r1, 4;\n"
3949 " cvt.u32.u16 %r3, %tid.x;\n"
3950 " add.u32 %r4, %r2, %r3;\n"
3951 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_2_false_n];\n"
3952 " setp.ge.u32 %p1, %r4, %r5;\n"
3953 " @%p1 bra $Lt_36_17410;\n"
3954 " add.u32 %r6, %r4, 2;\n"
3955 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_2_false_n];\n"
3956 " add.u32 %r7, %r5, 2;\n"
3957 " mov.u16 %rh1, %nctaid.x;\n"
3958 " mul.wide.u16 %r8, %rh1, 4;\n"
3959 " cvt.u64.u32 %rd1, %r4;\n"
3960 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_2_false_g_idata];\n"
3961 " add.u64 %rd3, %rd1, %rd2;\n"
3962 " cvt.s64.u32 %rd4, %r8;\n"
3963 " mov.f32 %f1, 0f00000000; // 0\n"
3964 "$Lt_36_15362:\n"
3965 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
3966 " .loc 3 188 0\n"
3967 " ld.global.u8 %r9, [%rd3+0];\n"
3968 " cvt.rn.f32.u32 %f2, %r9;\n"
3969 " add.f32 %f1, %f2, %f1;\n"
3970 " .loc 3 181 0\n"
3971 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_2_false_n];\n"
3972 " .loc 3 188 0\n"
3973 " setp.ge.u32 %p2, %r6, %r5;\n"
3974 " @%p2 bra $Lt_36_15618;\n"
3975 " //<loop> Part of loop body line 181, head labeled $Lt_36_15362\n"
3976 " .loc 3 191 0\n"
3977 " ld.global.u8 %r10, [%rd3+2];\n"
3978 " cvt.rn.f32.u32 %f3, %r10;\n"
3979 " add.f32 %f1, %f3, %f1;\n"
3980 "$Lt_36_15618:\n"
3981 " //<loop> Part of loop body line 181, head labeled $Lt_36_15362\n"
3982 " add.u32 %r6, %r6, %r8;\n"
3983 " add.u64 %rd3, %rd4, %rd3;\n"
3984 " setp.lt.u32 %p3, %r6, %r7;\n"
3985 " @%p3 bra $Lt_36_15362;\n"
3986 " bra.uni $Lt_36_14850;\n"
3987 "$Lt_36_17410:\n"
3988 " mov.f32 %f1, 0f00000000; // 0\n"
3989 "$Lt_36_14850:\n"
3990 " .loc 3 71 0\n"
3991 " mov.u64 %rd5, __smem;\n"
3992 " cvt.u64.u32 %rd6, %r3;\n"
3993 " mul.wide.u32 %rd7, %r3, 4;\n"
3994 " add.u64 %rd8, %rd5, %rd7;\n"
3995 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
3996 " .loc 3 72 0\n"
3997 " bar.sync 0;\n"
3998 " mov.u32 %r11, 31;\n"
3999 " setp.gt.u32 %p4, %r3, %r11;\n"
4000 " @%p4 bra $Lt_36_16386;\n"
4001 " .loc 3 88 0\n"
4002 " ld.volatile.shared.f32 %f4, [%rd8+4];\n"
4003 " add.f32 %f5, %f4, %f1;\n"
4004 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4005 "$Lt_36_16386:\n"
4006 " .loc 3 195 0\n"
4007 " mov.u32 %r12, 0;\n"
4008 " setp.ne.u32 %p5, %r3, %r12;\n"
4009 " @%p5 bra $Lt_36_16898;\n"
4010 " .loc 3 199 0\n"
4011 " ld.shared.f32 %f6, [__smem+0];\n"
4012 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_2_false_g_odata];\n"
4013 " cvt.u64.u32 %rd10, %r1;\n"
4014 " mul.wide.u32 %rd11, %r1, 4;\n"
4015 " add.u64 %rd12, %rd9, %rd11;\n"
4016 " st.global.f32 [%rd12+0], %f6;\n"
4017 "$Lt_36_16898:\n"
4018 " .loc 3 559 0\n"
4019 " exit;\n"
4020 "$LDWend_reduce_uchar_2_false:\n"
4021 " } // reduce_uchar_2_false\n"
4022 "\n"
4023 " .entry reduce_uchar_4_false (\n"
4024 " .param .u64 __cudaparm_reduce_uchar_4_false_g_idata,\n"
4025 " .param .u64 __cudaparm_reduce_uchar_4_false_g_odata,\n"
4026 " .param .u32 __cudaparm_reduce_uchar_4_false_n)\n"
4027 " {\n"
4028 " .reg .u16 %rh<3>;\n"
4029 " .reg .u32 %r<14>;\n"
4030 " .reg .u64 %rd<14>;\n"
4031 " .reg .f32 %f<10>;\n"
4032 " .reg .pred %p<7>;\n"
4033 " .loc 3 561 0\n"
4034 "$LDWbegin_reduce_uchar_4_false:\n"
4035 " .loc 3 181 0\n"
4036 " cvt.u32.u16 %r1, %ctaid.x;\n"
4037 " mul24.lo.u32 %r2, %r1, 8;\n"
4038 " cvt.u32.u16 %r3, %tid.x;\n"
4039 " add.u32 %r4, %r2, %r3;\n"
4040 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_4_false_n];\n"
4041 " setp.ge.u32 %p1, %r4, %r5;\n"
4042 " @%p1 bra $Lt_37_17154;\n"
4043 " add.u32 %r6, %r4, 4;\n"
4044 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_4_false_n];\n"
4045 " add.u32 %r7, %r5, 4;\n"
4046 " mov.u16 %rh1, %nctaid.x;\n"
4047 " mul.wide.u16 %r8, %rh1, 8;\n"
4048 " cvt.u64.u32 %rd1, %r4;\n"
4049 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_4_false_g_idata];\n"
4050 " add.u64 %rd3, %rd1, %rd2;\n"
4051 " cvt.s64.u32 %rd4, %r8;\n"
4052 " mov.f32 %f1, 0f00000000; // 0\n"
4053 "$Lt_37_15106:\n"
4054 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4055 " .loc 3 188 0\n"
4056 " ld.global.u8 %r9, [%rd3+0];\n"
4057 " cvt.rn.f32.u32 %f2, %r9;\n"
4058 " add.f32 %f1, %f2, %f1;\n"
4059 " .loc 3 181 0\n"
4060 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_4_false_n];\n"
4061 " .loc 3 188 0\n"
4062 " setp.ge.u32 %p2, %r6, %r5;\n"
4063 " @%p2 bra $Lt_37_15362;\n"
4064 " //<loop> Part of loop body line 181, head labeled $Lt_37_15106\n"
4065 " .loc 3 191 0\n"
4066 " ld.global.u8 %r10, [%rd3+4];\n"
4067 " cvt.rn.f32.u32 %f3, %r10;\n"
4068 " add.f32 %f1, %f3, %f1;\n"
4069 "$Lt_37_15362:\n"
4070 " //<loop> Part of loop body line 181, head labeled $Lt_37_15106\n"
4071 " add.u32 %r6, %r6, %r8;\n"
4072 " add.u64 %rd3, %rd4, %rd3;\n"
4073 " setp.lt.u32 %p3, %r6, %r7;\n"
4074 " @%p3 bra $Lt_37_15106;\n"
4075 " bra.uni $Lt_37_14594;\n"
4076 "$Lt_37_17154:\n"
4077 " mov.f32 %f1, 0f00000000; // 0\n"
4078 "$Lt_37_14594:\n"
4079 " .loc 3 71 0\n"
4080 " mov.u64 %rd5, __smem;\n"
4081 " cvt.u64.u32 %rd6, %r3;\n"
4082 " mul.wide.u32 %rd7, %r3, 4;\n"
4083 " add.u64 %rd8, %rd5, %rd7;\n"
4084 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4085 " .loc 3 72 0\n"
4086 " bar.sync 0;\n"
4087 " mov.u32 %r11, 31;\n"
4088 " setp.gt.u32 %p4, %r3, %r11;\n"
4089 " @%p4 bra $Lt_37_16130;\n"
4090 " .loc 3 87 0\n"
4091 " ld.volatile.shared.f32 %f4, [%rd8+8];\n"
4092 " add.f32 %f5, %f4, %f1;\n"
4093 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4094 " .loc 3 88 0\n"
4095 " ld.volatile.shared.f32 %f6, [%rd8+4];\n"
4096 " add.f32 %f7, %f6, %f5;\n"
4097 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4098 "$Lt_37_16130:\n"
4099 " .loc 3 195 0\n"
4100 " mov.u32 %r12, 0;\n"
4101 " setp.ne.u32 %p5, %r3, %r12;\n"
4102 " @%p5 bra $Lt_37_16642;\n"
4103 " .loc 3 199 0\n"
4104 " ld.shared.f32 %f8, [__smem+0];\n"
4105 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_4_false_g_odata];\n"
4106 " cvt.u64.u32 %rd10, %r1;\n"
4107 " mul.wide.u32 %rd11, %r1, 4;\n"
4108 " add.u64 %rd12, %rd9, %rd11;\n"
4109 " st.global.f32 [%rd12+0], %f8;\n"
4110 "$Lt_37_16642:\n"
4111 " .loc 3 564 0\n"
4112 " exit;\n"
4113 "$LDWend_reduce_uchar_4_false:\n"
4114 " } // reduce_uchar_4_false\n"
4115 "\n"
4116 " .entry reduce_uchar_8_false (\n"
4117 " .param .u64 __cudaparm_reduce_uchar_8_false_g_idata,\n"
4118 " .param .u64 __cudaparm_reduce_uchar_8_false_g_odata,\n"
4119 " .param .u32 __cudaparm_reduce_uchar_8_false_n)\n"
4120 " {\n"
4121 " .reg .u16 %rh<3>;\n"
4122 " .reg .u32 %r<14>;\n"
4123 " .reg .u64 %rd<14>;\n"
4124 " .reg .f32 %f<12>;\n"
4125 " .reg .pred %p<7>;\n"
4126 " .loc 3 566 0\n"
4127 "$LDWbegin_reduce_uchar_8_false:\n"
4128 " .loc 3 181 0\n"
4129 " cvt.u32.u16 %r1, %ctaid.x;\n"
4130 " mul24.lo.u32 %r2, %r1, 16;\n"
4131 " cvt.u32.u16 %r3, %tid.x;\n"
4132 " add.u32 %r4, %r2, %r3;\n"
4133 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_8_false_n];\n"
4134 " setp.ge.u32 %p1, %r4, %r5;\n"
4135 " @%p1 bra $Lt_38_16898;\n"
4136 " add.u32 %r6, %r4, 8;\n"
4137 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_8_false_n];\n"
4138 " add.u32 %r7, %r5, 8;\n"
4139 " mov.u16 %rh1, %nctaid.x;\n"
4140 " mul.wide.u16 %r8, %rh1, 16;\n"
4141 " cvt.u64.u32 %rd1, %r4;\n"
4142 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_8_false_g_idata];\n"
4143 " add.u64 %rd3, %rd1, %rd2;\n"
4144 " cvt.s64.u32 %rd4, %r8;\n"
4145 " mov.f32 %f1, 0f00000000; // 0\n"
4146 "$Lt_38_14850:\n"
4147 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4148 " .loc 3 188 0\n"
4149 " ld.global.u8 %r9, [%rd3+0];\n"
4150 " cvt.rn.f32.u32 %f2, %r9;\n"
4151 " add.f32 %f1, %f2, %f1;\n"
4152 " .loc 3 181 0\n"
4153 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_8_false_n];\n"
4154 " .loc 3 188 0\n"
4155 " setp.ge.u32 %p2, %r6, %r5;\n"
4156 " @%p2 bra $Lt_38_15106;\n"
4157 " //<loop> Part of loop body line 181, head labeled $Lt_38_14850\n"
4158 " .loc 3 191 0\n"
4159 " ld.global.u8 %r10, [%rd3+8];\n"
4160 " cvt.rn.f32.u32 %f3, %r10;\n"
4161 " add.f32 %f1, %f3, %f1;\n"
4162 "$Lt_38_15106:\n"
4163 " //<loop> Part of loop body line 181, head labeled $Lt_38_14850\n"
4164 " add.u32 %r6, %r6, %r8;\n"
4165 " add.u64 %rd3, %rd4, %rd3;\n"
4166 " setp.lt.u32 %p3, %r6, %r7;\n"
4167 " @%p3 bra $Lt_38_14850;\n"
4168 " bra.uni $Lt_38_14338;\n"
4169 "$Lt_38_16898:\n"
4170 " mov.f32 %f1, 0f00000000; // 0\n"
4171 "$Lt_38_14338:\n"
4172 " .loc 3 71 0\n"
4173 " mov.u64 %rd5, __smem;\n"
4174 " cvt.u64.u32 %rd6, %r3;\n"
4175 " mul.wide.u32 %rd7, %r3, 4;\n"
4176 " add.u64 %rd8, %rd5, %rd7;\n"
4177 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4178 " .loc 3 72 0\n"
4179 " bar.sync 0;\n"
4180 " mov.u32 %r11, 31;\n"
4181 " setp.gt.u32 %p4, %r3, %r11;\n"
4182 " @%p4 bra $Lt_38_15874;\n"
4183 " .loc 3 86 0\n"
4184 " ld.volatile.shared.f32 %f4, [%rd8+16];\n"
4185 " add.f32 %f5, %f4, %f1;\n"
4186 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4187 " .loc 3 87 0\n"
4188 " ld.volatile.shared.f32 %f6, [%rd8+8];\n"
4189 " add.f32 %f7, %f6, %f5;\n"
4190 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4191 " .loc 3 88 0\n"
4192 " ld.volatile.shared.f32 %f8, [%rd8+4];\n"
4193 " add.f32 %f9, %f8, %f7;\n"
4194 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4195 "$Lt_38_15874:\n"
4196 " .loc 3 195 0\n"
4197 " mov.u32 %r12, 0;\n"
4198 " setp.ne.u32 %p5, %r3, %r12;\n"
4199 " @%p5 bra $Lt_38_16386;\n"
4200 " .loc 3 199 0\n"
4201 " ld.shared.f32 %f10, [__smem+0];\n"
4202 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_8_false_g_odata];\n"
4203 " cvt.u64.u32 %rd10, %r1;\n"
4204 " mul.wide.u32 %rd11, %r1, 4;\n"
4205 " add.u64 %rd12, %rd9, %rd11;\n"
4206 " st.global.f32 [%rd12+0], %f10;\n"
4207 "$Lt_38_16386:\n"
4208 " .loc 3 569 0\n"
4209 " exit;\n"
4210 "$LDWend_reduce_uchar_8_false:\n"
4211 " } // reduce_uchar_8_false\n"
4212 "\n"
4213 " .entry reduce_uchar_16_false (\n"
4214 " .param .u64 __cudaparm_reduce_uchar_16_false_g_idata,\n"
4215 " .param .u64 __cudaparm_reduce_uchar_16_false_g_odata,\n"
4216 " .param .u32 __cudaparm_reduce_uchar_16_false_n)\n"
4217 " {\n"
4218 " .reg .u16 %rh<3>;\n"
4219 " .reg .u32 %r<14>;\n"
4220 " .reg .u64 %rd<14>;\n"
4221 " .reg .f32 %f<14>;\n"
4222 " .reg .pred %p<7>;\n"
4223 " .loc 3 571 0\n"
4224 "$LDWbegin_reduce_uchar_16_false:\n"
4225 " .loc 3 181 0\n"
4226 " cvt.u32.u16 %r1, %ctaid.x;\n"
4227 " mul24.lo.u32 %r2, %r1, 32;\n"
4228 " cvt.u32.u16 %r3, %tid.x;\n"
4229 " add.u32 %r4, %r2, %r3;\n"
4230 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_16_false_n];\n"
4231 " setp.ge.u32 %p1, %r4, %r5;\n"
4232 " @%p1 bra $Lt_39_16642;\n"
4233 " add.u32 %r6, %r4, 16;\n"
4234 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_16_false_n];\n"
4235 " add.u32 %r7, %r5, 16;\n"
4236 " mov.u16 %rh1, %nctaid.x;\n"
4237 " mul.wide.u16 %r8, %rh1, 32;\n"
4238 " cvt.u64.u32 %rd1, %r4;\n"
4239 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_16_false_g_idata];\n"
4240 " add.u64 %rd3, %rd1, %rd2;\n"
4241 " cvt.s64.u32 %rd4, %r8;\n"
4242 " mov.f32 %f1, 0f00000000; // 0\n"
4243 "$Lt_39_14594:\n"
4244 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4245 " .loc 3 188 0\n"
4246 " ld.global.u8 %r9, [%rd3+0];\n"
4247 " cvt.rn.f32.u32 %f2, %r9;\n"
4248 " add.f32 %f1, %f2, %f1;\n"
4249 " .loc 3 181 0\n"
4250 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_16_false_n];\n"
4251 " .loc 3 188 0\n"
4252 " setp.ge.u32 %p2, %r6, %r5;\n"
4253 " @%p2 bra $Lt_39_14850;\n"
4254 " //<loop> Part of loop body line 181, head labeled $Lt_39_14594\n"
4255 " .loc 3 191 0\n"
4256 " ld.global.u8 %r10, [%rd3+16];\n"
4257 " cvt.rn.f32.u32 %f3, %r10;\n"
4258 " add.f32 %f1, %f3, %f1;\n"
4259 "$Lt_39_14850:\n"
4260 " //<loop> Part of loop body line 181, head labeled $Lt_39_14594\n"
4261 " add.u32 %r6, %r6, %r8;\n"
4262 " add.u64 %rd3, %rd4, %rd3;\n"
4263 " setp.lt.u32 %p3, %r6, %r7;\n"
4264 " @%p3 bra $Lt_39_14594;\n"
4265 " bra.uni $Lt_39_14082;\n"
4266 "$Lt_39_16642:\n"
4267 " mov.f32 %f1, 0f00000000; // 0\n"
4268 "$Lt_39_14082:\n"
4269 " .loc 3 71 0\n"
4270 " mov.u64 %rd5, __smem;\n"
4271 " cvt.u64.u32 %rd6, %r3;\n"
4272 " mul.wide.u32 %rd7, %r3, 4;\n"
4273 " add.u64 %rd8, %rd5, %rd7;\n"
4274 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4275 " .loc 3 72 0\n"
4276 " bar.sync 0;\n"
4277 " mov.u32 %r11, 31;\n"
4278 " setp.gt.u32 %p4, %r3, %r11;\n"
4279 " @%p4 bra $Lt_39_15618;\n"
4280 " .loc 3 85 0\n"
4281 " ld.volatile.shared.f32 %f4, [%rd8+32];\n"
4282 " add.f32 %f5, %f4, %f1;\n"
4283 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4284 " .loc 3 86 0\n"
4285 " ld.volatile.shared.f32 %f6, [%rd8+16];\n"
4286 " add.f32 %f7, %f6, %f5;\n"
4287 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4288 " .loc 3 87 0\n"
4289 " ld.volatile.shared.f32 %f8, [%rd8+8];\n"
4290 " add.f32 %f9, %f8, %f7;\n"
4291 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4292 " .loc 3 88 0\n"
4293 " ld.volatile.shared.f32 %f10, [%rd8+4];\n"
4294 " add.f32 %f11, %f10, %f9;\n"
4295 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4296 "$Lt_39_15618:\n"
4297 " .loc 3 195 0\n"
4298 " mov.u32 %r12, 0;\n"
4299 " setp.ne.u32 %p5, %r3, %r12;\n"
4300 " @%p5 bra $Lt_39_16130;\n"
4301 " .loc 3 199 0\n"
4302 " ld.shared.f32 %f12, [__smem+0];\n"
4303 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_16_false_g_odata];\n"
4304 " cvt.u64.u32 %rd10, %r1;\n"
4305 " mul.wide.u32 %rd11, %r1, 4;\n"
4306 " add.u64 %rd12, %rd9, %rd11;\n"
4307 " st.global.f32 [%rd12+0], %f12;\n"
4308 "$Lt_39_16130:\n"
4309 " .loc 3 574 0\n"
4310 " exit;\n"
4311 "$LDWend_reduce_uchar_16_false:\n"
4312 " } // reduce_uchar_16_false\n"
4313 "\n"
4314 " .entry reduce_uchar_32_false (\n"
4315 " .param .u64 __cudaparm_reduce_uchar_32_false_g_idata,\n"
4316 " .param .u64 __cudaparm_reduce_uchar_32_false_g_odata,\n"
4317 " .param .u32 __cudaparm_reduce_uchar_32_false_n)\n"
4318 " {\n"
4319 " .reg .u16 %rh<3>;\n"
4320 " .reg .u32 %r<14>;\n"
4321 " .reg .u64 %rd<14>;\n"
4322 " .reg .f32 %f<16>;\n"
4323 " .reg .pred %p<7>;\n"
4324 " .loc 3 576 0\n"
4325 "$LDWbegin_reduce_uchar_32_false:\n"
4326 " .loc 3 181 0\n"
4327 " cvt.u32.u16 %r1, %ctaid.x;\n"
4328 " mul24.lo.u32 %r2, %r1, 64;\n"
4329 " cvt.u32.u16 %r3, %tid.x;\n"
4330 " add.u32 %r4, %r2, %r3;\n"
4331 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_32_false_n];\n"
4332 " setp.ge.u32 %p1, %r4, %r5;\n"
4333 " @%p1 bra $Lt_40_16386;\n"
4334 " add.u32 %r6, %r4, 32;\n"
4335 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_32_false_n];\n"
4336 " add.u32 %r7, %r5, 32;\n"
4337 " mov.u16 %rh1, %nctaid.x;\n"
4338 " mul.wide.u16 %r8, %rh1, 64;\n"
4339 " cvt.u64.u32 %rd1, %r4;\n"
4340 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_32_false_g_idata];\n"
4341 " add.u64 %rd3, %rd1, %rd2;\n"
4342 " cvt.s64.u32 %rd4, %r8;\n"
4343 " mov.f32 %f1, 0f00000000; // 0\n"
4344 "$Lt_40_14338:\n"
4345 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4346 " .loc 3 188 0\n"
4347 " ld.global.u8 %r9, [%rd3+0];\n"
4348 " cvt.rn.f32.u32 %f2, %r9;\n"
4349 " add.f32 %f1, %f2, %f1;\n"
4350 " .loc 3 181 0\n"
4351 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_32_false_n];\n"
4352 " .loc 3 188 0\n"
4353 " setp.ge.u32 %p2, %r6, %r5;\n"
4354 " @%p2 bra $Lt_40_14594;\n"
4355 " //<loop> Part of loop body line 181, head labeled $Lt_40_14338\n"
4356 " .loc 3 191 0\n"
4357 " ld.global.u8 %r10, [%rd3+32];\n"
4358 " cvt.rn.f32.u32 %f3, %r10;\n"
4359 " add.f32 %f1, %f3, %f1;\n"
4360 "$Lt_40_14594:\n"
4361 " //<loop> Part of loop body line 181, head labeled $Lt_40_14338\n"
4362 " add.u32 %r6, %r6, %r8;\n"
4363 " add.u64 %rd3, %rd4, %rd3;\n"
4364 " setp.lt.u32 %p3, %r6, %r7;\n"
4365 " @%p3 bra $Lt_40_14338;\n"
4366 " bra.uni $Lt_40_13826;\n"
4367 "$Lt_40_16386:\n"
4368 " mov.f32 %f1, 0f00000000; // 0\n"
4369 "$Lt_40_13826:\n"
4370 " .loc 3 71 0\n"
4371 " mov.u64 %rd5, __smem;\n"
4372 " cvt.u64.u32 %rd6, %r3;\n"
4373 " mul.wide.u32 %rd7, %r3, 4;\n"
4374 " add.u64 %rd8, %rd5, %rd7;\n"
4375 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4376 " .loc 3 72 0\n"
4377 " bar.sync 0;\n"
4378 " mov.u32 %r11, 31;\n"
4379 " setp.gt.u32 %p4, %r3, %r11;\n"
4380 " @%p4 bra $Lt_40_15362;\n"
4381 " .loc 3 84 0\n"
4382 " ld.volatile.shared.f32 %f4, [%rd8+64];\n"
4383 " add.f32 %f5, %f4, %f1;\n"
4384 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4385 " .loc 3 85 0\n"
4386 " ld.volatile.shared.f32 %f6, [%rd8+32];\n"
4387 " add.f32 %f7, %f6, %f5;\n"
4388 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4389 " .loc 3 86 0\n"
4390 " ld.volatile.shared.f32 %f8, [%rd8+16];\n"
4391 " add.f32 %f9, %f8, %f7;\n"
4392 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4393 " .loc 3 87 0\n"
4394 " ld.volatile.shared.f32 %f10, [%rd8+8];\n"
4395 " add.f32 %f11, %f10, %f9;\n"
4396 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4397 " .loc 3 88 0\n"
4398 " ld.volatile.shared.f32 %f12, [%rd8+4];\n"
4399 " add.f32 %f13, %f12, %f11;\n"
4400 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
4401 "$Lt_40_15362:\n"
4402 " .loc 3 195 0\n"
4403 " mov.u32 %r12, 0;\n"
4404 " setp.ne.u32 %p5, %r3, %r12;\n"
4405 " @%p5 bra $Lt_40_15874;\n"
4406 " .loc 3 199 0\n"
4407 " ld.shared.f32 %f14, [__smem+0];\n"
4408 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_32_false_g_odata];\n"
4409 " cvt.u64.u32 %rd10, %r1;\n"
4410 " mul.wide.u32 %rd11, %r1, 4;\n"
4411 " add.u64 %rd12, %rd9, %rd11;\n"
4412 " st.global.f32 [%rd12+0], %f14;\n"
4413 "$Lt_40_15874:\n"
4414 " .loc 3 579 0\n"
4415 " exit;\n"
4416 "$LDWend_reduce_uchar_32_false:\n"
4417 " } // reduce_uchar_32_false\n"
4418 "\n"
4419 " .entry reduce_uchar_64_false (\n"
4420 " .param .u64 __cudaparm_reduce_uchar_64_false_g_idata,\n"
4421 " .param .u64 __cudaparm_reduce_uchar_64_false_g_odata,\n"
4422 " .param .u32 __cudaparm_reduce_uchar_64_false_n)\n"
4423 " {\n"
4424 " .reg .u16 %rh<3>;\n"
4425 " .reg .u32 %r<14>;\n"
4426 " .reg .u64 %rd<14>;\n"
4427 " .reg .f32 %f<18>;\n"
4428 " .reg .pred %p<7>;\n"
4429 " .loc 3 581 0\n"
4430 "$LDWbegin_reduce_uchar_64_false:\n"
4431 " .loc 3 181 0\n"
4432 " cvt.u32.u16 %r1, %ctaid.x;\n"
4433 " mul24.lo.u32 %r2, %r1, 128;\n"
4434 " cvt.u32.u16 %r3, %tid.x;\n"
4435 " add.u32 %r4, %r2, %r3;\n"
4436 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_64_false_n];\n"
4437 " setp.ge.u32 %p1, %r4, %r5;\n"
4438 " @%p1 bra $Lt_41_16130;\n"
4439 " add.u32 %r6, %r4, 64;\n"
4440 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_64_false_n];\n"
4441 " add.u32 %r7, %r5, 64;\n"
4442 " mov.u16 %rh1, %nctaid.x;\n"
4443 " mul.wide.u16 %r8, %rh1, 128;\n"
4444 " cvt.u64.u32 %rd1, %r4;\n"
4445 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_64_false_g_idata];\n"
4446 " add.u64 %rd3, %rd1, %rd2;\n"
4447 " cvt.s64.u32 %rd4, %r8;\n"
4448 " mov.f32 %f1, 0f00000000; // 0\n"
4449 "$Lt_41_14082:\n"
4450 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4451 " .loc 3 188 0\n"
4452 " ld.global.u8 %r9, [%rd3+0];\n"
4453 " cvt.rn.f32.u32 %f2, %r9;\n"
4454 " add.f32 %f1, %f2, %f1;\n"
4455 " .loc 3 181 0\n"
4456 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_64_false_n];\n"
4457 " .loc 3 188 0\n"
4458 " setp.ge.u32 %p2, %r6, %r5;\n"
4459 " @%p2 bra $Lt_41_14338;\n"
4460 " //<loop> Part of loop body line 181, head labeled $Lt_41_14082\n"
4461 " .loc 3 191 0\n"
4462 " ld.global.u8 %r10, [%rd3+64];\n"
4463 " cvt.rn.f32.u32 %f3, %r10;\n"
4464 " add.f32 %f1, %f3, %f1;\n"
4465 "$Lt_41_14338:\n"
4466 " //<loop> Part of loop body line 181, head labeled $Lt_41_14082\n"
4467 " add.u32 %r6, %r6, %r8;\n"
4468 " add.u64 %rd3, %rd4, %rd3;\n"
4469 " setp.lt.u32 %p3, %r6, %r7;\n"
4470 " @%p3 bra $Lt_41_14082;\n"
4471 " bra.uni $Lt_41_13570;\n"
4472 "$Lt_41_16130:\n"
4473 " mov.f32 %f1, 0f00000000; // 0\n"
4474 "$Lt_41_13570:\n"
4475 " .loc 3 71 0\n"
4476 " mov.u64 %rd5, __smem;\n"
4477 " cvt.u64.u32 %rd6, %r3;\n"
4478 " mul.wide.u32 %rd7, %r3, 4;\n"
4479 " add.u64 %rd8, %rd5, %rd7;\n"
4480 " st.volatile.shared.f32 [%rd8+0], %f1;\n"
4481 " .loc 3 72 0\n"
4482 " bar.sync 0;\n"
4483 " mov.u32 %r11, 31;\n"
4484 " setp.gt.u32 %p4, %r3, %r11;\n"
4485 " @%p4 bra $Lt_41_15106;\n"
4486 " .loc 3 83 0\n"
4487 " ld.volatile.shared.f32 %f4, [%rd8+128];\n"
4488 " add.f32 %f5, %f4, %f1;\n"
4489 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4490 " .loc 3 84 0\n"
4491 " ld.volatile.shared.f32 %f6, [%rd8+64];\n"
4492 " add.f32 %f7, %f6, %f5;\n"
4493 " st.volatile.shared.f32 [%rd8+0], %f7;\n"
4494 " .loc 3 85 0\n"
4495 " ld.volatile.shared.f32 %f8, [%rd8+32];\n"
4496 " add.f32 %f9, %f8, %f7;\n"
4497 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4498 " .loc 3 86 0\n"
4499 " ld.volatile.shared.f32 %f10, [%rd8+16];\n"
4500 " add.f32 %f11, %f10, %f9;\n"
4501 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4502 " .loc 3 87 0\n"
4503 " ld.volatile.shared.f32 %f12, [%rd8+8];\n"
4504 " add.f32 %f13, %f12, %f11;\n"
4505 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
4506 " .loc 3 88 0\n"
4507 " ld.volatile.shared.f32 %f14, [%rd8+4];\n"
4508 " add.f32 %f15, %f14, %f13;\n"
4509 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
4510 "$Lt_41_15106:\n"
4511 " .loc 3 195 0\n"
4512 " mov.u32 %r12, 0;\n"
4513 " setp.ne.u32 %p5, %r3, %r12;\n"
4514 " @%p5 bra $Lt_41_15618;\n"
4515 " .loc 3 199 0\n"
4516 " ld.shared.f32 %f16, [__smem+0];\n"
4517 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_64_false_g_odata];\n"
4518 " cvt.u64.u32 %rd10, %r1;\n"
4519 " mul.wide.u32 %rd11, %r1, 4;\n"
4520 " add.u64 %rd12, %rd9, %rd11;\n"
4521 " st.global.f32 [%rd12+0], %f16;\n"
4522 "$Lt_41_15618:\n"
4523 " .loc 3 584 0\n"
4524 " exit;\n"
4525 "$LDWend_reduce_uchar_64_false:\n"
4526 " } // reduce_uchar_64_false\n"
4527 "\n"
4528 " .entry reduce_uchar_128_false (\n"
4529 " .param .u64 __cudaparm_reduce_uchar_128_false_g_idata,\n"
4530 " .param .u64 __cudaparm_reduce_uchar_128_false_g_odata,\n"
4531 " .param .u32 __cudaparm_reduce_uchar_128_false_n)\n"
4532 " {\n"
4533 " .reg .u16 %rh<3>;\n"
4534 " .reg .u32 %r<15>;\n"
4535 " .reg .u64 %rd<14>;\n"
4536 " .reg .f32 %f<20>;\n"
4537 " .reg .pred %p<8>;\n"
4538 " .loc 3 586 0\n"
4539 "$LDWbegin_reduce_uchar_128_false:\n"
4540 " .loc 3 181 0\n"
4541 " cvt.u32.u16 %r1, %ctaid.x;\n"
4542 " mul.lo.u32 %r2, %r1, 256;\n"
4543 " cvt.u32.u16 %r3, %tid.x;\n"
4544 " add.u32 %r4, %r2, %r3;\n"
4545 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_128_false_n];\n"
4546 " setp.ge.u32 %p1, %r4, %r5;\n"
4547 " @%p1 bra $Lt_42_16386;\n"
4548 " add.u32 %r6, %r4, 128;\n"
4549 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_128_false_n];\n"
4550 " add.u32 %r7, %r5, 128;\n"
4551 " mov.u16 %rh1, %nctaid.x;\n"
4552 " mul.wide.u16 %r8, %rh1, 256;\n"
4553 " cvt.u64.u32 %rd1, %r4;\n"
4554 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_128_false_g_idata];\n"
4555 " add.u64 %rd3, %rd1, %rd2;\n"
4556 " cvt.s64.u32 %rd4, %r8;\n"
4557 " mov.f32 %f1, 0f00000000; // 0\n"
4558 "$Lt_42_13826:\n"
4559 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4560 " .loc 3 188 0\n"
4561 " ld.global.u8 %r9, [%rd3+0];\n"
4562 " cvt.rn.f32.u32 %f2, %r9;\n"
4563 " add.f32 %f1, %f2, %f1;\n"
4564 " .loc 3 181 0\n"
4565 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_128_false_n];\n"
4566 " .loc 3 188 0\n"
4567 " setp.ge.u32 %p2, %r6, %r5;\n"
4568 " @%p2 bra $Lt_42_14082;\n"
4569 " //<loop> Part of loop body line 181, head labeled $Lt_42_13826\n"
4570 " .loc 3 191 0\n"
4571 " ld.global.u8 %r10, [%rd3+128];\n"
4572 " cvt.rn.f32.u32 %f3, %r10;\n"
4573 " add.f32 %f1, %f3, %f1;\n"
4574 "$Lt_42_14082:\n"
4575 " //<loop> Part of loop body line 181, head labeled $Lt_42_13826\n"
4576 " add.u32 %r6, %r6, %r8;\n"
4577 " add.u64 %rd3, %rd4, %rd3;\n"
4578 " setp.lt.u32 %p3, %r6, %r7;\n"
4579 " @%p3 bra $Lt_42_13826;\n"
4580 " bra.uni $Lt_42_13314;\n"
4581 "$Lt_42_16386:\n"
4582 " mov.f32 %f1, 0f00000000; // 0\n"
4583 "$Lt_42_13314:\n"
4584 " .loc 3 195 0\n"
4585 " mov.f32 %f4, %f1;\n"
4586 " mov.f32 %f5, %f4;\n"
4587 " .loc 3 71 0\n"
4588 " mov.u64 %rd5, __smem;\n"
4589 " cvt.u64.u32 %rd6, %r3;\n"
4590 " mul.wide.u32 %rd7, %r3, 4;\n"
4591 " add.u64 %rd8, %rd5, %rd7;\n"
4592 " st.volatile.shared.f32 [%rd8+0], %f4;\n"
4593 " .loc 3 72 0\n"
4594 " bar.sync 0;\n"
4595 " mov.u32 %r11, 63;\n"
4596 " setp.gt.u32 %p4, %r3, %r11;\n"
4597 " @%p4 bra $Lt_42_14850;\n"
4598 " .loc 3 77 0\n"
4599 " ld.volatile.shared.f32 %f6, [%rd8+256];\n"
4600 " add.f32 %f5, %f6, %f4;\n"
4601 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4602 "$Lt_42_14850:\n"
4603 " bar.sync 0;\n"
4604 " mov.u32 %r12, 31;\n"
4605 " setp.gt.u32 %p5, %r3, %r12;\n"
4606 " @%p5 bra $Lt_42_15362;\n"
4607 " .loc 3 83 0\n"
4608 " ld.volatile.shared.f32 %f7, [%rd8+128];\n"
4609 " add.f32 %f8, %f7, %f5;\n"
4610 " st.volatile.shared.f32 [%rd8+0], %f8;\n"
4611 " .loc 3 84 0\n"
4612 " ld.volatile.shared.f32 %f9, [%rd8+64];\n"
4613 " add.f32 %f10, %f9, %f8;\n"
4614 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
4615 " .loc 3 85 0\n"
4616 " ld.volatile.shared.f32 %f11, [%rd8+32];\n"
4617 " add.f32 %f12, %f11, %f10;\n"
4618 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
4619 " .loc 3 86 0\n"
4620 " ld.volatile.shared.f32 %f13, [%rd8+16];\n"
4621 " add.f32 %f14, %f13, %f12;\n"
4622 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
4623 " .loc 3 87 0\n"
4624 " ld.volatile.shared.f32 %f15, [%rd8+8];\n"
4625 " add.f32 %f16, %f15, %f14;\n"
4626 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
4627 " .loc 3 88 0\n"
4628 " ld.volatile.shared.f32 %f17, [%rd8+4];\n"
4629 " add.f32 %f5, %f17, %f16;\n"
4630 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4631 "$Lt_42_15362:\n"
4632 " .loc 3 195 0\n"
4633 " mov.u32 %r13, 0;\n"
4634 " setp.ne.u32 %p6, %r3, %r13;\n"
4635 " @%p6 bra $Lt_42_15874;\n"
4636 " .loc 3 199 0\n"
4637 " ld.shared.f32 %f18, [__smem+0];\n"
4638 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_128_false_g_odata];\n"
4639 " cvt.u64.u32 %rd10, %r1;\n"
4640 " mul.wide.u32 %rd11, %r1, 4;\n"
4641 " add.u64 %rd12, %rd9, %rd11;\n"
4642 " st.global.f32 [%rd12+0], %f18;\n"
4643 "$Lt_42_15874:\n"
4644 " .loc 3 589 0\n"
4645 " exit;\n"
4646 "$LDWend_reduce_uchar_128_false:\n"
4647 " } // reduce_uchar_128_false\n"
4648 "\n"
4649 " .entry reduce_uchar_256_false (\n"
4650 " .param .u64 __cudaparm_reduce_uchar_256_false_g_idata,\n"
4651 " .param .u64 __cudaparm_reduce_uchar_256_false_g_odata,\n"
4652 " .param .u32 __cudaparm_reduce_uchar_256_false_n)\n"
4653 " {\n"
4654 " .reg .u16 %rh<3>;\n"
4655 " .reg .u32 %r<16>;\n"
4656 " .reg .u64 %rd<14>;\n"
4657 " .reg .f32 %f<21>;\n"
4658 " .reg .pred %p<9>;\n"
4659 " .loc 3 591 0\n"
4660 "$LDWbegin_reduce_uchar_256_false:\n"
4661 " .loc 3 181 0\n"
4662 " cvt.u32.u16 %r1, %ctaid.x;\n"
4663 " mul.lo.u32 %r2, %r1, 512;\n"
4664 " cvt.u32.u16 %r3, %tid.x;\n"
4665 " add.u32 %r4, %r2, %r3;\n"
4666 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_256_false_n];\n"
4667 " setp.ge.u32 %p1, %r4, %r5;\n"
4668 " @%p1 bra $Lt_43_16642;\n"
4669 " add.u32 %r6, %r4, 256;\n"
4670 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_256_false_n];\n"
4671 " add.u32 %r7, %r5, 256;\n"
4672 " mov.u16 %rh1, %nctaid.x;\n"
4673 " mul.wide.u16 %r8, %rh1, 512;\n"
4674 " cvt.u64.u32 %rd1, %r4;\n"
4675 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_256_false_g_idata];\n"
4676 " add.u64 %rd3, %rd1, %rd2;\n"
4677 " cvt.s64.u32 %rd4, %r8;\n"
4678 " mov.f32 %f1, 0f00000000; // 0\n"
4679 "$Lt_43_13570:\n"
4680 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4681 " .loc 3 188 0\n"
4682 " ld.global.u8 %r9, [%rd3+0];\n"
4683 " cvt.rn.f32.u32 %f2, %r9;\n"
4684 " add.f32 %f1, %f2, %f1;\n"
4685 " .loc 3 181 0\n"
4686 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_256_false_n];\n"
4687 " .loc 3 188 0\n"
4688 " setp.ge.u32 %p2, %r6, %r5;\n"
4689 " @%p2 bra $Lt_43_13826;\n"
4690 " //<loop> Part of loop body line 181, head labeled $Lt_43_13570\n"
4691 " .loc 3 191 0\n"
4692 " ld.global.u8 %r10, [%rd3+256];\n"
4693 " cvt.rn.f32.u32 %f3, %r10;\n"
4694 " add.f32 %f1, %f3, %f1;\n"
4695 "$Lt_43_13826:\n"
4696 " //<loop> Part of loop body line 181, head labeled $Lt_43_13570\n"
4697 " add.u32 %r6, %r6, %r8;\n"
4698 " add.u64 %rd3, %rd4, %rd3;\n"
4699 " setp.lt.u32 %p3, %r6, %r7;\n"
4700 " @%p3 bra $Lt_43_13570;\n"
4701 " bra.uni $Lt_43_13058;\n"
4702 "$Lt_43_16642:\n"
4703 " mov.f32 %f1, 0f00000000; // 0\n"
4704 "$Lt_43_13058:\n"
4705 " .loc 3 195 0\n"
4706 " mov.f32 %f4, %f1;\n"
4707 " mov.f32 %f5, %f4;\n"
4708 " .loc 3 71 0\n"
4709 " mov.u64 %rd5, __smem;\n"
4710 " cvt.u64.u32 %rd6, %r3;\n"
4711 " mul.wide.u32 %rd7, %r3, 4;\n"
4712 " add.u64 %rd8, %rd5, %rd7;\n"
4713 " st.volatile.shared.f32 [%rd8+0], %f4;\n"
4714 " .loc 3 72 0\n"
4715 " bar.sync 0;\n"
4716 " mov.u32 %r11, 127;\n"
4717 " setp.gt.u32 %p4, %r3, %r11;\n"
4718 " @%p4 bra $Lt_43_14594;\n"
4719 " .loc 3 76 0\n"
4720 " ld.volatile.shared.f32 %f6, [%rd8+512];\n"
4721 " add.f32 %f5, %f6, %f4;\n"
4722 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4723 "$Lt_43_14594:\n"
4724 " bar.sync 0;\n"
4725 " mov.u32 %r12, 63;\n"
4726 " setp.gt.u32 %p5, %r3, %r12;\n"
4727 " @%p5 bra $Lt_43_15106;\n"
4728 " .loc 3 77 0\n"
4729 " ld.volatile.shared.f32 %f7, [%rd8+256];\n"
4730 " add.f32 %f5, %f7, %f5;\n"
4731 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4732 "$Lt_43_15106:\n"
4733 " bar.sync 0;\n"
4734 " mov.u32 %r13, 31;\n"
4735 " setp.gt.u32 %p6, %r3, %r13;\n"
4736 " @%p6 bra $Lt_43_15618;\n"
4737 " .loc 3 83 0\n"
4738 " ld.volatile.shared.f32 %f8, [%rd8+128];\n"
4739 " add.f32 %f9, %f8, %f5;\n"
4740 " st.volatile.shared.f32 [%rd8+0], %f9;\n"
4741 " .loc 3 84 0\n"
4742 " ld.volatile.shared.f32 %f10, [%rd8+64];\n"
4743 " add.f32 %f11, %f10, %f9;\n"
4744 " st.volatile.shared.f32 [%rd8+0], %f11;\n"
4745 " .loc 3 85 0\n"
4746 " ld.volatile.shared.f32 %f12, [%rd8+32];\n"
4747 " add.f32 %f13, %f12, %f11;\n"
4748 " st.volatile.shared.f32 [%rd8+0], %f13;\n"
4749 " .loc 3 86 0\n"
4750 " ld.volatile.shared.f32 %f14, [%rd8+16];\n"
4751 " add.f32 %f15, %f14, %f13;\n"
4752 " st.volatile.shared.f32 [%rd8+0], %f15;\n"
4753 " .loc 3 87 0\n"
4754 " ld.volatile.shared.f32 %f16, [%rd8+8];\n"
4755 " add.f32 %f17, %f16, %f15;\n"
4756 " st.volatile.shared.f32 [%rd8+0], %f17;\n"
4757 " .loc 3 88 0\n"
4758 " ld.volatile.shared.f32 %f18, [%rd8+4];\n"
4759 " add.f32 %f5, %f18, %f17;\n"
4760 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4761 "$Lt_43_15618:\n"
4762 " .loc 3 195 0\n"
4763 " mov.u32 %r14, 0;\n"
4764 " setp.ne.u32 %p7, %r3, %r14;\n"
4765 " @%p7 bra $Lt_43_16130;\n"
4766 " .loc 3 199 0\n"
4767 " ld.shared.f32 %f19, [__smem+0];\n"
4768 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_256_false_g_odata];\n"
4769 " cvt.u64.u32 %rd10, %r1;\n"
4770 " mul.wide.u32 %rd11, %r1, 4;\n"
4771 " add.u64 %rd12, %rd9, %rd11;\n"
4772 " st.global.f32 [%rd12+0], %f19;\n"
4773 "$Lt_43_16130:\n"
4774 " .loc 3 594 0\n"
4775 " exit;\n"
4776 "$LDWend_reduce_uchar_256_false:\n"
4777 " } // reduce_uchar_256_false\n"
4778 "\n"
4779 " .entry reduce_uchar_512_false (\n"
4780 " .param .u64 __cudaparm_reduce_uchar_512_false_g_idata,\n"
4781 " .param .u64 __cudaparm_reduce_uchar_512_false_g_odata,\n"
4782 " .param .u32 __cudaparm_reduce_uchar_512_false_n)\n"
4783 " {\n"
4784 " .reg .u16 %rh<3>;\n"
4785 " .reg .u32 %r<17>;\n"
4786 " .reg .u64 %rd<14>;\n"
4787 " .reg .f32 %f<22>;\n"
4788 " .reg .pred %p<10>;\n"
4789 " .loc 3 596 0\n"
4790 "$LDWbegin_reduce_uchar_512_false:\n"
4791 " .loc 3 181 0\n"
4792 " cvt.u32.u16 %r1, %ctaid.x;\n"
4793 " mul.lo.u32 %r2, %r1, 1024;\n"
4794 " cvt.u32.u16 %r3, %tid.x;\n"
4795 " add.u32 %r4, %r2, %r3;\n"
4796 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_512_false_n];\n"
4797 " setp.ge.u32 %p1, %r4, %r5;\n"
4798 " @%p1 bra $Lt_44_16898;\n"
4799 " add.u32 %r6, %r4, 512;\n"
4800 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_512_false_n];\n"
4801 " add.u32 %r7, %r5, 512;\n"
4802 " mov.u16 %rh1, %nctaid.x;\n"
4803 " mul.wide.u16 %r8, %rh1, 1024;\n"
4804 " cvt.u64.u32 %rd1, %r4;\n"
4805 " ld.param.u64 %rd2, [__cudaparm_reduce_uchar_512_false_g_idata];\n"
4806 " add.u64 %rd3, %rd1, %rd2;\n"
4807 " cvt.s64.u32 %rd4, %r8;\n"
4808 " mov.f32 %f1, 0f00000000; // 0\n"
4809 "$Lt_44_13314:\n"
4810 " //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown\n"
4811 " .loc 3 188 0\n"
4812 " ld.global.u8 %r9, [%rd3+0];\n"
4813 " cvt.rn.f32.u32 %f2, %r9;\n"
4814 " add.f32 %f1, %f2, %f1;\n"
4815 " .loc 3 181 0\n"
4816 " ld.param.u32 %r5, [__cudaparm_reduce_uchar_512_false_n];\n"
4817 " .loc 3 188 0\n"
4818 " setp.ge.u32 %p2, %r6, %r5;\n"
4819 " @%p2 bra $Lt_44_13570;\n"
4820 " //<loop> Part of loop body line 181, head labeled $Lt_44_13314\n"
4821 " .loc 3 191 0\n"
4822 " ld.global.u8 %r10, [%rd3+512];\n"
4823 " cvt.rn.f32.u32 %f3, %r10;\n"
4824 " add.f32 %f1, %f3, %f1;\n"
4825 "$Lt_44_13570:\n"
4826 " //<loop> Part of loop body line 181, head labeled $Lt_44_13314\n"
4827 " add.u32 %r6, %r6, %r8;\n"
4828 " add.u64 %rd3, %rd4, %rd3;\n"
4829 " setp.lt.u32 %p3, %r6, %r7;\n"
4830 " @%p3 bra $Lt_44_13314;\n"
4831 " bra.uni $Lt_44_12802;\n"
4832 "$Lt_44_16898:\n"
4833 " mov.f32 %f1, 0f00000000; // 0\n"
4834 "$Lt_44_12802:\n"
4835 " .loc 3 195 0\n"
4836 " mov.f32 %f4, %f1;\n"
4837 " mov.f32 %f5, %f4;\n"
4838 " .loc 3 71 0\n"
4839 " mov.u64 %rd5, __smem;\n"
4840 " cvt.u64.u32 %rd6, %r3;\n"
4841 " mul.wide.u32 %rd7, %r3, 4;\n"
4842 " add.u64 %rd8, %rd5, %rd7;\n"
4843 " st.volatile.shared.f32 [%rd8+0], %f4;\n"
4844 " .loc 3 72 0\n"
4845 " bar.sync 0;\n"
4846 " mov.u32 %r11, 255;\n"
4847 " setp.gt.u32 %p4, %r3, %r11;\n"
4848 " @%p4 bra $Lt_44_14338;\n"
4849 " .loc 3 75 0\n"
4850 " ld.volatile.shared.f32 %f6, [%rd8+1024];\n"
4851 " add.f32 %f5, %f6, %f4;\n"
4852 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4853 "$Lt_44_14338:\n"
4854 " bar.sync 0;\n"
4855 " mov.u32 %r12, 127;\n"
4856 " setp.gt.u32 %p5, %r3, %r12;\n"
4857 " @%p5 bra $Lt_44_14850;\n"
4858 " .loc 3 76 0\n"
4859 " ld.volatile.shared.f32 %f7, [%rd8+512];\n"
4860 " add.f32 %f5, %f7, %f5;\n"
4861 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4862 "$Lt_44_14850:\n"
4863 " bar.sync 0;\n"
4864 " mov.u32 %r13, 63;\n"
4865 " setp.gt.u32 %p6, %r3, %r13;\n"
4866 " @%p6 bra $Lt_44_15362;\n"
4867 " .loc 3 77 0\n"
4868 " ld.volatile.shared.f32 %f8, [%rd8+256];\n"
4869 " add.f32 %f5, %f8, %f5;\n"
4870 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4871 "$Lt_44_15362:\n"
4872 " bar.sync 0;\n"
4873 " mov.u32 %r14, 31;\n"
4874 " setp.gt.u32 %p7, %r3, %r14;\n"
4875 " @%p7 bra $Lt_44_15874;\n"
4876 " .loc 3 83 0\n"
4877 " ld.volatile.shared.f32 %f9, [%rd8+128];\n"
4878 " add.f32 %f10, %f9, %f5;\n"
4879 " st.volatile.shared.f32 [%rd8+0], %f10;\n"
4880 " .loc 3 84 0\n"
4881 " ld.volatile.shared.f32 %f11, [%rd8+64];\n"
4882 " add.f32 %f12, %f11, %f10;\n"
4883 " st.volatile.shared.f32 [%rd8+0], %f12;\n"
4884 " .loc 3 85 0\n"
4885 " ld.volatile.shared.f32 %f13, [%rd8+32];\n"
4886 " add.f32 %f14, %f13, %f12;\n"
4887 " st.volatile.shared.f32 [%rd8+0], %f14;\n"
4888 " .loc 3 86 0\n"
4889 " ld.volatile.shared.f32 %f15, [%rd8+16];\n"
4890 " add.f32 %f16, %f15, %f14;\n"
4891 " st.volatile.shared.f32 [%rd8+0], %f16;\n"
4892 " .loc 3 87 0\n"
4893 " ld.volatile.shared.f32 %f17, [%rd8+8];\n"
4894 " add.f32 %f18, %f17, %f16;\n"
4895 " st.volatile.shared.f32 [%rd8+0], %f18;\n"
4896 " .loc 3 88 0\n"
4897 " ld.volatile.shared.f32 %f19, [%rd8+4];\n"
4898 " add.f32 %f5, %f19, %f18;\n"
4899 " st.volatile.shared.f32 [%rd8+0], %f5;\n"
4900 "$Lt_44_15874:\n"
4901 " .loc 3 195 0\n"
4902 " mov.u32 %r15, 0;\n"
4903 " setp.ne.u32 %p8, %r3, %r15;\n"
4904 " @%p8 bra $Lt_44_16386;\n"
4905 " .loc 3 199 0\n"
4906 " ld.shared.f32 %f20, [__smem+0];\n"
4907 " ld.param.u64 %rd9, [__cudaparm_reduce_uchar_512_false_g_odata];\n"
4908 " cvt.u64.u32 %rd10, %r1;\n"
4909 " mul.wide.u32 %rd11, %r1, 4;\n"
4910 " add.u64 %rd12, %rd9, %rd11;\n"
4911 " st.global.f32 [%rd12+0], %f20;\n"
4912 "$Lt_44_16386:\n"
4913 " .loc 3 599 0\n"
4914 " exit;\n"
4915 "$LDWend_reduce_uchar_512_false:\n"
4916 " } // reduce_uchar_512_false\n"
4917 "\n"
4918 " .entry packed_float_reduce_1_false_false (\n"
4919 " .param .u64 __cudaparm_packed_float_reduce_1_false_false_g_idata,\n"
4920 " .param .u64 __cudaparm_packed_float_reduce_1_false_false_g_odata,\n"
4921 " .param .u32 __cudaparm_packed_float_reduce_1_false_false_n)\n"
4922 " {\n"
4923 " .reg .u16 %rh<7>;\n"
4924 " .reg .u32 %r<14>;\n"
4925 " .reg .u64 %rd<13>;\n"
4926 " .reg .f32 %f<4>;\n"
4927 " .reg .pred %p<5>;\n"
4928 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
4929 " .loc 3 722 0\n"
4930 "$LDWbegin_packed_float_reduce_1_false_false:\n"
4931 " .loc 3 637 0\n"
4932 " cvt.u32.u16 %r1, %ctaid.x;\n"
4933 " mul24.lo.u32 %r2, %r1, 2;\n"
4934 " cvt.u32.u16 %r3, %tid.x;\n"
4935 " add.u32 %r4, %r2, %r3;\n"
4936 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_false_n];\n"
4937 " setp.ge.u32 %p1, %r4, %r5;\n"
4938 " @%p1 bra $Lt_45_18178;\n"
4939 " mul.lo.u32 %r6, %r4, 4;\n"
4940 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_false_n];\n"
4941 " mul.lo.u32 %r7, %r5, 4;\n"
4942 " mov.u16 %rh1, %nctaid.x;\n"
4943 " mul.wide.u16 %r8, %rh1, 8;\n"
4944 " add.u32 %r9, %r6, 4;\n"
4945 " add.u32 %r10, %r7, 4;\n"
4946 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_false_g_idata];\n"
4947 "$Lt_45_17154:\n"
4948 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
4949 " .loc 3 677 0\n"
4950 " cvt.u8.u32 %r11, %r9;\n"
4951 " cvt.u64.u32 %rd2, %r11;\n"
4952 " .loc 3 637 0\n"
4953 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_false_g_idata];\n"
4954 " .loc 3 677 0\n"
4955 " add.u64 %rd3, %rd2, %rd1;\n"
4956 " ld.global.u8 %rh2, [%rd3+0];\n"
4957 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
4958 " .loc 3 678 0\n"
4959 " ld.global.u8 %rh3, [%rd3+1];\n"
4960 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
4961 " .loc 3 679 0\n"
4962 " ld.global.u8 %rh4, [%rd3+2];\n"
4963 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
4964 " .loc 3 680 0\n"
4965 " ld.global.u8 %rh5, [%rd3+3];\n"
4966 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
4967 " add.u32 %r9, %r8, %r9;\n"
4968 " setp.lt.u32 %p2, %r9, %r10;\n"
4969 " @%p2 bra $Lt_45_17154;\n"
4970 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
4971 " bra.uni $Lt_45_16642;\n"
4972 "$Lt_45_18178:\n"
4973 " mov.f32 %f1, 0f00000000; // 0\n"
4974 "$Lt_45_16642:\n"
4975 " .loc 3 692 0\n"
4976 " mov.u64 %rd4, __smem;\n"
4977 " cvt.u64.u32 %rd5, %r3;\n"
4978 " mul.wide.u32 %rd6, %r3, 4;\n"
4979 " add.u64 %rd7, %rd4, %rd6;\n"
4980 " st.shared.f32 [%rd7+0], %f1;\n"
4981 " .loc 3 693 0\n"
4982 " bar.sync 0;\n"
4983 " mov.u32 %r12, 0;\n"
4984 " setp.ne.u32 %p3, %r3, %r12;\n"
4985 " @%p3 bra $Lt_45_17666;\n"
4986 " .loc 3 719 0\n"
4987 " ld.shared.f32 %f2, [__smem+0];\n"
4988 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_false_false_g_odata];\n"
4989 " cvt.u64.u32 %rd9, %r1;\n"
4990 " mul.wide.u32 %rd10, %r1, 4;\n"
4991 " add.u64 %rd11, %rd8, %rd10;\n"
4992 " st.global.f32 [%rd11+0], %f2;\n"
4993 "$Lt_45_17666:\n"
4994 " .loc 3 723 0\n"
4995 " exit;\n"
4996 "$LDWend_packed_float_reduce_1_false_false:\n"
4997 " } // packed_float_reduce_1_false_false\n"
4998 "\n"
4999 " .entry packed_float_reduce_1_false_true (\n"
5000 " .param .u64 __cudaparm_packed_float_reduce_1_false_true_g_idata,\n"
5001 " .param .u64 __cudaparm_packed_float_reduce_1_false_true_g_odata,\n"
5002 " .param .u32 __cudaparm_packed_float_reduce_1_false_true_n)\n"
5003 " {\n"
5004 " .reg .u16 %rh<7>;\n"
5005 " .reg .u32 %r<14>;\n"
5006 " .reg .u64 %rd<13>;\n"
5007 " .reg .f32 %f<4>;\n"
5008 " .reg .pred %p<5>;\n"
5009 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5010 " .loc 3 724 0\n"
5011 "$LDWbegin_packed_float_reduce_1_false_true:\n"
5012 " .loc 3 637 0\n"
5013 " cvt.u32.u16 %r1, %ctaid.x;\n"
5014 " mul24.lo.u32 %r2, %r1, 2;\n"
5015 " cvt.u32.u16 %r3, %tid.x;\n"
5016 " add.u32 %r4, %r2, %r3;\n"
5017 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_true_n];\n"
5018 " setp.ge.u32 %p1, %r4, %r5;\n"
5019 " @%p1 bra $Lt_46_18178;\n"
5020 " mul.lo.u32 %r6, %r4, 4;\n"
5021 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_false_true_n];\n"
5022 " mul.lo.u32 %r7, %r5, 4;\n"
5023 " mov.u16 %rh1, %nctaid.x;\n"
5024 " mul.wide.u16 %r8, %rh1, 8;\n"
5025 " add.u32 %r9, %r6, 4;\n"
5026 " add.u32 %r10, %r7, 4;\n"
5027 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_true_g_idata];\n"
5028 "$Lt_46_17154:\n"
5029 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5030 " .loc 3 677 0\n"
5031 " cvt.u8.u32 %r11, %r9;\n"
5032 " cvt.u64.u32 %rd2, %r11;\n"
5033 " .loc 3 637 0\n"
5034 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_false_true_g_idata];\n"
5035 " .loc 3 677 0\n"
5036 " add.u64 %rd3, %rd2, %rd1;\n"
5037 " ld.global.u8 %rh2, [%rd3+0];\n"
5038 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5039 " .loc 3 678 0\n"
5040 " ld.global.u8 %rh3, [%rd3+1];\n"
5041 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5042 " .loc 3 679 0\n"
5043 " ld.global.u8 %rh4, [%rd3+2];\n"
5044 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5045 " .loc 3 680 0\n"
5046 " ld.global.u8 %rh5, [%rd3+3];\n"
5047 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5048 " add.u32 %r9, %r8, %r9;\n"
5049 " setp.lt.u32 %p2, %r9, %r10;\n"
5050 " @%p2 bra $Lt_46_17154;\n"
5051 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5052 " bra.uni $Lt_46_16642;\n"
5053 "$Lt_46_18178:\n"
5054 " mov.f32 %f1, 0f00000000; // 0\n"
5055 "$Lt_46_16642:\n"
5056 " .loc 3 692 0\n"
5057 " mov.u64 %rd4, __smem;\n"
5058 " cvt.u64.u32 %rd5, %r3;\n"
5059 " mul.wide.u32 %rd6, %r3, 4;\n"
5060 " add.u64 %rd7, %rd4, %rd6;\n"
5061 " st.shared.f32 [%rd7+0], %f1;\n"
5062 " .loc 3 693 0\n"
5063 " bar.sync 0;\n"
5064 " mov.u32 %r12, 0;\n"
5065 " setp.ne.u32 %p3, %r3, %r12;\n"
5066 " @%p3 bra $Lt_46_17666;\n"
5067 " .loc 3 719 0\n"
5068 " ld.shared.f32 %f2, [__smem+0];\n"
5069 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_false_true_g_odata];\n"
5070 " cvt.u64.u32 %rd9, %r1;\n"
5071 " mul.wide.u32 %rd10, %r1, 4;\n"
5072 " add.u64 %rd11, %rd8, %rd10;\n"
5073 " st.global.f32 [%rd11+0], %f2;\n"
5074 "$Lt_46_17666:\n"
5075 " .loc 3 725 0\n"
5076 " exit;\n"
5077 "$LDWend_packed_float_reduce_1_false_true:\n"
5078 " } // packed_float_reduce_1_false_true\n"
5079 "\n"
5080 " .entry packed_float_reduce_1_true_false (\n"
5081 " .param .u64 __cudaparm_packed_float_reduce_1_true_false_g_idata,\n"
5082 " .param .u64 __cudaparm_packed_float_reduce_1_true_false_g_odata,\n"
5083 " .param .u32 __cudaparm_packed_float_reduce_1_true_false_n)\n"
5084 " {\n"
5085 " .reg .u16 %rh<7>;\n"
5086 " .reg .u32 %r<14>;\n"
5087 " .reg .u64 %rd<13>;\n"
5088 " .reg .f32 %f<4>;\n"
5089 " .reg .pred %p<5>;\n"
5090 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5091 " .loc 3 726 0\n"
5092 "$LDWbegin_packed_float_reduce_1_true_false:\n"
5093 " .loc 3 637 0\n"
5094 " cvt.u32.u16 %r1, %ctaid.x;\n"
5095 " mul24.lo.u32 %r2, %r1, 2;\n"
5096 " cvt.u32.u16 %r3, %tid.x;\n"
5097 " add.u32 %r4, %r2, %r3;\n"
5098 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_false_n];\n"
5099 " setp.ge.u32 %p1, %r4, %r5;\n"
5100 " @%p1 bra $Lt_47_18178;\n"
5101 " mul.lo.u32 %r6, %r4, 4;\n"
5102 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_false_n];\n"
5103 " mul.lo.u32 %r7, %r5, 4;\n"
5104 " mov.u16 %rh1, %nctaid.x;\n"
5105 " mul.wide.u16 %r8, %rh1, 8;\n"
5106 " add.u32 %r9, %r6, 4;\n"
5107 " add.u32 %r10, %r7, 4;\n"
5108 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_false_g_idata];\n"
5109 "$Lt_47_17154:\n"
5110 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5111 " .loc 3 677 0\n"
5112 " cvt.u8.u32 %r11, %r9;\n"
5113 " cvt.u64.u32 %rd2, %r11;\n"
5114 " .loc 3 637 0\n"
5115 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_false_g_idata];\n"
5116 " .loc 3 677 0\n"
5117 " add.u64 %rd3, %rd2, %rd1;\n"
5118 " ld.global.u8 %rh2, [%rd3+0];\n"
5119 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5120 " .loc 3 678 0\n"
5121 " ld.global.u8 %rh3, [%rd3+1];\n"
5122 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5123 " .loc 3 679 0\n"
5124 " ld.global.u8 %rh4, [%rd3+2];\n"
5125 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5126 " .loc 3 680 0\n"
5127 " ld.global.u8 %rh5, [%rd3+3];\n"
5128 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5129 " add.u32 %r9, %r8, %r9;\n"
5130 " setp.lt.u32 %p2, %r9, %r10;\n"
5131 " @%p2 bra $Lt_47_17154;\n"
5132 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5133 " bra.uni $Lt_47_16642;\n"
5134 "$Lt_47_18178:\n"
5135 " mov.f32 %f1, 0f00000000; // 0\n"
5136 "$Lt_47_16642:\n"
5137 " .loc 3 692 0\n"
5138 " mov.u64 %rd4, __smem;\n"
5139 " cvt.u64.u32 %rd5, %r3;\n"
5140 " mul.wide.u32 %rd6, %r3, 4;\n"
5141 " add.u64 %rd7, %rd4, %rd6;\n"
5142 " st.shared.f32 [%rd7+0], %f1;\n"
5143 " .loc 3 693 0\n"
5144 " bar.sync 0;\n"
5145 " mov.u32 %r12, 0;\n"
5146 " setp.ne.u32 %p3, %r3, %r12;\n"
5147 " @%p3 bra $Lt_47_17666;\n"
5148 " .loc 3 719 0\n"
5149 " ld.shared.f32 %f2, [__smem+0];\n"
5150 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_true_false_g_odata];\n"
5151 " cvt.u64.u32 %rd9, %r1;\n"
5152 " mul.wide.u32 %rd10, %r1, 4;\n"
5153 " add.u64 %rd11, %rd8, %rd10;\n"
5154 " st.global.f32 [%rd11+0], %f2;\n"
5155 "$Lt_47_17666:\n"
5156 " .loc 3 727 0\n"
5157 " exit;\n"
5158 "$LDWend_packed_float_reduce_1_true_false:\n"
5159 " } // packed_float_reduce_1_true_false\n"
5160 "\n"
5161 " .entry packed_float_reduce_1_true_true (\n"
5162 " .param .u64 __cudaparm_packed_float_reduce_1_true_true_g_idata,\n"
5163 " .param .u64 __cudaparm_packed_float_reduce_1_true_true_g_odata,\n"
5164 " .param .u32 __cudaparm_packed_float_reduce_1_true_true_n)\n"
5165 " {\n"
5166 " .reg .u16 %rh<7>;\n"
5167 " .reg .u32 %r<14>;\n"
5168 " .reg .u64 %rd<13>;\n"
5169 " .reg .f32 %f<4>;\n"
5170 " .reg .pred %p<5>;\n"
5171 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5172 " .loc 3 728 0\n"
5173 "$LDWbegin_packed_float_reduce_1_true_true:\n"
5174 " .loc 3 637 0\n"
5175 " cvt.u32.u16 %r1, %ctaid.x;\n"
5176 " mul24.lo.u32 %r2, %r1, 2;\n"
5177 " cvt.u32.u16 %r3, %tid.x;\n"
5178 " add.u32 %r4, %r2, %r3;\n"
5179 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_true_n];\n"
5180 " setp.ge.u32 %p1, %r4, %r5;\n"
5181 " @%p1 bra $Lt_48_18178;\n"
5182 " mul.lo.u32 %r6, %r4, 4;\n"
5183 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_1_true_true_n];\n"
5184 " mul.lo.u32 %r7, %r5, 4;\n"
5185 " mov.u16 %rh1, %nctaid.x;\n"
5186 " mul.wide.u16 %r8, %rh1, 8;\n"
5187 " add.u32 %r9, %r6, 4;\n"
5188 " add.u32 %r10, %r7, 4;\n"
5189 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_true_g_idata];\n"
5190 "$Lt_48_17154:\n"
5191 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5192 " .loc 3 677 0\n"
5193 " cvt.u8.u32 %r11, %r9;\n"
5194 " cvt.u64.u32 %rd2, %r11;\n"
5195 " .loc 3 637 0\n"
5196 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_1_true_true_g_idata];\n"
5197 " .loc 3 677 0\n"
5198 " add.u64 %rd3, %rd2, %rd1;\n"
5199 " ld.global.u8 %rh2, [%rd3+0];\n"
5200 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5201 " .loc 3 678 0\n"
5202 " ld.global.u8 %rh3, [%rd3+1];\n"
5203 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5204 " .loc 3 679 0\n"
5205 " ld.global.u8 %rh4, [%rd3+2];\n"
5206 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5207 " .loc 3 680 0\n"
5208 " ld.global.u8 %rh5, [%rd3+3];\n"
5209 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5210 " add.u32 %r9, %r8, %r9;\n"
5211 " setp.lt.u32 %p2, %r9, %r10;\n"
5212 " @%p2 bra $Lt_48_17154;\n"
5213 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5214 " bra.uni $Lt_48_16642;\n"
5215 "$Lt_48_18178:\n"
5216 " mov.f32 %f1, 0f00000000; // 0\n"
5217 "$Lt_48_16642:\n"
5218 " .loc 3 692 0\n"
5219 " mov.u64 %rd4, __smem;\n"
5220 " cvt.u64.u32 %rd5, %r3;\n"
5221 " mul.wide.u32 %rd6, %r3, 4;\n"
5222 " add.u64 %rd7, %rd4, %rd6;\n"
5223 " st.shared.f32 [%rd7+0], %f1;\n"
5224 " .loc 3 693 0\n"
5225 " bar.sync 0;\n"
5226 " mov.u32 %r12, 0;\n"
5227 " setp.ne.u32 %p3, %r3, %r12;\n"
5228 " @%p3 bra $Lt_48_17666;\n"
5229 " .loc 3 719 0\n"
5230 " ld.shared.f32 %f2, [__smem+0];\n"
5231 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_1_true_true_g_odata];\n"
5232 " cvt.u64.u32 %rd9, %r1;\n"
5233 " mul.wide.u32 %rd10, %r1, 4;\n"
5234 " add.u64 %rd11, %rd8, %rd10;\n"
5235 " st.global.f32 [%rd11+0], %f2;\n"
5236 "$Lt_48_17666:\n"
5237 " .loc 3 729 0\n"
5238 " exit;\n"
5239 "$LDWend_packed_float_reduce_1_true_true:\n"
5240 " } // packed_float_reduce_1_true_true\n"
5241 "\n"
5242 " .entry packed_float_reduce_2_false_false (\n"
5243 " .param .u64 __cudaparm_packed_float_reduce_2_false_false_g_idata,\n"
5244 " .param .u64 __cudaparm_packed_float_reduce_2_false_false_g_odata,\n"
5245 " .param .u32 __cudaparm_packed_float_reduce_2_false_false_n)\n"
5246 " {\n"
5247 " .reg .u16 %rh<7>;\n"
5248 " .reg .u32 %r<15>;\n"
5249 " .reg .u64 %rd<13>;\n"
5250 " .reg .f32 %f<5>;\n"
5251 " .reg .pred %p<6>;\n"
5252 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5253 " .loc 3 731 0\n"
5254 "$LDWbegin_packed_float_reduce_2_false_false:\n"
5255 " .loc 3 637 0\n"
5256 " cvt.u32.u16 %r1, %ctaid.x;\n"
5257 " mul24.lo.u32 %r2, %r1, 4;\n"
5258 " cvt.u32.u16 %r3, %tid.x;\n"
5259 " add.u32 %r4, %r2, %r3;\n"
5260 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_false_n];\n"
5261 " setp.ge.u32 %p1, %r4, %r5;\n"
5262 " @%p1 bra $Lt_49_18434;\n"
5263 " mul.lo.u32 %r6, %r4, 4;\n"
5264 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_false_n];\n"
5265 " mul.lo.u32 %r7, %r5, 4;\n"
5266 " mov.u16 %rh1, %nctaid.x;\n"
5267 " mul.wide.u16 %r8, %rh1, 16;\n"
5268 " add.u32 %r9, %r6, 8;\n"
5269 " add.u32 %r10, %r7, 8;\n"
5270 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_false_g_idata];\n"
5271 "$Lt_49_16898:\n"
5272 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5273 " .loc 3 677 0\n"
5274 " cvt.u8.u32 %r11, %r9;\n"
5275 " cvt.u64.u32 %rd2, %r11;\n"
5276 " .loc 3 637 0\n"
5277 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_false_g_idata];\n"
5278 " .loc 3 677 0\n"
5279 " add.u64 %rd3, %rd2, %rd1;\n"
5280 " ld.global.u8 %rh2, [%rd3+0];\n"
5281 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5282 " .loc 3 678 0\n"
5283 " ld.global.u8 %rh3, [%rd3+1];\n"
5284 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5285 " .loc 3 679 0\n"
5286 " ld.global.u8 %rh4, [%rd3+2];\n"
5287 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5288 " .loc 3 680 0\n"
5289 " ld.global.u8 %rh5, [%rd3+3];\n"
5290 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5291 " add.u32 %r9, %r8, %r9;\n"
5292 " setp.lt.u32 %p2, %r9, %r10;\n"
5293 " @%p2 bra $Lt_49_16898;\n"
5294 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5295 " bra.uni $Lt_49_16386;\n"
5296 "$Lt_49_18434:\n"
5297 " mov.f32 %f1, 0f00000000; // 0\n"
5298 "$Lt_49_16386:\n"
5299 " .loc 3 692 0\n"
5300 " mov.u64 %rd4, __smem;\n"
5301 " cvt.u64.u32 %rd5, %r3;\n"
5302 " mul.wide.u32 %rd6, %r3, 4;\n"
5303 " add.u64 %rd7, %rd4, %rd6;\n"
5304 " st.shared.f32 [%rd7+0], %f1;\n"
5305 " .loc 3 693 0\n"
5306 " bar.sync 0;\n"
5307 " mov.u32 %r12, 31;\n"
5308 " setp.gt.u32 %p3, %r3, %r12;\n"
5309 " @%p3 bra $Lt_49_17410;\n"
5310 " .loc 3 714 0\n"
5311 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5312 " add.f32 %f1, %f2, %f1;\n"
5313 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5314 "$Lt_49_17410:\n"
5315 " mov.u32 %r13, 0;\n"
5316 " setp.ne.u32 %p4, %r3, %r13;\n"
5317 " @%p4 bra $Lt_49_17922;\n"
5318 " .loc 3 719 0\n"
5319 " ld.shared.f32 %f3, [__smem+0];\n"
5320 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_false_false_g_odata];\n"
5321 " cvt.u64.u32 %rd9, %r1;\n"
5322 " mul.wide.u32 %rd10, %r1, 4;\n"
5323 " add.u64 %rd11, %rd8, %rd10;\n"
5324 " st.global.f32 [%rd11+0], %f3;\n"
5325 "$Lt_49_17922:\n"
5326 " .loc 3 732 0\n"
5327 " exit;\n"
5328 "$LDWend_packed_float_reduce_2_false_false:\n"
5329 " } // packed_float_reduce_2_false_false\n"
5330 "\n"
5331 " .entry packed_float_reduce_2_false_true (\n"
5332 " .param .u64 __cudaparm_packed_float_reduce_2_false_true_g_idata,\n"
5333 " .param .u64 __cudaparm_packed_float_reduce_2_false_true_g_odata,\n"
5334 " .param .u32 __cudaparm_packed_float_reduce_2_false_true_n)\n"
5335 " {\n"
5336 " .reg .u16 %rh<7>;\n"
5337 " .reg .u32 %r<15>;\n"
5338 " .reg .u64 %rd<13>;\n"
5339 " .reg .f32 %f<5>;\n"
5340 " .reg .pred %p<6>;\n"
5341 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5342 " .loc 3 733 0\n"
5343 "$LDWbegin_packed_float_reduce_2_false_true:\n"
5344 " .loc 3 637 0\n"
5345 " cvt.u32.u16 %r1, %ctaid.x;\n"
5346 " mul24.lo.u32 %r2, %r1, 4;\n"
5347 " cvt.u32.u16 %r3, %tid.x;\n"
5348 " add.u32 %r4, %r2, %r3;\n"
5349 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_true_n];\n"
5350 " setp.ge.u32 %p1, %r4, %r5;\n"
5351 " @%p1 bra $Lt_50_18434;\n"
5352 " mul.lo.u32 %r6, %r4, 4;\n"
5353 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_false_true_n];\n"
5354 " mul.lo.u32 %r7, %r5, 4;\n"
5355 " mov.u16 %rh1, %nctaid.x;\n"
5356 " mul.wide.u16 %r8, %rh1, 16;\n"
5357 " add.u32 %r9, %r6, 8;\n"
5358 " add.u32 %r10, %r7, 8;\n"
5359 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_true_g_idata];\n"
5360 "$Lt_50_16898:\n"
5361 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5362 " .loc 3 677 0\n"
5363 " cvt.u8.u32 %r11, %r9;\n"
5364 " cvt.u64.u32 %rd2, %r11;\n"
5365 " .loc 3 637 0\n"
5366 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_false_true_g_idata];\n"
5367 " .loc 3 677 0\n"
5368 " add.u64 %rd3, %rd2, %rd1;\n"
5369 " ld.global.u8 %rh2, [%rd3+0];\n"
5370 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5371 " .loc 3 678 0\n"
5372 " ld.global.u8 %rh3, [%rd3+1];\n"
5373 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5374 " .loc 3 679 0\n"
5375 " ld.global.u8 %rh4, [%rd3+2];\n"
5376 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5377 " .loc 3 680 0\n"
5378 " ld.global.u8 %rh5, [%rd3+3];\n"
5379 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5380 " add.u32 %r9, %r8, %r9;\n"
5381 " setp.lt.u32 %p2, %r9, %r10;\n"
5382 " @%p2 bra $Lt_50_16898;\n"
5383 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5384 " bra.uni $Lt_50_16386;\n"
5385 "$Lt_50_18434:\n"
5386 " mov.f32 %f1, 0f00000000; // 0\n"
5387 "$Lt_50_16386:\n"
5388 " .loc 3 692 0\n"
5389 " mov.u64 %rd4, __smem;\n"
5390 " cvt.u64.u32 %rd5, %r3;\n"
5391 " mul.wide.u32 %rd6, %r3, 4;\n"
5392 " add.u64 %rd7, %rd4, %rd6;\n"
5393 " st.shared.f32 [%rd7+0], %f1;\n"
5394 " .loc 3 693 0\n"
5395 " bar.sync 0;\n"
5396 " mov.u32 %r12, 31;\n"
5397 " setp.gt.u32 %p3, %r3, %r12;\n"
5398 " @%p3 bra $Lt_50_17410;\n"
5399 " .loc 3 714 0\n"
5400 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5401 " add.f32 %f1, %f2, %f1;\n"
5402 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5403 "$Lt_50_17410:\n"
5404 " mov.u32 %r13, 0;\n"
5405 " setp.ne.u32 %p4, %r3, %r13;\n"
5406 " @%p4 bra $Lt_50_17922;\n"
5407 " .loc 3 719 0\n"
5408 " ld.shared.f32 %f3, [__smem+0];\n"
5409 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_false_true_g_odata];\n"
5410 " cvt.u64.u32 %rd9, %r1;\n"
5411 " mul.wide.u32 %rd10, %r1, 4;\n"
5412 " add.u64 %rd11, %rd8, %rd10;\n"
5413 " st.global.f32 [%rd11+0], %f3;\n"
5414 "$Lt_50_17922:\n"
5415 " .loc 3 734 0\n"
5416 " exit;\n"
5417 "$LDWend_packed_float_reduce_2_false_true:\n"
5418 " } // packed_float_reduce_2_false_true\n"
5419 "\n"
5420 " .entry packed_float_reduce_2_true_false (\n"
5421 " .param .u64 __cudaparm_packed_float_reduce_2_true_false_g_idata,\n"
5422 " .param .u64 __cudaparm_packed_float_reduce_2_true_false_g_odata,\n"
5423 " .param .u32 __cudaparm_packed_float_reduce_2_true_false_n)\n"
5424 " {\n"
5425 " .reg .u16 %rh<7>;\n"
5426 " .reg .u32 %r<15>;\n"
5427 " .reg .u64 %rd<13>;\n"
5428 " .reg .f32 %f<5>;\n"
5429 " .reg .pred %p<6>;\n"
5430 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5431 " .loc 3 735 0\n"
5432 "$LDWbegin_packed_float_reduce_2_true_false:\n"
5433 " .loc 3 637 0\n"
5434 " cvt.u32.u16 %r1, %ctaid.x;\n"
5435 " mul24.lo.u32 %r2, %r1, 4;\n"
5436 " cvt.u32.u16 %r3, %tid.x;\n"
5437 " add.u32 %r4, %r2, %r3;\n"
5438 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_false_n];\n"
5439 " setp.ge.u32 %p1, %r4, %r5;\n"
5440 " @%p1 bra $Lt_51_18434;\n"
5441 " mul.lo.u32 %r6, %r4, 4;\n"
5442 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_false_n];\n"
5443 " mul.lo.u32 %r7, %r5, 4;\n"
5444 " mov.u16 %rh1, %nctaid.x;\n"
5445 " mul.wide.u16 %r8, %rh1, 16;\n"
5446 " add.u32 %r9, %r6, 8;\n"
5447 " add.u32 %r10, %r7, 8;\n"
5448 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_false_g_idata];\n"
5449 "$Lt_51_16898:\n"
5450 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5451 " .loc 3 677 0\n"
5452 " cvt.u8.u32 %r11, %r9;\n"
5453 " cvt.u64.u32 %rd2, %r11;\n"
5454 " .loc 3 637 0\n"
5455 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_false_g_idata];\n"
5456 " .loc 3 677 0\n"
5457 " add.u64 %rd3, %rd2, %rd1;\n"
5458 " ld.global.u8 %rh2, [%rd3+0];\n"
5459 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5460 " .loc 3 678 0\n"
5461 " ld.global.u8 %rh3, [%rd3+1];\n"
5462 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5463 " .loc 3 679 0\n"
5464 " ld.global.u8 %rh4, [%rd3+2];\n"
5465 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5466 " .loc 3 680 0\n"
5467 " ld.global.u8 %rh5, [%rd3+3];\n"
5468 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5469 " add.u32 %r9, %r8, %r9;\n"
5470 " setp.lt.u32 %p2, %r9, %r10;\n"
5471 " @%p2 bra $Lt_51_16898;\n"
5472 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5473 " bra.uni $Lt_51_16386;\n"
5474 "$Lt_51_18434:\n"
5475 " mov.f32 %f1, 0f00000000; // 0\n"
5476 "$Lt_51_16386:\n"
5477 " .loc 3 692 0\n"
5478 " mov.u64 %rd4, __smem;\n"
5479 " cvt.u64.u32 %rd5, %r3;\n"
5480 " mul.wide.u32 %rd6, %r3, 4;\n"
5481 " add.u64 %rd7, %rd4, %rd6;\n"
5482 " st.shared.f32 [%rd7+0], %f1;\n"
5483 " .loc 3 693 0\n"
5484 " bar.sync 0;\n"
5485 " mov.u32 %r12, 31;\n"
5486 " setp.gt.u32 %p3, %r3, %r12;\n"
5487 " @%p3 bra $Lt_51_17410;\n"
5488 " .loc 3 714 0\n"
5489 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5490 " add.f32 %f1, %f2, %f1;\n"
5491 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5492 "$Lt_51_17410:\n"
5493 " mov.u32 %r13, 0;\n"
5494 " setp.ne.u32 %p4, %r3, %r13;\n"
5495 " @%p4 bra $Lt_51_17922;\n"
5496 " .loc 3 719 0\n"
5497 " ld.shared.f32 %f3, [__smem+0];\n"
5498 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_true_false_g_odata];\n"
5499 " cvt.u64.u32 %rd9, %r1;\n"
5500 " mul.wide.u32 %rd10, %r1, 4;\n"
5501 " add.u64 %rd11, %rd8, %rd10;\n"
5502 " st.global.f32 [%rd11+0], %f3;\n"
5503 "$Lt_51_17922:\n"
5504 " .loc 3 736 0\n"
5505 " exit;\n"
5506 "$LDWend_packed_float_reduce_2_true_false:\n"
5507 " } // packed_float_reduce_2_true_false\n"
5508 "\n"
5509 " .entry packed_float_reduce_2_true_true (\n"
5510 " .param .u64 __cudaparm_packed_float_reduce_2_true_true_g_idata,\n"
5511 " .param .u64 __cudaparm_packed_float_reduce_2_true_true_g_odata,\n"
5512 " .param .u32 __cudaparm_packed_float_reduce_2_true_true_n)\n"
5513 " {\n"
5514 " .reg .u16 %rh<7>;\n"
5515 " .reg .u32 %r<15>;\n"
5516 " .reg .u64 %rd<13>;\n"
5517 " .reg .f32 %f<5>;\n"
5518 " .reg .pred %p<6>;\n"
5519 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5520 " .loc 3 737 0\n"
5521 "$LDWbegin_packed_float_reduce_2_true_true:\n"
5522 " .loc 3 637 0\n"
5523 " cvt.u32.u16 %r1, %ctaid.x;\n"
5524 " mul24.lo.u32 %r2, %r1, 4;\n"
5525 " cvt.u32.u16 %r3, %tid.x;\n"
5526 " add.u32 %r4, %r2, %r3;\n"
5527 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_true_n];\n"
5528 " setp.ge.u32 %p1, %r4, %r5;\n"
5529 " @%p1 bra $Lt_52_18434;\n"
5530 " mul.lo.u32 %r6, %r4, 4;\n"
5531 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_2_true_true_n];\n"
5532 " mul.lo.u32 %r7, %r5, 4;\n"
5533 " mov.u16 %rh1, %nctaid.x;\n"
5534 " mul.wide.u16 %r8, %rh1, 16;\n"
5535 " add.u32 %r9, %r6, 8;\n"
5536 " add.u32 %r10, %r7, 8;\n"
5537 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_true_g_idata];\n"
5538 "$Lt_52_16898:\n"
5539 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5540 " .loc 3 677 0\n"
5541 " cvt.u8.u32 %r11, %r9;\n"
5542 " cvt.u64.u32 %rd2, %r11;\n"
5543 " .loc 3 637 0\n"
5544 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_2_true_true_g_idata];\n"
5545 " .loc 3 677 0\n"
5546 " add.u64 %rd3, %rd2, %rd1;\n"
5547 " ld.global.u8 %rh2, [%rd3+0];\n"
5548 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5549 " .loc 3 678 0\n"
5550 " ld.global.u8 %rh3, [%rd3+1];\n"
5551 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5552 " .loc 3 679 0\n"
5553 " ld.global.u8 %rh4, [%rd3+2];\n"
5554 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5555 " .loc 3 680 0\n"
5556 " ld.global.u8 %rh5, [%rd3+3];\n"
5557 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5558 " add.u32 %r9, %r8, %r9;\n"
5559 " setp.lt.u32 %p2, %r9, %r10;\n"
5560 " @%p2 bra $Lt_52_16898;\n"
5561 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5562 " bra.uni $Lt_52_16386;\n"
5563 "$Lt_52_18434:\n"
5564 " mov.f32 %f1, 0f00000000; // 0\n"
5565 "$Lt_52_16386:\n"
5566 " .loc 3 692 0\n"
5567 " mov.u64 %rd4, __smem;\n"
5568 " cvt.u64.u32 %rd5, %r3;\n"
5569 " mul.wide.u32 %rd6, %r3, 4;\n"
5570 " add.u64 %rd7, %rd4, %rd6;\n"
5571 " st.shared.f32 [%rd7+0], %f1;\n"
5572 " .loc 3 693 0\n"
5573 " bar.sync 0;\n"
5574 " mov.u32 %r12, 31;\n"
5575 " setp.gt.u32 %p3, %r3, %r12;\n"
5576 " @%p3 bra $Lt_52_17410;\n"
5577 " .loc 3 714 0\n"
5578 " ld.volatile.shared.f32 %f2, [%rd7+4];\n"
5579 " add.f32 %f1, %f2, %f1;\n"
5580 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5581 "$Lt_52_17410:\n"
5582 " mov.u32 %r13, 0;\n"
5583 " setp.ne.u32 %p4, %r3, %r13;\n"
5584 " @%p4 bra $Lt_52_17922;\n"
5585 " .loc 3 719 0\n"
5586 " ld.shared.f32 %f3, [__smem+0];\n"
5587 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_2_true_true_g_odata];\n"
5588 " cvt.u64.u32 %rd9, %r1;\n"
5589 " mul.wide.u32 %rd10, %r1, 4;\n"
5590 " add.u64 %rd11, %rd8, %rd10;\n"
5591 " st.global.f32 [%rd11+0], %f3;\n"
5592 "$Lt_52_17922:\n"
5593 " .loc 3 738 0\n"
5594 " exit;\n"
5595 "$LDWend_packed_float_reduce_2_true_true:\n"
5596 " } // packed_float_reduce_2_true_true\n"
5597 "\n"
5598 " .entry packed_float_reduce_4_false_false (\n"
5599 " .param .u64 __cudaparm_packed_float_reduce_4_false_false_g_idata,\n"
5600 " .param .u64 __cudaparm_packed_float_reduce_4_false_false_g_odata,\n"
5601 " .param .u32 __cudaparm_packed_float_reduce_4_false_false_n)\n"
5602 " {\n"
5603 " .reg .u16 %rh<7>;\n"
5604 " .reg .u32 %r<15>;\n"
5605 " .reg .u64 %rd<13>;\n"
5606 " .reg .f32 %f<7>;\n"
5607 " .reg .pred %p<6>;\n"
5608 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5609 " .loc 3 740 0\n"
5610 "$LDWbegin_packed_float_reduce_4_false_false:\n"
5611 " .loc 3 637 0\n"
5612 " cvt.u32.u16 %r1, %ctaid.x;\n"
5613 " mul24.lo.u32 %r2, %r1, 8;\n"
5614 " cvt.u32.u16 %r3, %tid.x;\n"
5615 " add.u32 %r4, %r2, %r3;\n"
5616 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_false_n];\n"
5617 " setp.ge.u32 %p1, %r4, %r5;\n"
5618 " @%p1 bra $Lt_53_18178;\n"
5619 " mul.lo.u32 %r6, %r4, 4;\n"
5620 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_false_n];\n"
5621 " mul.lo.u32 %r7, %r5, 4;\n"
5622 " mov.u16 %rh1, %nctaid.x;\n"
5623 " mul.wide.u16 %r8, %rh1, 32;\n"
5624 " add.u32 %r9, %r6, 16;\n"
5625 " add.u32 %r10, %r7, 16;\n"
5626 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_false_g_idata];\n"
5627 "$Lt_53_16642:\n"
5628 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5629 " .loc 3 677 0\n"
5630 " cvt.u8.u32 %r11, %r9;\n"
5631 " cvt.u64.u32 %rd2, %r11;\n"
5632 " .loc 3 637 0\n"
5633 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_false_g_idata];\n"
5634 " .loc 3 677 0\n"
5635 " add.u64 %rd3, %rd2, %rd1;\n"
5636 " ld.global.u8 %rh2, [%rd3+0];\n"
5637 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5638 " .loc 3 678 0\n"
5639 " ld.global.u8 %rh3, [%rd3+1];\n"
5640 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5641 " .loc 3 679 0\n"
5642 " ld.global.u8 %rh4, [%rd3+2];\n"
5643 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5644 " .loc 3 680 0\n"
5645 " ld.global.u8 %rh5, [%rd3+3];\n"
5646 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5647 " add.u32 %r9, %r8, %r9;\n"
5648 " setp.lt.u32 %p2, %r9, %r10;\n"
5649 " @%p2 bra $Lt_53_16642;\n"
5650 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5651 " bra.uni $Lt_53_16130;\n"
5652 "$Lt_53_18178:\n"
5653 " mov.f32 %f1, 0f00000000; // 0\n"
5654 "$Lt_53_16130:\n"
5655 " .loc 3 692 0\n"
5656 " mov.u64 %rd4, __smem;\n"
5657 " cvt.u64.u32 %rd5, %r3;\n"
5658 " mul.wide.u32 %rd6, %r3, 4;\n"
5659 " add.u64 %rd7, %rd4, %rd6;\n"
5660 " st.shared.f32 [%rd7+0], %f1;\n"
5661 " .loc 3 693 0\n"
5662 " bar.sync 0;\n"
5663 " mov.u32 %r12, 31;\n"
5664 " setp.gt.u32 %p3, %r3, %r12;\n"
5665 " @%p3 bra $Lt_53_17154;\n"
5666 " .loc 3 713 0\n"
5667 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5668 " add.f32 %f3, %f2, %f1;\n"
5669 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5670 " .loc 3 714 0\n"
5671 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5672 " add.f32 %f1, %f4, %f3;\n"
5673 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5674 "$Lt_53_17154:\n"
5675 " mov.u32 %r13, 0;\n"
5676 " setp.ne.u32 %p4, %r3, %r13;\n"
5677 " @%p4 bra $Lt_53_17666;\n"
5678 " .loc 3 719 0\n"
5679 " ld.shared.f32 %f5, [__smem+0];\n"
5680 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_false_false_g_odata];\n"
5681 " cvt.u64.u32 %rd9, %r1;\n"
5682 " mul.wide.u32 %rd10, %r1, 4;\n"
5683 " add.u64 %rd11, %rd8, %rd10;\n"
5684 " st.global.f32 [%rd11+0], %f5;\n"
5685 "$Lt_53_17666:\n"
5686 " .loc 3 741 0\n"
5687 " exit;\n"
5688 "$LDWend_packed_float_reduce_4_false_false:\n"
5689 " } // packed_float_reduce_4_false_false\n"
5690 "\n"
5691 " .entry packed_float_reduce_4_false_true (\n"
5692 " .param .u64 __cudaparm_packed_float_reduce_4_false_true_g_idata,\n"
5693 " .param .u64 __cudaparm_packed_float_reduce_4_false_true_g_odata,\n"
5694 " .param .u32 __cudaparm_packed_float_reduce_4_false_true_n)\n"
5695 " {\n"
5696 " .reg .u16 %rh<7>;\n"
5697 " .reg .u32 %r<15>;\n"
5698 " .reg .u64 %rd<13>;\n"
5699 " .reg .f32 %f<7>;\n"
5700 " .reg .pred %p<6>;\n"
5701 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5702 " .loc 3 742 0\n"
5703 "$LDWbegin_packed_float_reduce_4_false_true:\n"
5704 " .loc 3 637 0\n"
5705 " cvt.u32.u16 %r1, %ctaid.x;\n"
5706 " mul24.lo.u32 %r2, %r1, 8;\n"
5707 " cvt.u32.u16 %r3, %tid.x;\n"
5708 " add.u32 %r4, %r2, %r3;\n"
5709 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_true_n];\n"
5710 " setp.ge.u32 %p1, %r4, %r5;\n"
5711 " @%p1 bra $Lt_54_18178;\n"
5712 " mul.lo.u32 %r6, %r4, 4;\n"
5713 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_false_true_n];\n"
5714 " mul.lo.u32 %r7, %r5, 4;\n"
5715 " mov.u16 %rh1, %nctaid.x;\n"
5716 " mul.wide.u16 %r8, %rh1, 32;\n"
5717 " add.u32 %r9, %r6, 16;\n"
5718 " add.u32 %r10, %r7, 16;\n"
5719 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_true_g_idata];\n"
5720 "$Lt_54_16642:\n"
5721 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5722 " .loc 3 677 0\n"
5723 " cvt.u8.u32 %r11, %r9;\n"
5724 " cvt.u64.u32 %rd2, %r11;\n"
5725 " .loc 3 637 0\n"
5726 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_false_true_g_idata];\n"
5727 " .loc 3 677 0\n"
5728 " add.u64 %rd3, %rd2, %rd1;\n"
5729 " ld.global.u8 %rh2, [%rd3+0];\n"
5730 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5731 " .loc 3 678 0\n"
5732 " ld.global.u8 %rh3, [%rd3+1];\n"
5733 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5734 " .loc 3 679 0\n"
5735 " ld.global.u8 %rh4, [%rd3+2];\n"
5736 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5737 " .loc 3 680 0\n"
5738 " ld.global.u8 %rh5, [%rd3+3];\n"
5739 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5740 " add.u32 %r9, %r8, %r9;\n"
5741 " setp.lt.u32 %p2, %r9, %r10;\n"
5742 " @%p2 bra $Lt_54_16642;\n"
5743 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5744 " bra.uni $Lt_54_16130;\n"
5745 "$Lt_54_18178:\n"
5746 " mov.f32 %f1, 0f00000000; // 0\n"
5747 "$Lt_54_16130:\n"
5748 " .loc 3 692 0\n"
5749 " mov.u64 %rd4, __smem;\n"
5750 " cvt.u64.u32 %rd5, %r3;\n"
5751 " mul.wide.u32 %rd6, %r3, 4;\n"
5752 " add.u64 %rd7, %rd4, %rd6;\n"
5753 " st.shared.f32 [%rd7+0], %f1;\n"
5754 " .loc 3 693 0\n"
5755 " bar.sync 0;\n"
5756 " mov.u32 %r12, 31;\n"
5757 " setp.gt.u32 %p3, %r3, %r12;\n"
5758 " @%p3 bra $Lt_54_17154;\n"
5759 " .loc 3 713 0\n"
5760 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5761 " add.f32 %f3, %f2, %f1;\n"
5762 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5763 " .loc 3 714 0\n"
5764 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5765 " add.f32 %f1, %f4, %f3;\n"
5766 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5767 "$Lt_54_17154:\n"
5768 " mov.u32 %r13, 0;\n"
5769 " setp.ne.u32 %p4, %r3, %r13;\n"
5770 " @%p4 bra $Lt_54_17666;\n"
5771 " .loc 3 719 0\n"
5772 " ld.shared.f32 %f5, [__smem+0];\n"
5773 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_false_true_g_odata];\n"
5774 " cvt.u64.u32 %rd9, %r1;\n"
5775 " mul.wide.u32 %rd10, %r1, 4;\n"
5776 " add.u64 %rd11, %rd8, %rd10;\n"
5777 " st.global.f32 [%rd11+0], %f5;\n"
5778 "$Lt_54_17666:\n"
5779 " .loc 3 743 0\n"
5780 " exit;\n"
5781 "$LDWend_packed_float_reduce_4_false_true:\n"
5782 " } // packed_float_reduce_4_false_true\n"
5783 "\n"
5784 " .entry packed_float_reduce_4_true_false (\n"
5785 " .param .u64 __cudaparm_packed_float_reduce_4_true_false_g_idata,\n"
5786 " .param .u64 __cudaparm_packed_float_reduce_4_true_false_g_odata,\n"
5787 " .param .u32 __cudaparm_packed_float_reduce_4_true_false_n)\n"
5788 " {\n"
5789 " .reg .u16 %rh<7>;\n"
5790 " .reg .u32 %r<15>;\n"
5791 " .reg .u64 %rd<13>;\n"
5792 " .reg .f32 %f<7>;\n"
5793 " .reg .pred %p<6>;\n"
5794 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5795 " .loc 3 744 0\n"
5796 "$LDWbegin_packed_float_reduce_4_true_false:\n"
5797 " .loc 3 637 0\n"
5798 " cvt.u32.u16 %r1, %ctaid.x;\n"
5799 " mul24.lo.u32 %r2, %r1, 8;\n"
5800 " cvt.u32.u16 %r3, %tid.x;\n"
5801 " add.u32 %r4, %r2, %r3;\n"
5802 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_false_n];\n"
5803 " setp.ge.u32 %p1, %r4, %r5;\n"
5804 " @%p1 bra $Lt_55_18178;\n"
5805 " mul.lo.u32 %r6, %r4, 4;\n"
5806 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_false_n];\n"
5807 " mul.lo.u32 %r7, %r5, 4;\n"
5808 " mov.u16 %rh1, %nctaid.x;\n"
5809 " mul.wide.u16 %r8, %rh1, 32;\n"
5810 " add.u32 %r9, %r6, 16;\n"
5811 " add.u32 %r10, %r7, 16;\n"
5812 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_false_g_idata];\n"
5813 "$Lt_55_16642:\n"
5814 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5815 " .loc 3 677 0\n"
5816 " cvt.u8.u32 %r11, %r9;\n"
5817 " cvt.u64.u32 %rd2, %r11;\n"
5818 " .loc 3 637 0\n"
5819 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_false_g_idata];\n"
5820 " .loc 3 677 0\n"
5821 " add.u64 %rd3, %rd2, %rd1;\n"
5822 " ld.global.u8 %rh2, [%rd3+0];\n"
5823 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5824 " .loc 3 678 0\n"
5825 " ld.global.u8 %rh3, [%rd3+1];\n"
5826 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5827 " .loc 3 679 0\n"
5828 " ld.global.u8 %rh4, [%rd3+2];\n"
5829 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5830 " .loc 3 680 0\n"
5831 " ld.global.u8 %rh5, [%rd3+3];\n"
5832 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5833 " add.u32 %r9, %r8, %r9;\n"
5834 " setp.lt.u32 %p2, %r9, %r10;\n"
5835 " @%p2 bra $Lt_55_16642;\n"
5836 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5837 " bra.uni $Lt_55_16130;\n"
5838 "$Lt_55_18178:\n"
5839 " mov.f32 %f1, 0f00000000; // 0\n"
5840 "$Lt_55_16130:\n"
5841 " .loc 3 692 0\n"
5842 " mov.u64 %rd4, __smem;\n"
5843 " cvt.u64.u32 %rd5, %r3;\n"
5844 " mul.wide.u32 %rd6, %r3, 4;\n"
5845 " add.u64 %rd7, %rd4, %rd6;\n"
5846 " st.shared.f32 [%rd7+0], %f1;\n"
5847 " .loc 3 693 0\n"
5848 " bar.sync 0;\n"
5849 " mov.u32 %r12, 31;\n"
5850 " setp.gt.u32 %p3, %r3, %r12;\n"
5851 " @%p3 bra $Lt_55_17154;\n"
5852 " .loc 3 713 0\n"
5853 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5854 " add.f32 %f3, %f2, %f1;\n"
5855 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5856 " .loc 3 714 0\n"
5857 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5858 " add.f32 %f1, %f4, %f3;\n"
5859 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5860 "$Lt_55_17154:\n"
5861 " mov.u32 %r13, 0;\n"
5862 " setp.ne.u32 %p4, %r3, %r13;\n"
5863 " @%p4 bra $Lt_55_17666;\n"
5864 " .loc 3 719 0\n"
5865 " ld.shared.f32 %f5, [__smem+0];\n"
5866 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_true_false_g_odata];\n"
5867 " cvt.u64.u32 %rd9, %r1;\n"
5868 " mul.wide.u32 %rd10, %r1, 4;\n"
5869 " add.u64 %rd11, %rd8, %rd10;\n"
5870 " st.global.f32 [%rd11+0], %f5;\n"
5871 "$Lt_55_17666:\n"
5872 " .loc 3 745 0\n"
5873 " exit;\n"
5874 "$LDWend_packed_float_reduce_4_true_false:\n"
5875 " } // packed_float_reduce_4_true_false\n"
5876 "\n"
5877 " .entry packed_float_reduce_4_true_true (\n"
5878 " .param .u64 __cudaparm_packed_float_reduce_4_true_true_g_idata,\n"
5879 " .param .u64 __cudaparm_packed_float_reduce_4_true_true_g_odata,\n"
5880 " .param .u32 __cudaparm_packed_float_reduce_4_true_true_n)\n"
5881 " {\n"
5882 " .reg .u16 %rh<7>;\n"
5883 " .reg .u32 %r<15>;\n"
5884 " .reg .u64 %rd<13>;\n"
5885 " .reg .f32 %f<7>;\n"
5886 " .reg .pred %p<6>;\n"
5887 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5888 " .loc 3 746 0\n"
5889 "$LDWbegin_packed_float_reduce_4_true_true:\n"
5890 " .loc 3 637 0\n"
5891 " cvt.u32.u16 %r1, %ctaid.x;\n"
5892 " mul24.lo.u32 %r2, %r1, 8;\n"
5893 " cvt.u32.u16 %r3, %tid.x;\n"
5894 " add.u32 %r4, %r2, %r3;\n"
5895 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_true_n];\n"
5896 " setp.ge.u32 %p1, %r4, %r5;\n"
5897 " @%p1 bra $Lt_56_18178;\n"
5898 " mul.lo.u32 %r6, %r4, 4;\n"
5899 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_4_true_true_n];\n"
5900 " mul.lo.u32 %r7, %r5, 4;\n"
5901 " mov.u16 %rh1, %nctaid.x;\n"
5902 " mul.wide.u16 %r8, %rh1, 32;\n"
5903 " add.u32 %r9, %r6, 16;\n"
5904 " add.u32 %r10, %r7, 16;\n"
5905 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_true_g_idata];\n"
5906 "$Lt_56_16642:\n"
5907 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
5908 " .loc 3 677 0\n"
5909 " cvt.u8.u32 %r11, %r9;\n"
5910 " cvt.u64.u32 %rd2, %r11;\n"
5911 " .loc 3 637 0\n"
5912 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_4_true_true_g_idata];\n"
5913 " .loc 3 677 0\n"
5914 " add.u64 %rd3, %rd2, %rd1;\n"
5915 " ld.global.u8 %rh2, [%rd3+0];\n"
5916 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
5917 " .loc 3 678 0\n"
5918 " ld.global.u8 %rh3, [%rd3+1];\n"
5919 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
5920 " .loc 3 679 0\n"
5921 " ld.global.u8 %rh4, [%rd3+2];\n"
5922 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
5923 " .loc 3 680 0\n"
5924 " ld.global.u8 %rh5, [%rd3+3];\n"
5925 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
5926 " add.u32 %r9, %r8, %r9;\n"
5927 " setp.lt.u32 %p2, %r9, %r10;\n"
5928 " @%p2 bra $Lt_56_16642;\n"
5929 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
5930 " bra.uni $Lt_56_16130;\n"
5931 "$Lt_56_18178:\n"
5932 " mov.f32 %f1, 0f00000000; // 0\n"
5933 "$Lt_56_16130:\n"
5934 " .loc 3 692 0\n"
5935 " mov.u64 %rd4, __smem;\n"
5936 " cvt.u64.u32 %rd5, %r3;\n"
5937 " mul.wide.u32 %rd6, %r3, 4;\n"
5938 " add.u64 %rd7, %rd4, %rd6;\n"
5939 " st.shared.f32 [%rd7+0], %f1;\n"
5940 " .loc 3 693 0\n"
5941 " bar.sync 0;\n"
5942 " mov.u32 %r12, 31;\n"
5943 " setp.gt.u32 %p3, %r3, %r12;\n"
5944 " @%p3 bra $Lt_56_17154;\n"
5945 " .loc 3 713 0\n"
5946 " ld.volatile.shared.f32 %f2, [%rd7+8];\n"
5947 " add.f32 %f3, %f2, %f1;\n"
5948 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
5949 " .loc 3 714 0\n"
5950 " ld.volatile.shared.f32 %f4, [%rd7+4];\n"
5951 " add.f32 %f1, %f4, %f3;\n"
5952 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
5953 "$Lt_56_17154:\n"
5954 " mov.u32 %r13, 0;\n"
5955 " setp.ne.u32 %p4, %r3, %r13;\n"
5956 " @%p4 bra $Lt_56_17666;\n"
5957 " .loc 3 719 0\n"
5958 " ld.shared.f32 %f5, [__smem+0];\n"
5959 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_4_true_true_g_odata];\n"
5960 " cvt.u64.u32 %rd9, %r1;\n"
5961 " mul.wide.u32 %rd10, %r1, 4;\n"
5962 " add.u64 %rd11, %rd8, %rd10;\n"
5963 " st.global.f32 [%rd11+0], %f5;\n"
5964 "$Lt_56_17666:\n"
5965 " .loc 3 747 0\n"
5966 " exit;\n"
5967 "$LDWend_packed_float_reduce_4_true_true:\n"
5968 " } // packed_float_reduce_4_true_true\n"
5969 "\n"
5970 " .entry packed_float_reduce_8_false_false (\n"
5971 " .param .u64 __cudaparm_packed_float_reduce_8_false_false_g_idata,\n"
5972 " .param .u64 __cudaparm_packed_float_reduce_8_false_false_g_odata,\n"
5973 " .param .u32 __cudaparm_packed_float_reduce_8_false_false_n)\n"
5974 " {\n"
5975 " .reg .u16 %rh<7>;\n"
5976 " .reg .u32 %r<15>;\n"
5977 " .reg .u64 %rd<13>;\n"
5978 " .reg .f32 %f<9>;\n"
5979 " .reg .pred %p<6>;\n"
5980 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
5981 " .loc 3 749 0\n"
5982 "$LDWbegin_packed_float_reduce_8_false_false:\n"
5983 " .loc 3 637 0\n"
5984 " cvt.u32.u16 %r1, %ctaid.x;\n"
5985 " mul24.lo.u32 %r2, %r1, 16;\n"
5986 " cvt.u32.u16 %r3, %tid.x;\n"
5987 " add.u32 %r4, %r2, %r3;\n"
5988 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_false_false_n];\n"
5989 " setp.ge.u32 %p1, %r4, %r5;\n"
5990 " @%p1 bra $Lt_57_17922;\n"
5991 " mul.lo.u32 %r6, %r4, 4;\n"
5992 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_false_false_n];\n"
5993 " mul.lo.u32 %r7, %r5, 4;\n"
5994 " mov.u16 %rh1, %nctaid.x;\n"
5995 " mul.wide.u16 %r8, %rh1, 64;\n"
5996 " add.u32 %r9, %r6, 32;\n"
5997 " add.u32 %r10, %r7, 32;\n"
5998 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_false_false_g_idata];\n"
5999 "$Lt_57_16386:\n"
6000 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6001 " .loc 3 677 0\n"
6002 " cvt.u8.u32 %r11, %r9;\n"
6003 " cvt.u64.u32 %rd2, %r11;\n"
6004 " .loc 3 637 0\n"
6005 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_false_false_g_idata];\n"
6006 " .loc 3 677 0\n"
6007 " add.u64 %rd3, %rd2, %rd1;\n"
6008 " ld.global.u8 %rh2, [%rd3+0];\n"
6009 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6010 " .loc 3 678 0\n"
6011 " ld.global.u8 %rh3, [%rd3+1];\n"
6012 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6013 " .loc 3 679 0\n"
6014 " ld.global.u8 %rh4, [%rd3+2];\n"
6015 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6016 " .loc 3 680 0\n"
6017 " ld.global.u8 %rh5, [%rd3+3];\n"
6018 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6019 " add.u32 %r9, %r8, %r9;\n"
6020 " setp.lt.u32 %p2, %r9, %r10;\n"
6021 " @%p2 bra $Lt_57_16386;\n"
6022 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6023 " bra.uni $Lt_57_15874;\n"
6024 "$Lt_57_17922:\n"
6025 " mov.f32 %f1, 0f00000000; // 0\n"
6026 "$Lt_57_15874:\n"
6027 " .loc 3 692 0\n"
6028 " mov.u64 %rd4, __smem;\n"
6029 " cvt.u64.u32 %rd5, %r3;\n"
6030 " mul.wide.u32 %rd6, %r3, 4;\n"
6031 " add.u64 %rd7, %rd4, %rd6;\n"
6032 " st.shared.f32 [%rd7+0], %f1;\n"
6033 " .loc 3 693 0\n"
6034 " bar.sync 0;\n"
6035 " mov.u32 %r12, 31;\n"
6036 " setp.gt.u32 %p3, %r3, %r12;\n"
6037 " @%p3 bra $Lt_57_16898;\n"
6038 " .loc 3 712 0\n"
6039 " ld.volatile.shared.f32 %f2, [%rd7+16];\n"
6040 " add.f32 %f3, %f2, %f1;\n"
6041 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6042 " .loc 3 713 0\n"
6043 " ld.volatile.shared.f32 %f4, [%rd7+8];\n"
6044 " add.f32 %f5, %f4, %f3;\n"
6045 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6046 " .loc 3 714 0\n"
6047 " ld.volatile.shared.f32 %f6, [%rd7+4];\n"
6048 " add.f32 %f1, %f6, %f5;\n"
6049 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6050 "$Lt_57_16898:\n"
6051 " mov.u32 %r13, 0;\n"
6052 " setp.ne.u32 %p4, %r3, %r13;\n"
6053 " @%p4 bra $Lt_57_17410;\n"
6054 " .loc 3 719 0\n"
6055 " ld.shared.f32 %f7, [__smem+0];\n"
6056 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_8_false_false_g_odata];\n"
6057 " cvt.u64.u32 %rd9, %r1;\n"
6058 " mul.wide.u32 %rd10, %r1, 4;\n"
6059 " add.u64 %rd11, %rd8, %rd10;\n"
6060 " st.global.f32 [%rd11+0], %f7;\n"
6061 "$Lt_57_17410:\n"
6062 " .loc 3 750 0\n"
6063 " exit;\n"
6064 "$LDWend_packed_float_reduce_8_false_false:\n"
6065 " } // packed_float_reduce_8_false_false\n"
6066 "\n"
6067 " .entry packed_float_reduce_8_false_true (\n"
6068 " .param .u64 __cudaparm_packed_float_reduce_8_false_true_g_idata,\n"
6069 " .param .u64 __cudaparm_packed_float_reduce_8_false_true_g_odata,\n"
6070 " .param .u32 __cudaparm_packed_float_reduce_8_false_true_n)\n"
6071 " {\n"
6072 " .reg .u16 %rh<7>;\n"
6073 " .reg .u32 %r<15>;\n"
6074 " .reg .u64 %rd<13>;\n"
6075 " .reg .f32 %f<9>;\n"
6076 " .reg .pred %p<6>;\n"
6077 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
6078 " .loc 3 751 0\n"
6079 "$LDWbegin_packed_float_reduce_8_false_true:\n"
6080 " .loc 3 637 0\n"
6081 " cvt.u32.u16 %r1, %ctaid.x;\n"
6082 " mul24.lo.u32 %r2, %r1, 16;\n"
6083 " cvt.u32.u16 %r3, %tid.x;\n"
6084 " add.u32 %r4, %r2, %r3;\n"
6085 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_false_true_n];\n"
6086 " setp.ge.u32 %p1, %r4, %r5;\n"
6087 " @%p1 bra $Lt_58_17922;\n"
6088 " mul.lo.u32 %r6, %r4, 4;\n"
6089 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_false_true_n];\n"
6090 " mul.lo.u32 %r7, %r5, 4;\n"
6091 " mov.u16 %rh1, %nctaid.x;\n"
6092 " mul.wide.u16 %r8, %rh1, 64;\n"
6093 " add.u32 %r9, %r6, 32;\n"
6094 " add.u32 %r10, %r7, 32;\n"
6095 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_false_true_g_idata];\n"
6096 "$Lt_58_16386:\n"
6097 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6098 " .loc 3 677 0\n"
6099 " cvt.u8.u32 %r11, %r9;\n"
6100 " cvt.u64.u32 %rd2, %r11;\n"
6101 " .loc 3 637 0\n"
6102 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_false_true_g_idata];\n"
6103 " .loc 3 677 0\n"
6104 " add.u64 %rd3, %rd2, %rd1;\n"
6105 " ld.global.u8 %rh2, [%rd3+0];\n"
6106 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6107 " .loc 3 678 0\n"
6108 " ld.global.u8 %rh3, [%rd3+1];\n"
6109 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6110 " .loc 3 679 0\n"
6111 " ld.global.u8 %rh4, [%rd3+2];\n"
6112 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6113 " .loc 3 680 0\n"
6114 " ld.global.u8 %rh5, [%rd3+3];\n"
6115 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6116 " add.u32 %r9, %r8, %r9;\n"
6117 " setp.lt.u32 %p2, %r9, %r10;\n"
6118 " @%p2 bra $Lt_58_16386;\n"
6119 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6120 " bra.uni $Lt_58_15874;\n"
6121 "$Lt_58_17922:\n"
6122 " mov.f32 %f1, 0f00000000; // 0\n"
6123 "$Lt_58_15874:\n"
6124 " .loc 3 692 0\n"
6125 " mov.u64 %rd4, __smem;\n"
6126 " cvt.u64.u32 %rd5, %r3;\n"
6127 " mul.wide.u32 %rd6, %r3, 4;\n"
6128 " add.u64 %rd7, %rd4, %rd6;\n"
6129 " st.shared.f32 [%rd7+0], %f1;\n"
6130 " .loc 3 693 0\n"
6131 " bar.sync 0;\n"
6132 " mov.u32 %r12, 31;\n"
6133 " setp.gt.u32 %p3, %r3, %r12;\n"
6134 " @%p3 bra $Lt_58_16898;\n"
6135 " .loc 3 712 0\n"
6136 " ld.volatile.shared.f32 %f2, [%rd7+16];\n"
6137 " add.f32 %f3, %f2, %f1;\n"
6138 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6139 " .loc 3 713 0\n"
6140 " ld.volatile.shared.f32 %f4, [%rd7+8];\n"
6141 " add.f32 %f5, %f4, %f3;\n"
6142 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6143 " .loc 3 714 0\n"
6144 " ld.volatile.shared.f32 %f6, [%rd7+4];\n"
6145 " add.f32 %f1, %f6, %f5;\n"
6146 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6147 "$Lt_58_16898:\n"
6148 " mov.u32 %r13, 0;\n"
6149 " setp.ne.u32 %p4, %r3, %r13;\n"
6150 " @%p4 bra $Lt_58_17410;\n"
6151 " .loc 3 719 0\n"
6152 " ld.shared.f32 %f7, [__smem+0];\n"
6153 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_8_false_true_g_odata];\n"
6154 " cvt.u64.u32 %rd9, %r1;\n"
6155 " mul.wide.u32 %rd10, %r1, 4;\n"
6156 " add.u64 %rd11, %rd8, %rd10;\n"
6157 " st.global.f32 [%rd11+0], %f7;\n"
6158 "$Lt_58_17410:\n"
6159 " .loc 3 752 0\n"
6160 " exit;\n"
6161 "$LDWend_packed_float_reduce_8_false_true:\n"
6162 " } // packed_float_reduce_8_false_true\n"
6163 "\n"
6164 " .entry packed_float_reduce_8_true_false (\n"
6165 " .param .u64 __cudaparm_packed_float_reduce_8_true_false_g_idata,\n"
6166 " .param .u64 __cudaparm_packed_float_reduce_8_true_false_g_odata,\n"
6167 " .param .u32 __cudaparm_packed_float_reduce_8_true_false_n)\n"
6168 " {\n"
6169 " .reg .u16 %rh<7>;\n"
6170 " .reg .u32 %r<15>;\n"
6171 " .reg .u64 %rd<13>;\n"
6172 " .reg .f32 %f<9>;\n"
6173 " .reg .pred %p<6>;\n"
6174 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
6175 " .loc 3 753 0\n"
6176 "$LDWbegin_packed_float_reduce_8_true_false:\n"
6177 " .loc 3 637 0\n"
6178 " cvt.u32.u16 %r1, %ctaid.x;\n"
6179 " mul24.lo.u32 %r2, %r1, 16;\n"
6180 " cvt.u32.u16 %r3, %tid.x;\n"
6181 " add.u32 %r4, %r2, %r3;\n"
6182 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_true_false_n];\n"
6183 " setp.ge.u32 %p1, %r4, %r5;\n"
6184 " @%p1 bra $Lt_59_17922;\n"
6185 " mul.lo.u32 %r6, %r4, 4;\n"
6186 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_true_false_n];\n"
6187 " mul.lo.u32 %r7, %r5, 4;\n"
6188 " mov.u16 %rh1, %nctaid.x;\n"
6189 " mul.wide.u16 %r8, %rh1, 64;\n"
6190 " add.u32 %r9, %r6, 32;\n"
6191 " add.u32 %r10, %r7, 32;\n"
6192 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_true_false_g_idata];\n"
6193 "$Lt_59_16386:\n"
6194 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6195 " .loc 3 677 0\n"
6196 " cvt.u8.u32 %r11, %r9;\n"
6197 " cvt.u64.u32 %rd2, %r11;\n"
6198 " .loc 3 637 0\n"
6199 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_true_false_g_idata];\n"
6200 " .loc 3 677 0\n"
6201 " add.u64 %rd3, %rd2, %rd1;\n"
6202 " ld.global.u8 %rh2, [%rd3+0];\n"
6203 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6204 " .loc 3 678 0\n"
6205 " ld.global.u8 %rh3, [%rd3+1];\n"
6206 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6207 " .loc 3 679 0\n"
6208 " ld.global.u8 %rh4, [%rd3+2];\n"
6209 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6210 " .loc 3 680 0\n"
6211 " ld.global.u8 %rh5, [%rd3+3];\n"
6212 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6213 " add.u32 %r9, %r8, %r9;\n"
6214 " setp.lt.u32 %p2, %r9, %r10;\n"
6215 " @%p2 bra $Lt_59_16386;\n"
6216 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6217 " bra.uni $Lt_59_15874;\n"
6218 "$Lt_59_17922:\n"
6219 " mov.f32 %f1, 0f00000000; // 0\n"
6220 "$Lt_59_15874:\n"
6221 " .loc 3 692 0\n"
6222 " mov.u64 %rd4, __smem;\n"
6223 " cvt.u64.u32 %rd5, %r3;\n"
6224 " mul.wide.u32 %rd6, %r3, 4;\n"
6225 " add.u64 %rd7, %rd4, %rd6;\n"
6226 " st.shared.f32 [%rd7+0], %f1;\n"
6227 " .loc 3 693 0\n"
6228 " bar.sync 0;\n"
6229 " mov.u32 %r12, 31;\n"
6230 " setp.gt.u32 %p3, %r3, %r12;\n"
6231 " @%p3 bra $Lt_59_16898;\n"
6232 " .loc 3 712 0\n"
6233 " ld.volatile.shared.f32 %f2, [%rd7+16];\n"
6234 " add.f32 %f3, %f2, %f1;\n"
6235 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6236 " .loc 3 713 0\n"
6237 " ld.volatile.shared.f32 %f4, [%rd7+8];\n"
6238 " add.f32 %f5, %f4, %f3;\n"
6239 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6240 " .loc 3 714 0\n"
6241 " ld.volatile.shared.f32 %f6, [%rd7+4];\n"
6242 " add.f32 %f1, %f6, %f5;\n"
6243 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6244 "$Lt_59_16898:\n"
6245 " mov.u32 %r13, 0;\n"
6246 " setp.ne.u32 %p4, %r3, %r13;\n"
6247 " @%p4 bra $Lt_59_17410;\n"
6248 " .loc 3 719 0\n"
6249 " ld.shared.f32 %f7, [__smem+0];\n"
6250 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_8_true_false_g_odata];\n"
6251 " cvt.u64.u32 %rd9, %r1;\n"
6252 " mul.wide.u32 %rd10, %r1, 4;\n"
6253 " add.u64 %rd11, %rd8, %rd10;\n"
6254 " st.global.f32 [%rd11+0], %f7;\n"
6255 "$Lt_59_17410:\n"
6256 " .loc 3 754 0\n"
6257 " exit;\n"
6258 "$LDWend_packed_float_reduce_8_true_false:\n"
6259 " } // packed_float_reduce_8_true_false\n"
6260 "\n"
6261 " .entry packed_float_reduce_8_true_true (\n"
6262 " .param .u64 __cudaparm_packed_float_reduce_8_true_true_g_idata,\n"
6263 " .param .u64 __cudaparm_packed_float_reduce_8_true_true_g_odata,\n"
6264 " .param .u32 __cudaparm_packed_float_reduce_8_true_true_n)\n"
6265 " {\n"
6266 " .reg .u16 %rh<7>;\n"
6267 " .reg .u32 %r<15>;\n"
6268 " .reg .u64 %rd<13>;\n"
6269 " .reg .f32 %f<9>;\n"
6270 " .reg .pred %p<6>;\n"
6271 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
6272 " .loc 3 755 0\n"
6273 "$LDWbegin_packed_float_reduce_8_true_true:\n"
6274 " .loc 3 637 0\n"
6275 " cvt.u32.u16 %r1, %ctaid.x;\n"
6276 " mul24.lo.u32 %r2, %r1, 16;\n"
6277 " cvt.u32.u16 %r3, %tid.x;\n"
6278 " add.u32 %r4, %r2, %r3;\n"
6279 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_true_true_n];\n"
6280 " setp.ge.u32 %p1, %r4, %r5;\n"
6281 " @%p1 bra $Lt_60_17922;\n"
6282 " mul.lo.u32 %r6, %r4, 4;\n"
6283 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_8_true_true_n];\n"
6284 " mul.lo.u32 %r7, %r5, 4;\n"
6285 " mov.u16 %rh1, %nctaid.x;\n"
6286 " mul.wide.u16 %r8, %rh1, 64;\n"
6287 " add.u32 %r9, %r6, 32;\n"
6288 " add.u32 %r10, %r7, 32;\n"
6289 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_true_true_g_idata];\n"
6290 "$Lt_60_16386:\n"
6291 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6292 " .loc 3 677 0\n"
6293 " cvt.u8.u32 %r11, %r9;\n"
6294 " cvt.u64.u32 %rd2, %r11;\n"
6295 " .loc 3 637 0\n"
6296 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_8_true_true_g_idata];\n"
6297 " .loc 3 677 0\n"
6298 " add.u64 %rd3, %rd2, %rd1;\n"
6299 " ld.global.u8 %rh2, [%rd3+0];\n"
6300 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6301 " .loc 3 678 0\n"
6302 " ld.global.u8 %rh3, [%rd3+1];\n"
6303 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6304 " .loc 3 679 0\n"
6305 " ld.global.u8 %rh4, [%rd3+2];\n"
6306 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6307 " .loc 3 680 0\n"
6308 " ld.global.u8 %rh5, [%rd3+3];\n"
6309 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6310 " add.u32 %r9, %r8, %r9;\n"
6311 " setp.lt.u32 %p2, %r9, %r10;\n"
6312 " @%p2 bra $Lt_60_16386;\n"
6313 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6314 " bra.uni $Lt_60_15874;\n"
6315 "$Lt_60_17922:\n"
6316 " mov.f32 %f1, 0f00000000; // 0\n"
6317 "$Lt_60_15874:\n"
6318 " .loc 3 692 0\n"
6319 " mov.u64 %rd4, __smem;\n"
6320 " cvt.u64.u32 %rd5, %r3;\n"
6321 " mul.wide.u32 %rd6, %r3, 4;\n"
6322 " add.u64 %rd7, %rd4, %rd6;\n"
6323 " st.shared.f32 [%rd7+0], %f1;\n"
6324 " .loc 3 693 0\n"
6325 " bar.sync 0;\n"
6326 " mov.u32 %r12, 31;\n"
6327 " setp.gt.u32 %p3, %r3, %r12;\n"
6328 " @%p3 bra $Lt_60_16898;\n"
6329 " .loc 3 712 0\n"
6330 " ld.volatile.shared.f32 %f2, [%rd7+16];\n"
6331 " add.f32 %f3, %f2, %f1;\n"
6332 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6333 " .loc 3 713 0\n"
6334 " ld.volatile.shared.f32 %f4, [%rd7+8];\n"
6335 " add.f32 %f5, %f4, %f3;\n"
6336 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6337 " .loc 3 714 0\n"
6338 " ld.volatile.shared.f32 %f6, [%rd7+4];\n"
6339 " add.f32 %f1, %f6, %f5;\n"
6340 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6341 "$Lt_60_16898:\n"
6342 " mov.u32 %r13, 0;\n"
6343 " setp.ne.u32 %p4, %r3, %r13;\n"
6344 " @%p4 bra $Lt_60_17410;\n"
6345 " .loc 3 719 0\n"
6346 " ld.shared.f32 %f7, [__smem+0];\n"
6347 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_8_true_true_g_odata];\n"
6348 " cvt.u64.u32 %rd9, %r1;\n"
6349 " mul.wide.u32 %rd10, %r1, 4;\n"
6350 " add.u64 %rd11, %rd8, %rd10;\n"
6351 " st.global.f32 [%rd11+0], %f7;\n"
6352 "$Lt_60_17410:\n"
6353 " .loc 3 756 0\n"
6354 " exit;\n"
6355 "$LDWend_packed_float_reduce_8_true_true:\n"
6356 " } // packed_float_reduce_8_true_true\n"
6357 "\n"
6358 " .entry packed_float_reduce_16_false_false (\n"
6359 " .param .u64 __cudaparm_packed_float_reduce_16_false_false_g_idata,\n"
6360 " .param .u64 __cudaparm_packed_float_reduce_16_false_false_g_odata,\n"
6361 " .param .u32 __cudaparm_packed_float_reduce_16_false_false_n)\n"
6362 " {\n"
6363 " .reg .u16 %rh<7>;\n"
6364 " .reg .u32 %r<15>;\n"
6365 " .reg .u64 %rd<13>;\n"
6366 " .reg .f32 %f<11>;\n"
6367 " .reg .pred %p<6>;\n"
6368 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
6369 " .loc 3 758 0\n"
6370 "$LDWbegin_packed_float_reduce_16_false_false:\n"
6371 " .loc 3 637 0\n"
6372 " cvt.u32.u16 %r1, %ctaid.x;\n"
6373 " mul24.lo.u32 %r2, %r1, 32;\n"
6374 " cvt.u32.u16 %r3, %tid.x;\n"
6375 " add.u32 %r4, %r2, %r3;\n"
6376 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_16_false_false_n];\n"
6377 " setp.ge.u32 %p1, %r4, %r5;\n"
6378 " @%p1 bra $Lt_61_17666;\n"
6379 " mul.lo.u32 %r6, %r4, 4;\n"
6380 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_16_false_false_n];\n"
6381 " mul.lo.u32 %r7, %r5, 4;\n"
6382 " mov.u16 %rh1, %nctaid.x;\n"
6383 " mul.wide.u16 %r8, %rh1, 128;\n"
6384 " add.u32 %r9, %r6, 64;\n"
6385 " add.u32 %r10, %r7, 64;\n"
6386 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_16_false_false_g_idata];\n"
6387 "$Lt_61_16130:\n"
6388 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6389 " .loc 3 677 0\n"
6390 " cvt.u8.u32 %r11, %r9;\n"
6391 " cvt.u64.u32 %rd2, %r11;\n"
6392 " .loc 3 637 0\n"
6393 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_16_false_false_g_idata];\n"
6394 " .loc 3 677 0\n"
6395 " add.u64 %rd3, %rd2, %rd1;\n"
6396 " ld.global.u8 %rh2, [%rd3+0];\n"
6397 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6398 " .loc 3 678 0\n"
6399 " ld.global.u8 %rh3, [%rd3+1];\n"
6400 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6401 " .loc 3 679 0\n"
6402 " ld.global.u8 %rh4, [%rd3+2];\n"
6403 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6404 " .loc 3 680 0\n"
6405 " ld.global.u8 %rh5, [%rd3+3];\n"
6406 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6407 " add.u32 %r9, %r8, %r9;\n"
6408 " setp.lt.u32 %p2, %r9, %r10;\n"
6409 " @%p2 bra $Lt_61_16130;\n"
6410 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6411 " bra.uni $Lt_61_15618;\n"
6412 "$Lt_61_17666:\n"
6413 " mov.f32 %f1, 0f00000000; // 0\n"
6414 "$Lt_61_15618:\n"
6415 " .loc 3 692 0\n"
6416 " mov.u64 %rd4, __smem;\n"
6417 " cvt.u64.u32 %rd5, %r3;\n"
6418 " mul.wide.u32 %rd6, %r3, 4;\n"
6419 " add.u64 %rd7, %rd4, %rd6;\n"
6420 " st.shared.f32 [%rd7+0], %f1;\n"
6421 " .loc 3 693 0\n"
6422 " bar.sync 0;\n"
6423 " mov.u32 %r12, 31;\n"
6424 " setp.gt.u32 %p3, %r3, %r12;\n"
6425 " @%p3 bra $Lt_61_16642;\n"
6426 " .loc 3 711 0\n"
6427 " ld.volatile.shared.f32 %f2, [%rd7+32];\n"
6428 " add.f32 %f3, %f2, %f1;\n"
6429 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6430 " .loc 3 712 0\n"
6431 " ld.volatile.shared.f32 %f4, [%rd7+16];\n"
6432 " add.f32 %f5, %f4, %f3;\n"
6433 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6434 " .loc 3 713 0\n"
6435 " ld.volatile.shared.f32 %f6, [%rd7+8];\n"
6436 " add.f32 %f7, %f6, %f5;\n"
6437 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
6438 " .loc 3 714 0\n"
6439 " ld.volatile.shared.f32 %f8, [%rd7+4];\n"
6440 " add.f32 %f1, %f8, %f7;\n"
6441 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6442 "$Lt_61_16642:\n"
6443 " mov.u32 %r13, 0;\n"
6444 " setp.ne.u32 %p4, %r3, %r13;\n"
6445 " @%p4 bra $Lt_61_17154;\n"
6446 " .loc 3 719 0\n"
6447 " ld.shared.f32 %f9, [__smem+0];\n"
6448 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_16_false_false_g_odata];\n"
6449 " cvt.u64.u32 %rd9, %r1;\n"
6450 " mul.wide.u32 %rd10, %r1, 4;\n"
6451 " add.u64 %rd11, %rd8, %rd10;\n"
6452 " st.global.f32 [%rd11+0], %f9;\n"
6453 "$Lt_61_17154:\n"
6454 " .loc 3 759 0\n"
6455 " exit;\n"
6456 "$LDWend_packed_float_reduce_16_false_false:\n"
6457 " } // packed_float_reduce_16_false_false\n"
6458 "\n"
6459 " .entry packed_float_reduce_16_false_true (\n"
6460 " .param .u64 __cudaparm_packed_float_reduce_16_false_true_g_idata,\n"
6461 " .param .u64 __cudaparm_packed_float_reduce_16_false_true_g_odata,\n"
6462 " .param .u32 __cudaparm_packed_float_reduce_16_false_true_n)\n"
6463 " {\n"
6464 " .reg .u16 %rh<7>;\n"
6465 " .reg .u32 %r<15>;\n"
6466 " .reg .u64 %rd<13>;\n"
6467 " .reg .f32 %f<11>;\n"
6468 " .reg .pred %p<6>;\n"
6469 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
6470 " .loc 3 760 0\n"
6471 "$LDWbegin_packed_float_reduce_16_false_true:\n"
6472 " .loc 3 637 0\n"
6473 " cvt.u32.u16 %r1, %ctaid.x;\n"
6474 " mul24.lo.u32 %r2, %r1, 32;\n"
6475 " cvt.u32.u16 %r3, %tid.x;\n"
6476 " add.u32 %r4, %r2, %r3;\n"
6477 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_16_false_true_n];\n"
6478 " setp.ge.u32 %p1, %r4, %r5;\n"
6479 " @%p1 bra $Lt_62_17666;\n"
6480 " mul.lo.u32 %r6, %r4, 4;\n"
6481 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_16_false_true_n];\n"
6482 " mul.lo.u32 %r7, %r5, 4;\n"
6483 " mov.u16 %rh1, %nctaid.x;\n"
6484 " mul.wide.u16 %r8, %rh1, 128;\n"
6485 " add.u32 %r9, %r6, 64;\n"
6486 " add.u32 %r10, %r7, 64;\n"
6487 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_16_false_true_g_idata];\n"
6488 "$Lt_62_16130:\n"
6489 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6490 " .loc 3 677 0\n"
6491 " cvt.u8.u32 %r11, %r9;\n"
6492 " cvt.u64.u32 %rd2, %r11;\n"
6493 " .loc 3 637 0\n"
6494 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_16_false_true_g_idata];\n"
6495 " .loc 3 677 0\n"
6496 " add.u64 %rd3, %rd2, %rd1;\n"
6497 " ld.global.u8 %rh2, [%rd3+0];\n"
6498 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6499 " .loc 3 678 0\n"
6500 " ld.global.u8 %rh3, [%rd3+1];\n"
6501 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6502 " .loc 3 679 0\n"
6503 " ld.global.u8 %rh4, [%rd3+2];\n"
6504 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6505 " .loc 3 680 0\n"
6506 " ld.global.u8 %rh5, [%rd3+3];\n"
6507 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6508 " add.u32 %r9, %r8, %r9;\n"
6509 " setp.lt.u32 %p2, %r9, %r10;\n"
6510 " @%p2 bra $Lt_62_16130;\n"
6511 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6512 " bra.uni $Lt_62_15618;\n"
6513 "$Lt_62_17666:\n"
6514 " mov.f32 %f1, 0f00000000; // 0\n"
6515 "$Lt_62_15618:\n"
6516 " .loc 3 692 0\n"
6517 " mov.u64 %rd4, __smem;\n"
6518 " cvt.u64.u32 %rd5, %r3;\n"
6519 " mul.wide.u32 %rd6, %r3, 4;\n"
6520 " add.u64 %rd7, %rd4, %rd6;\n"
6521 " st.shared.f32 [%rd7+0], %f1;\n"
6522 " .loc 3 693 0\n"
6523 " bar.sync 0;\n"
6524 " mov.u32 %r12, 31;\n"
6525 " setp.gt.u32 %p3, %r3, %r12;\n"
6526 " @%p3 bra $Lt_62_16642;\n"
6527 " .loc 3 711 0\n"
6528 " ld.volatile.shared.f32 %f2, [%rd7+32];\n"
6529 " add.f32 %f3, %f2, %f1;\n"
6530 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6531 " .loc 3 712 0\n"
6532 " ld.volatile.shared.f32 %f4, [%rd7+16];\n"
6533 " add.f32 %f5, %f4, %f3;\n"
6534 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6535 " .loc 3 713 0\n"
6536 " ld.volatile.shared.f32 %f6, [%rd7+8];\n"
6537 " add.f32 %f7, %f6, %f5;\n"
6538 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
6539 " .loc 3 714 0\n"
6540 " ld.volatile.shared.f32 %f8, [%rd7+4];\n"
6541 " add.f32 %f1, %f8, %f7;\n"
6542 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6543 "$Lt_62_16642:\n"
6544 " mov.u32 %r13, 0;\n"
6545 " setp.ne.u32 %p4, %r3, %r13;\n"
6546 " @%p4 bra $Lt_62_17154;\n"
6547 " .loc 3 719 0\n"
6548 " ld.shared.f32 %f9, [__smem+0];\n"
6549 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_16_false_true_g_odata];\n"
6550 " cvt.u64.u32 %rd9, %r1;\n"
6551 " mul.wide.u32 %rd10, %r1, 4;\n"
6552 " add.u64 %rd11, %rd8, %rd10;\n"
6553 " st.global.f32 [%rd11+0], %f9;\n"
6554 "$Lt_62_17154:\n"
6555 " .loc 3 761 0\n"
6556 " exit;\n"
6557 "$LDWend_packed_float_reduce_16_false_true:\n"
6558 " } // packed_float_reduce_16_false_true\n"
6559 "\n"
6560 " .entry packed_float_reduce_16_true_false (\n"
6561 " .param .u64 __cudaparm_packed_float_reduce_16_true_false_g_idata,\n"
6562 " .param .u64 __cudaparm_packed_float_reduce_16_true_false_g_odata,\n"
6563 " .param .u32 __cudaparm_packed_float_reduce_16_true_false_n)\n"
6564 " {\n"
6565 " .reg .u16 %rh<7>;\n"
6566 " .reg .u32 %r<15>;\n"
6567 " .reg .u64 %rd<13>;\n"
6568 " .reg .f32 %f<11>;\n"
6569 " .reg .pred %p<6>;\n"
6570 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
6571 " .loc 3 762 0\n"
6572 "$LDWbegin_packed_float_reduce_16_true_false:\n"
6573 " .loc 3 637 0\n"
6574 " cvt.u32.u16 %r1, %ctaid.x;\n"
6575 " mul24.lo.u32 %r2, %r1, 32;\n"
6576 " cvt.u32.u16 %r3, %tid.x;\n"
6577 " add.u32 %r4, %r2, %r3;\n"
6578 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_16_true_false_n];\n"
6579 " setp.ge.u32 %p1, %r4, %r5;\n"
6580 " @%p1 bra $Lt_63_17666;\n"
6581 " mul.lo.u32 %r6, %r4, 4;\n"
6582 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_16_true_false_n];\n"
6583 " mul.lo.u32 %r7, %r5, 4;\n"
6584 " mov.u16 %rh1, %nctaid.x;\n"
6585 " mul.wide.u16 %r8, %rh1, 128;\n"
6586 " add.u32 %r9, %r6, 64;\n"
6587 " add.u32 %r10, %r7, 64;\n"
6588 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_16_true_false_g_idata];\n"
6589 "$Lt_63_16130:\n"
6590 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6591 " .loc 3 677 0\n"
6592 " cvt.u8.u32 %r11, %r9;\n"
6593 " cvt.u64.u32 %rd2, %r11;\n"
6594 " .loc 3 637 0\n"
6595 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_16_true_false_g_idata];\n"
6596 " .loc 3 677 0\n"
6597 " add.u64 %rd3, %rd2, %rd1;\n"
6598 " ld.global.u8 %rh2, [%rd3+0];\n"
6599 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6600 " .loc 3 678 0\n"
6601 " ld.global.u8 %rh3, [%rd3+1];\n"
6602 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6603 " .loc 3 679 0\n"
6604 " ld.global.u8 %rh4, [%rd3+2];\n"
6605 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6606 " .loc 3 680 0\n"
6607 " ld.global.u8 %rh5, [%rd3+3];\n"
6608 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6609 " add.u32 %r9, %r8, %r9;\n"
6610 " setp.lt.u32 %p2, %r9, %r10;\n"
6611 " @%p2 bra $Lt_63_16130;\n"
6612 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6613 " bra.uni $Lt_63_15618;\n"
6614 "$Lt_63_17666:\n"
6615 " mov.f32 %f1, 0f00000000; // 0\n"
6616 "$Lt_63_15618:\n"
6617 " .loc 3 692 0\n"
6618 " mov.u64 %rd4, __smem;\n"
6619 " cvt.u64.u32 %rd5, %r3;\n"
6620 " mul.wide.u32 %rd6, %r3, 4;\n"
6621 " add.u64 %rd7, %rd4, %rd6;\n"
6622 " st.shared.f32 [%rd7+0], %f1;\n"
6623 " .loc 3 693 0\n"
6624 " bar.sync 0;\n"
6625 " mov.u32 %r12, 31;\n"
6626 " setp.gt.u32 %p3, %r3, %r12;\n"
6627 " @%p3 bra $Lt_63_16642;\n"
6628 " .loc 3 711 0\n"
6629 " ld.volatile.shared.f32 %f2, [%rd7+32];\n"
6630 " add.f32 %f3, %f2, %f1;\n"
6631 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6632 " .loc 3 712 0\n"
6633 " ld.volatile.shared.f32 %f4, [%rd7+16];\n"
6634 " add.f32 %f5, %f4, %f3;\n"
6635 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6636 " .loc 3 713 0\n"
6637 " ld.volatile.shared.f32 %f6, [%rd7+8];\n"
6638 " add.f32 %f7, %f6, %f5;\n"
6639 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
6640 " .loc 3 714 0\n"
6641 " ld.volatile.shared.f32 %f8, [%rd7+4];\n"
6642 " add.f32 %f1, %f8, %f7;\n"
6643 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6644 "$Lt_63_16642:\n"
6645 " mov.u32 %r13, 0;\n"
6646 " setp.ne.u32 %p4, %r3, %r13;\n"
6647 " @%p4 bra $Lt_63_17154;\n"
6648 " .loc 3 719 0\n"
6649 " ld.shared.f32 %f9, [__smem+0];\n"
6650 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_16_true_false_g_odata];\n"
6651 " cvt.u64.u32 %rd9, %r1;\n"
6652 " mul.wide.u32 %rd10, %r1, 4;\n"
6653 " add.u64 %rd11, %rd8, %rd10;\n"
6654 " st.global.f32 [%rd11+0], %f9;\n"
6655 "$Lt_63_17154:\n"
6656 " .loc 3 763 0\n"
6657 " exit;\n"
6658 "$LDWend_packed_float_reduce_16_true_false:\n"
6659 " } // packed_float_reduce_16_true_false\n"
6660 "\n"
6661 " .entry packed_float_reduce_16_true_true (\n"
6662 " .param .u64 __cudaparm_packed_float_reduce_16_true_true_g_idata,\n"
6663 " .param .u64 __cudaparm_packed_float_reduce_16_true_true_g_odata,\n"
6664 " .param .u32 __cudaparm_packed_float_reduce_16_true_true_n)\n"
6665 " {\n"
6666 " .reg .u16 %rh<7>;\n"
6667 " .reg .u32 %r<15>;\n"
6668 " .reg .u64 %rd<13>;\n"
6669 " .reg .f32 %f<11>;\n"
6670 " .reg .pred %p<6>;\n"
6671 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
6672 " .loc 3 764 0\n"
6673 "$LDWbegin_packed_float_reduce_16_true_true:\n"
6674 " .loc 3 637 0\n"
6675 " cvt.u32.u16 %r1, %ctaid.x;\n"
6676 " mul24.lo.u32 %r2, %r1, 32;\n"
6677 " cvt.u32.u16 %r3, %tid.x;\n"
6678 " add.u32 %r4, %r2, %r3;\n"
6679 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_16_true_true_n];\n"
6680 " setp.ge.u32 %p1, %r4, %r5;\n"
6681 " @%p1 bra $Lt_64_17666;\n"
6682 " mul.lo.u32 %r6, %r4, 4;\n"
6683 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_16_true_true_n];\n"
6684 " mul.lo.u32 %r7, %r5, 4;\n"
6685 " mov.u16 %rh1, %nctaid.x;\n"
6686 " mul.wide.u16 %r8, %rh1, 128;\n"
6687 " add.u32 %r9, %r6, 64;\n"
6688 " add.u32 %r10, %r7, 64;\n"
6689 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_16_true_true_g_idata];\n"
6690 "$Lt_64_16130:\n"
6691 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6692 " .loc 3 677 0\n"
6693 " cvt.u8.u32 %r11, %r9;\n"
6694 " cvt.u64.u32 %rd2, %r11;\n"
6695 " .loc 3 637 0\n"
6696 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_16_true_true_g_idata];\n"
6697 " .loc 3 677 0\n"
6698 " add.u64 %rd3, %rd2, %rd1;\n"
6699 " ld.global.u8 %rh2, [%rd3+0];\n"
6700 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6701 " .loc 3 678 0\n"
6702 " ld.global.u8 %rh3, [%rd3+1];\n"
6703 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6704 " .loc 3 679 0\n"
6705 " ld.global.u8 %rh4, [%rd3+2];\n"
6706 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6707 " .loc 3 680 0\n"
6708 " ld.global.u8 %rh5, [%rd3+3];\n"
6709 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6710 " add.u32 %r9, %r8, %r9;\n"
6711 " setp.lt.u32 %p2, %r9, %r10;\n"
6712 " @%p2 bra $Lt_64_16130;\n"
6713 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6714 " bra.uni $Lt_64_15618;\n"
6715 "$Lt_64_17666:\n"
6716 " mov.f32 %f1, 0f00000000; // 0\n"
6717 "$Lt_64_15618:\n"
6718 " .loc 3 692 0\n"
6719 " mov.u64 %rd4, __smem;\n"
6720 " cvt.u64.u32 %rd5, %r3;\n"
6721 " mul.wide.u32 %rd6, %r3, 4;\n"
6722 " add.u64 %rd7, %rd4, %rd6;\n"
6723 " st.shared.f32 [%rd7+0], %f1;\n"
6724 " .loc 3 693 0\n"
6725 " bar.sync 0;\n"
6726 " mov.u32 %r12, 31;\n"
6727 " setp.gt.u32 %p3, %r3, %r12;\n"
6728 " @%p3 bra $Lt_64_16642;\n"
6729 " .loc 3 711 0\n"
6730 " ld.volatile.shared.f32 %f2, [%rd7+32];\n"
6731 " add.f32 %f3, %f2, %f1;\n"
6732 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6733 " .loc 3 712 0\n"
6734 " ld.volatile.shared.f32 %f4, [%rd7+16];\n"
6735 " add.f32 %f5, %f4, %f3;\n"
6736 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6737 " .loc 3 713 0\n"
6738 " ld.volatile.shared.f32 %f6, [%rd7+8];\n"
6739 " add.f32 %f7, %f6, %f5;\n"
6740 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
6741 " .loc 3 714 0\n"
6742 " ld.volatile.shared.f32 %f8, [%rd7+4];\n"
6743 " add.f32 %f1, %f8, %f7;\n"
6744 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6745 "$Lt_64_16642:\n"
6746 " mov.u32 %r13, 0;\n"
6747 " setp.ne.u32 %p4, %r3, %r13;\n"
6748 " @%p4 bra $Lt_64_17154;\n"
6749 " .loc 3 719 0\n"
6750 " ld.shared.f32 %f9, [__smem+0];\n"
6751 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_16_true_true_g_odata];\n"
6752 " cvt.u64.u32 %rd9, %r1;\n"
6753 " mul.wide.u32 %rd10, %r1, 4;\n"
6754 " add.u64 %rd11, %rd8, %rd10;\n"
6755 " st.global.f32 [%rd11+0], %f9;\n"
6756 "$Lt_64_17154:\n"
6757 " .loc 3 765 0\n"
6758 " exit;\n"
6759 "$LDWend_packed_float_reduce_16_true_true:\n"
6760 " } // packed_float_reduce_16_true_true\n"
6761 "\n"
6762 " .entry packed_float_reduce_32_false_false (\n"
6763 " .param .u64 __cudaparm_packed_float_reduce_32_false_false_g_idata,\n"
6764 " .param .u64 __cudaparm_packed_float_reduce_32_false_false_g_odata,\n"
6765 " .param .u32 __cudaparm_packed_float_reduce_32_false_false_n)\n"
6766 " {\n"
6767 " .reg .u16 %rh<7>;\n"
6768 " .reg .u32 %r<15>;\n"
6769 " .reg .u64 %rd<13>;\n"
6770 " .reg .f32 %f<13>;\n"
6771 " .reg .pred %p<6>;\n"
6772 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
6773 " .loc 3 767 0\n"
6774 "$LDWbegin_packed_float_reduce_32_false_false:\n"
6775 " .loc 3 637 0\n"
6776 " cvt.u32.u16 %r1, %ctaid.x;\n"
6777 " mul24.lo.u32 %r2, %r1, 64;\n"
6778 " cvt.u32.u16 %r3, %tid.x;\n"
6779 " add.u32 %r4, %r2, %r3;\n"
6780 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_32_false_false_n];\n"
6781 " setp.ge.u32 %p1, %r4, %r5;\n"
6782 " @%p1 bra $Lt_65_17410;\n"
6783 " mul.lo.u32 %r6, %r4, 4;\n"
6784 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_32_false_false_n];\n"
6785 " mul.lo.u32 %r7, %r5, 4;\n"
6786 " mov.u16 %rh1, %nctaid.x;\n"
6787 " mul.wide.u16 %r8, %rh1, 256;\n"
6788 " add.u32 %r9, %r6, 128;\n"
6789 " add.u32 %r10, %r7, 128;\n"
6790 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_32_false_false_g_idata];\n"
6791 "$Lt_65_15874:\n"
6792 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6793 " .loc 3 677 0\n"
6794 " cvt.u8.u32 %r11, %r9;\n"
6795 " cvt.u64.u32 %rd2, %r11;\n"
6796 " .loc 3 637 0\n"
6797 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_32_false_false_g_idata];\n"
6798 " .loc 3 677 0\n"
6799 " add.u64 %rd3, %rd2, %rd1;\n"
6800 " ld.global.u8 %rh2, [%rd3+0];\n"
6801 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6802 " .loc 3 678 0\n"
6803 " ld.global.u8 %rh3, [%rd3+1];\n"
6804 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6805 " .loc 3 679 0\n"
6806 " ld.global.u8 %rh4, [%rd3+2];\n"
6807 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6808 " .loc 3 680 0\n"
6809 " ld.global.u8 %rh5, [%rd3+3];\n"
6810 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6811 " add.u32 %r9, %r8, %r9;\n"
6812 " setp.lt.u32 %p2, %r9, %r10;\n"
6813 " @%p2 bra $Lt_65_15874;\n"
6814 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6815 " bra.uni $Lt_65_15362;\n"
6816 "$Lt_65_17410:\n"
6817 " mov.f32 %f1, 0f00000000; // 0\n"
6818 "$Lt_65_15362:\n"
6819 " .loc 3 692 0\n"
6820 " mov.u64 %rd4, __smem;\n"
6821 " cvt.u64.u32 %rd5, %r3;\n"
6822 " mul.wide.u32 %rd6, %r3, 4;\n"
6823 " add.u64 %rd7, %rd4, %rd6;\n"
6824 " st.shared.f32 [%rd7+0], %f1;\n"
6825 " .loc 3 693 0\n"
6826 " bar.sync 0;\n"
6827 " mov.u32 %r12, 31;\n"
6828 " setp.gt.u32 %p3, %r3, %r12;\n"
6829 " @%p3 bra $Lt_65_16386;\n"
6830 " .loc 3 710 0\n"
6831 " ld.volatile.shared.f32 %f2, [%rd7+64];\n"
6832 " add.f32 %f3, %f2, %f1;\n"
6833 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6834 " .loc 3 711 0\n"
6835 " ld.volatile.shared.f32 %f4, [%rd7+32];\n"
6836 " add.f32 %f5, %f4, %f3;\n"
6837 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6838 " .loc 3 712 0\n"
6839 " ld.volatile.shared.f32 %f6, [%rd7+16];\n"
6840 " add.f32 %f7, %f6, %f5;\n"
6841 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
6842 " .loc 3 713 0\n"
6843 " ld.volatile.shared.f32 %f8, [%rd7+8];\n"
6844 " add.f32 %f9, %f8, %f7;\n"
6845 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
6846 " .loc 3 714 0\n"
6847 " ld.volatile.shared.f32 %f10, [%rd7+4];\n"
6848 " add.f32 %f1, %f10, %f9;\n"
6849 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6850 "$Lt_65_16386:\n"
6851 " mov.u32 %r13, 0;\n"
6852 " setp.ne.u32 %p4, %r3, %r13;\n"
6853 " @%p4 bra $Lt_65_16898;\n"
6854 " .loc 3 719 0\n"
6855 " ld.shared.f32 %f11, [__smem+0];\n"
6856 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_32_false_false_g_odata];\n"
6857 " cvt.u64.u32 %rd9, %r1;\n"
6858 " mul.wide.u32 %rd10, %r1, 4;\n"
6859 " add.u64 %rd11, %rd8, %rd10;\n"
6860 " st.global.f32 [%rd11+0], %f11;\n"
6861 "$Lt_65_16898:\n"
6862 " .loc 3 768 0\n"
6863 " exit;\n"
6864 "$LDWend_packed_float_reduce_32_false_false:\n"
6865 " } // packed_float_reduce_32_false_false\n"
6866 "\n"
6867 " .entry packed_float_reduce_32_false_true (\n"
6868 " .param .u64 __cudaparm_packed_float_reduce_32_false_true_g_idata,\n"
6869 " .param .u64 __cudaparm_packed_float_reduce_32_false_true_g_odata,\n"
6870 " .param .u32 __cudaparm_packed_float_reduce_32_false_true_n)\n"
6871 " {\n"
6872 " .reg .u16 %rh<7>;\n"
6873 " .reg .u32 %r<15>;\n"
6874 " .reg .u64 %rd<13>;\n"
6875 " .reg .f32 %f<13>;\n"
6876 " .reg .pred %p<6>;\n"
6877 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
6878 " .loc 3 769 0\n"
6879 "$LDWbegin_packed_float_reduce_32_false_true:\n"
6880 " .loc 3 637 0\n"
6881 " cvt.u32.u16 %r1, %ctaid.x;\n"
6882 " mul24.lo.u32 %r2, %r1, 64;\n"
6883 " cvt.u32.u16 %r3, %tid.x;\n"
6884 " add.u32 %r4, %r2, %r3;\n"
6885 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_32_false_true_n];\n"
6886 " setp.ge.u32 %p1, %r4, %r5;\n"
6887 " @%p1 bra $Lt_66_17410;\n"
6888 " mul.lo.u32 %r6, %r4, 4;\n"
6889 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_32_false_true_n];\n"
6890 " mul.lo.u32 %r7, %r5, 4;\n"
6891 " mov.u16 %rh1, %nctaid.x;\n"
6892 " mul.wide.u16 %r8, %rh1, 256;\n"
6893 " add.u32 %r9, %r6, 128;\n"
6894 " add.u32 %r10, %r7, 128;\n"
6895 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_32_false_true_g_idata];\n"
6896 "$Lt_66_15874:\n"
6897 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
6898 " .loc 3 677 0\n"
6899 " cvt.u8.u32 %r11, %r9;\n"
6900 " cvt.u64.u32 %rd2, %r11;\n"
6901 " .loc 3 637 0\n"
6902 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_32_false_true_g_idata];\n"
6903 " .loc 3 677 0\n"
6904 " add.u64 %rd3, %rd2, %rd1;\n"
6905 " ld.global.u8 %rh2, [%rd3+0];\n"
6906 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
6907 " .loc 3 678 0\n"
6908 " ld.global.u8 %rh3, [%rd3+1];\n"
6909 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
6910 " .loc 3 679 0\n"
6911 " ld.global.u8 %rh4, [%rd3+2];\n"
6912 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
6913 " .loc 3 680 0\n"
6914 " ld.global.u8 %rh5, [%rd3+3];\n"
6915 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
6916 " add.u32 %r9, %r8, %r9;\n"
6917 " setp.lt.u32 %p2, %r9, %r10;\n"
6918 " @%p2 bra $Lt_66_15874;\n"
6919 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
6920 " bra.uni $Lt_66_15362;\n"
6921 "$Lt_66_17410:\n"
6922 " mov.f32 %f1, 0f00000000; // 0\n"
6923 "$Lt_66_15362:\n"
6924 " .loc 3 692 0\n"
6925 " mov.u64 %rd4, __smem;\n"
6926 " cvt.u64.u32 %rd5, %r3;\n"
6927 " mul.wide.u32 %rd6, %r3, 4;\n"
6928 " add.u64 %rd7, %rd4, %rd6;\n"
6929 " st.shared.f32 [%rd7+0], %f1;\n"
6930 " .loc 3 693 0\n"
6931 " bar.sync 0;\n"
6932 " mov.u32 %r12, 31;\n"
6933 " setp.gt.u32 %p3, %r3, %r12;\n"
6934 " @%p3 bra $Lt_66_16386;\n"
6935 " .loc 3 710 0\n"
6936 " ld.volatile.shared.f32 %f2, [%rd7+64];\n"
6937 " add.f32 %f3, %f2, %f1;\n"
6938 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
6939 " .loc 3 711 0\n"
6940 " ld.volatile.shared.f32 %f4, [%rd7+32];\n"
6941 " add.f32 %f5, %f4, %f3;\n"
6942 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
6943 " .loc 3 712 0\n"
6944 " ld.volatile.shared.f32 %f6, [%rd7+16];\n"
6945 " add.f32 %f7, %f6, %f5;\n"
6946 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
6947 " .loc 3 713 0\n"
6948 " ld.volatile.shared.f32 %f8, [%rd7+8];\n"
6949 " add.f32 %f9, %f8, %f7;\n"
6950 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
6951 " .loc 3 714 0\n"
6952 " ld.volatile.shared.f32 %f10, [%rd7+4];\n"
6953 " add.f32 %f1, %f10, %f9;\n"
6954 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
6955 "$Lt_66_16386:\n"
6956 " mov.u32 %r13, 0;\n"
6957 " setp.ne.u32 %p4, %r3, %r13;\n"
6958 " @%p4 bra $Lt_66_16898;\n"
6959 " .loc 3 719 0\n"
6960 " ld.shared.f32 %f11, [__smem+0];\n"
6961 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_32_false_true_g_odata];\n"
6962 " cvt.u64.u32 %rd9, %r1;\n"
6963 " mul.wide.u32 %rd10, %r1, 4;\n"
6964 " add.u64 %rd11, %rd8, %rd10;\n"
6965 " st.global.f32 [%rd11+0], %f11;\n"
6966 "$Lt_66_16898:\n"
6967 " .loc 3 770 0\n"
6968 " exit;\n"
6969 "$LDWend_packed_float_reduce_32_false_true:\n"
6970 " } // packed_float_reduce_32_false_true\n"
6971 "\n"
6972 " .entry packed_float_reduce_32_true_false (\n"
6973 " .param .u64 __cudaparm_packed_float_reduce_32_true_false_g_idata,\n"
6974 " .param .u64 __cudaparm_packed_float_reduce_32_true_false_g_odata,\n"
6975 " .param .u32 __cudaparm_packed_float_reduce_32_true_false_n)\n"
6976 " {\n"
6977 " .reg .u16 %rh<7>;\n"
6978 " .reg .u32 %r<15>;\n"
6979 " .reg .u64 %rd<13>;\n"
6980 " .reg .f32 %f<13>;\n"
6981 " .reg .pred %p<6>;\n"
6982 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
6983 " .loc 3 771 0\n"
6984 "$LDWbegin_packed_float_reduce_32_true_false:\n"
6985 " .loc 3 637 0\n"
6986 " cvt.u32.u16 %r1, %ctaid.x;\n"
6987 " mul24.lo.u32 %r2, %r1, 64;\n"
6988 " cvt.u32.u16 %r3, %tid.x;\n"
6989 " add.u32 %r4, %r2, %r3;\n"
6990 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_32_true_false_n];\n"
6991 " setp.ge.u32 %p1, %r4, %r5;\n"
6992 " @%p1 bra $Lt_67_17410;\n"
6993 " mul.lo.u32 %r6, %r4, 4;\n"
6994 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_32_true_false_n];\n"
6995 " mul.lo.u32 %r7, %r5, 4;\n"
6996 " mov.u16 %rh1, %nctaid.x;\n"
6997 " mul.wide.u16 %r8, %rh1, 256;\n"
6998 " add.u32 %r9, %r6, 128;\n"
6999 " add.u32 %r10, %r7, 128;\n"
7000 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_32_true_false_g_idata];\n"
7001 "$Lt_67_15874:\n"
7002 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
7003 " .loc 3 677 0\n"
7004 " cvt.u8.u32 %r11, %r9;\n"
7005 " cvt.u64.u32 %rd2, %r11;\n"
7006 " .loc 3 637 0\n"
7007 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_32_true_false_g_idata];\n"
7008 " .loc 3 677 0\n"
7009 " add.u64 %rd3, %rd2, %rd1;\n"
7010 " ld.global.u8 %rh2, [%rd3+0];\n"
7011 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
7012 " .loc 3 678 0\n"
7013 " ld.global.u8 %rh3, [%rd3+1];\n"
7014 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
7015 " .loc 3 679 0\n"
7016 " ld.global.u8 %rh4, [%rd3+2];\n"
7017 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
7018 " .loc 3 680 0\n"
7019 " ld.global.u8 %rh5, [%rd3+3];\n"
7020 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
7021 " add.u32 %r9, %r8, %r9;\n"
7022 " setp.lt.u32 %p2, %r9, %r10;\n"
7023 " @%p2 bra $Lt_67_15874;\n"
7024 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
7025 " bra.uni $Lt_67_15362;\n"
7026 "$Lt_67_17410:\n"
7027 " mov.f32 %f1, 0f00000000; // 0\n"
7028 "$Lt_67_15362:\n"
7029 " .loc 3 692 0\n"
7030 " mov.u64 %rd4, __smem;\n"
7031 " cvt.u64.u32 %rd5, %r3;\n"
7032 " mul.wide.u32 %rd6, %r3, 4;\n"
7033 " add.u64 %rd7, %rd4, %rd6;\n"
7034 " st.shared.f32 [%rd7+0], %f1;\n"
7035 " .loc 3 693 0\n"
7036 " bar.sync 0;\n"
7037 " mov.u32 %r12, 31;\n"
7038 " setp.gt.u32 %p3, %r3, %r12;\n"
7039 " @%p3 bra $Lt_67_16386;\n"
7040 " .loc 3 710 0\n"
7041 " ld.volatile.shared.f32 %f2, [%rd7+64];\n"
7042 " add.f32 %f3, %f2, %f1;\n"
7043 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
7044 " .loc 3 711 0\n"
7045 " ld.volatile.shared.f32 %f4, [%rd7+32];\n"
7046 " add.f32 %f5, %f4, %f3;\n"
7047 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
7048 " .loc 3 712 0\n"
7049 " ld.volatile.shared.f32 %f6, [%rd7+16];\n"
7050 " add.f32 %f7, %f6, %f5;\n"
7051 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
7052 " .loc 3 713 0\n"
7053 " ld.volatile.shared.f32 %f8, [%rd7+8];\n"
7054 " add.f32 %f9, %f8, %f7;\n"
7055 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
7056 " .loc 3 714 0\n"
7057 " ld.volatile.shared.f32 %f10, [%rd7+4];\n"
7058 " add.f32 %f1, %f10, %f9;\n"
7059 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
7060 "$Lt_67_16386:\n"
7061 " mov.u32 %r13, 0;\n"
7062 " setp.ne.u32 %p4, %r3, %r13;\n"
7063 " @%p4 bra $Lt_67_16898;\n"
7064 " .loc 3 719 0\n"
7065 " ld.shared.f32 %f11, [__smem+0];\n"
7066 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_32_true_false_g_odata];\n"
7067 " cvt.u64.u32 %rd9, %r1;\n"
7068 " mul.wide.u32 %rd10, %r1, 4;\n"
7069 " add.u64 %rd11, %rd8, %rd10;\n"
7070 " st.global.f32 [%rd11+0], %f11;\n"
7071 "$Lt_67_16898:\n"
7072 " .loc 3 772 0\n"
7073 " exit;\n"
7074 "$LDWend_packed_float_reduce_32_true_false:\n"
7075 " } // packed_float_reduce_32_true_false\n"
7076 "\n"
7077 " .entry packed_float_reduce_32_true_true (\n"
7078 " .param .u64 __cudaparm_packed_float_reduce_32_true_true_g_idata,\n"
7079 " .param .u64 __cudaparm_packed_float_reduce_32_true_true_g_odata,\n"
7080 " .param .u32 __cudaparm_packed_float_reduce_32_true_true_n)\n"
7081 " {\n"
7082 " .reg .u16 %rh<7>;\n"
7083 " .reg .u32 %r<15>;\n"
7084 " .reg .u64 %rd<13>;\n"
7085 " .reg .f32 %f<13>;\n"
7086 " .reg .pred %p<6>;\n"
7087 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
7088 " .loc 3 773 0\n"
7089 "$LDWbegin_packed_float_reduce_32_true_true:\n"
7090 " .loc 3 637 0\n"
7091 " cvt.u32.u16 %r1, %ctaid.x;\n"
7092 " mul24.lo.u32 %r2, %r1, 64;\n"
7093 " cvt.u32.u16 %r3, %tid.x;\n"
7094 " add.u32 %r4, %r2, %r3;\n"
7095 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_32_true_true_n];\n"
7096 " setp.ge.u32 %p1, %r4, %r5;\n"
7097 " @%p1 bra $Lt_68_17410;\n"
7098 " mul.lo.u32 %r6, %r4, 4;\n"
7099 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_32_true_true_n];\n"
7100 " mul.lo.u32 %r7, %r5, 4;\n"
7101 " mov.u16 %rh1, %nctaid.x;\n"
7102 " mul.wide.u16 %r8, %rh1, 256;\n"
7103 " add.u32 %r9, %r6, 128;\n"
7104 " add.u32 %r10, %r7, 128;\n"
7105 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_32_true_true_g_idata];\n"
7106 "$Lt_68_15874:\n"
7107 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
7108 " .loc 3 677 0\n"
7109 " cvt.u8.u32 %r11, %r9;\n"
7110 " cvt.u64.u32 %rd2, %r11;\n"
7111 " .loc 3 637 0\n"
7112 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_32_true_true_g_idata];\n"
7113 " .loc 3 677 0\n"
7114 " add.u64 %rd3, %rd2, %rd1;\n"
7115 " ld.global.u8 %rh2, [%rd3+0];\n"
7116 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
7117 " .loc 3 678 0\n"
7118 " ld.global.u8 %rh3, [%rd3+1];\n"
7119 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
7120 " .loc 3 679 0\n"
7121 " ld.global.u8 %rh4, [%rd3+2];\n"
7122 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
7123 " .loc 3 680 0\n"
7124 " ld.global.u8 %rh5, [%rd3+3];\n"
7125 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
7126 " add.u32 %r9, %r8, %r9;\n"
7127 " setp.lt.u32 %p2, %r9, %r10;\n"
7128 " @%p2 bra $Lt_68_15874;\n"
7129 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
7130 " bra.uni $Lt_68_15362;\n"
7131 "$Lt_68_17410:\n"
7132 " mov.f32 %f1, 0f00000000; // 0\n"
7133 "$Lt_68_15362:\n"
7134 " .loc 3 692 0\n"
7135 " mov.u64 %rd4, __smem;\n"
7136 " cvt.u64.u32 %rd5, %r3;\n"
7137 " mul.wide.u32 %rd6, %r3, 4;\n"
7138 " add.u64 %rd7, %rd4, %rd6;\n"
7139 " st.shared.f32 [%rd7+0], %f1;\n"
7140 " .loc 3 693 0\n"
7141 " bar.sync 0;\n"
7142 " mov.u32 %r12, 31;\n"
7143 " setp.gt.u32 %p3, %r3, %r12;\n"
7144 " @%p3 bra $Lt_68_16386;\n"
7145 " .loc 3 710 0\n"
7146 " ld.volatile.shared.f32 %f2, [%rd7+64];\n"
7147 " add.f32 %f3, %f2, %f1;\n"
7148 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
7149 " .loc 3 711 0\n"
7150 " ld.volatile.shared.f32 %f4, [%rd7+32];\n"
7151 " add.f32 %f5, %f4, %f3;\n"
7152 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
7153 " .loc 3 712 0\n"
7154 " ld.volatile.shared.f32 %f6, [%rd7+16];\n"
7155 " add.f32 %f7, %f6, %f5;\n"
7156 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
7157 " .loc 3 713 0\n"
7158 " ld.volatile.shared.f32 %f8, [%rd7+8];\n"
7159 " add.f32 %f9, %f8, %f7;\n"
7160 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
7161 " .loc 3 714 0\n"
7162 " ld.volatile.shared.f32 %f10, [%rd7+4];\n"
7163 " add.f32 %f1, %f10, %f9;\n"
7164 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
7165 "$Lt_68_16386:\n"
7166 " mov.u32 %r13, 0;\n"
7167 " setp.ne.u32 %p4, %r3, %r13;\n"
7168 " @%p4 bra $Lt_68_16898;\n"
7169 " .loc 3 719 0\n"
7170 " ld.shared.f32 %f11, [__smem+0];\n"
7171 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_32_true_true_g_odata];\n"
7172 " cvt.u64.u32 %rd9, %r1;\n"
7173 " mul.wide.u32 %rd10, %r1, 4;\n"
7174 " add.u64 %rd11, %rd8, %rd10;\n"
7175 " st.global.f32 [%rd11+0], %f11;\n"
7176 "$Lt_68_16898:\n"
7177 " .loc 3 774 0\n"
7178 " exit;\n"
7179 "$LDWend_packed_float_reduce_32_true_true:\n"
7180 " } // packed_float_reduce_32_true_true\n"
7181 "\n"
7182 " .entry packed_float_reduce_64_false_false (\n"
7183 " .param .u64 __cudaparm_packed_float_reduce_64_false_false_g_idata,\n"
7184 " .param .u64 __cudaparm_packed_float_reduce_64_false_false_g_odata,\n"
7185 " .param .u32 __cudaparm_packed_float_reduce_64_false_false_n)\n"
7186 " {\n"
7187 " .reg .u16 %rh<7>;\n"
7188 " .reg .u32 %r<13>;\n"
7189 " .reg .u64 %rd<13>;\n"
7190 " .reg .f32 %f<15>;\n"
7191 " .reg .pred %p<6>;\n"
7192 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
7193 " .loc 3 776 0\n"
7194 "$LDWbegin_packed_float_reduce_64_false_false:\n"
7195 " .loc 3 637 0\n"
7196 " cvt.u32.u16 %r1, %ctaid.x;\n"
7197 " mul24.lo.u32 %r2, %r1, 128;\n"
7198 " cvt.u32.u16 %r3, %tid.x;\n"
7199 " add.u32 %r4, %r2, %r3;\n"
7200 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_64_false_false_n];\n"
7201 " setp.ge.u32 %p1, %r4, %r5;\n"
7202 " @%p1 bra $Lt_69_17154;\n"
7203 " mul.lo.u32 %r6, %r4, 4;\n"
7204 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_64_false_false_n];\n"
7205 " mul.lo.u32 %r7, %r5, 4;\n"
7206 " mov.u16 %rh1, %nctaid.x;\n"
7207 " mul.wide.u16 %r8, %rh1, 512;\n"
7208 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_64_false_false_g_idata];\n"
7209 "$Lt_69_15618:\n"
7210 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
7211 " .loc 3 677 0\n"
7212 " cvt.u8.u32 %r9, %r6;\n"
7213 " cvt.u64.u32 %rd2, %r9;\n"
7214 " .loc 3 637 0\n"
7215 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_64_false_false_g_idata];\n"
7216 " .loc 3 677 0\n"
7217 " add.u64 %rd3, %rd2, %rd1;\n"
7218 " ld.global.u8 %rh2, [%rd3+0];\n"
7219 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
7220 " .loc 3 678 0\n"
7221 " ld.global.u8 %rh3, [%rd3+1];\n"
7222 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
7223 " .loc 3 679 0\n"
7224 " ld.global.u8 %rh4, [%rd3+2];\n"
7225 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
7226 " .loc 3 680 0\n"
7227 " ld.global.u8 %rh5, [%rd3+3];\n"
7228 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
7229 " add.u32 %r6, %r6, %r8;\n"
7230 " setp.lt.u32 %p2, %r6, %r7;\n"
7231 " @%p2 bra $Lt_69_15618;\n"
7232 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
7233 " bra.uni $Lt_69_15106;\n"
7234 "$Lt_69_17154:\n"
7235 " mov.f32 %f1, 0f00000000; // 0\n"
7236 "$Lt_69_15106:\n"
7237 " .loc 3 692 0\n"
7238 " mov.u64 %rd4, __smem;\n"
7239 " cvt.u64.u32 %rd5, %r3;\n"
7240 " mul.wide.u32 %rd6, %r3, 4;\n"
7241 " add.u64 %rd7, %rd4, %rd6;\n"
7242 " st.shared.f32 [%rd7+0], %f1;\n"
7243 " .loc 3 693 0\n"
7244 " bar.sync 0;\n"
7245 " mov.u32 %r10, 31;\n"
7246 " setp.gt.u32 %p3, %r3, %r10;\n"
7247 " @%p3 bra $Lt_69_16130;\n"
7248 " .loc 3 709 0\n"
7249 " ld.volatile.shared.f32 %f2, [%rd7+128];\n"
7250 " add.f32 %f3, %f2, %f1;\n"
7251 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
7252 " .loc 3 710 0\n"
7253 " ld.volatile.shared.f32 %f4, [%rd7+64];\n"
7254 " add.f32 %f5, %f4, %f3;\n"
7255 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
7256 " .loc 3 711 0\n"
7257 " ld.volatile.shared.f32 %f6, [%rd7+32];\n"
7258 " add.f32 %f7, %f6, %f5;\n"
7259 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
7260 " .loc 3 712 0\n"
7261 " ld.volatile.shared.f32 %f8, [%rd7+16];\n"
7262 " add.f32 %f9, %f8, %f7;\n"
7263 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
7264 " .loc 3 713 0\n"
7265 " ld.volatile.shared.f32 %f10, [%rd7+8];\n"
7266 " add.f32 %f11, %f10, %f9;\n"
7267 " st.volatile.shared.f32 [%rd7+0], %f11;\n"
7268 " .loc 3 714 0\n"
7269 " ld.volatile.shared.f32 %f12, [%rd7+4];\n"
7270 " add.f32 %f1, %f12, %f11;\n"
7271 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
7272 "$Lt_69_16130:\n"
7273 " mov.u32 %r11, 0;\n"
7274 " setp.ne.u32 %p4, %r3, %r11;\n"
7275 " @%p4 bra $Lt_69_16642;\n"
7276 " .loc 3 719 0\n"
7277 " ld.shared.f32 %f13, [__smem+0];\n"
7278 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_64_false_false_g_odata];\n"
7279 " cvt.u64.u32 %rd9, %r1;\n"
7280 " mul.wide.u32 %rd10, %r1, 4;\n"
7281 " add.u64 %rd11, %rd8, %rd10;\n"
7282 " st.global.f32 [%rd11+0], %f13;\n"
7283 "$Lt_69_16642:\n"
7284 " .loc 3 777 0\n"
7285 " exit;\n"
7286 "$LDWend_packed_float_reduce_64_false_false:\n"
7287 " } // packed_float_reduce_64_false_false\n"
7288 "\n"
7289 " .entry packed_float_reduce_64_false_true (\n"
7290 " .param .u64 __cudaparm_packed_float_reduce_64_false_true_g_idata,\n"
7291 " .param .u64 __cudaparm_packed_float_reduce_64_false_true_g_odata,\n"
7292 " .param .u32 __cudaparm_packed_float_reduce_64_false_true_n)\n"
7293 " {\n"
7294 " .reg .u16 %rh<7>;\n"
7295 " .reg .u32 %r<13>;\n"
7296 " .reg .u64 %rd<13>;\n"
7297 " .reg .f32 %f<15>;\n"
7298 " .reg .pred %p<6>;\n"
7299 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
7300 " .loc 3 778 0\n"
7301 "$LDWbegin_packed_float_reduce_64_false_true:\n"
7302 " .loc 3 637 0\n"
7303 " cvt.u32.u16 %r1, %ctaid.x;\n"
7304 " mul24.lo.u32 %r2, %r1, 128;\n"
7305 " cvt.u32.u16 %r3, %tid.x;\n"
7306 " add.u32 %r4, %r2, %r3;\n"
7307 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_64_false_true_n];\n"
7308 " setp.ge.u32 %p1, %r4, %r5;\n"
7309 " @%p1 bra $Lt_70_17154;\n"
7310 " mul.lo.u32 %r6, %r4, 4;\n"
7311 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_64_false_true_n];\n"
7312 " mul.lo.u32 %r7, %r5, 4;\n"
7313 " mov.u16 %rh1, %nctaid.x;\n"
7314 " mul.wide.u16 %r8, %rh1, 512;\n"
7315 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_64_false_true_g_idata];\n"
7316 "$Lt_70_15618:\n"
7317 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
7318 " .loc 3 677 0\n"
7319 " cvt.u8.u32 %r9, %r6;\n"
7320 " cvt.u64.u32 %rd2, %r9;\n"
7321 " .loc 3 637 0\n"
7322 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_64_false_true_g_idata];\n"
7323 " .loc 3 677 0\n"
7324 " add.u64 %rd3, %rd2, %rd1;\n"
7325 " ld.global.u8 %rh2, [%rd3+0];\n"
7326 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
7327 " .loc 3 678 0\n"
7328 " ld.global.u8 %rh3, [%rd3+1];\n"
7329 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
7330 " .loc 3 679 0\n"
7331 " ld.global.u8 %rh4, [%rd3+2];\n"
7332 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
7333 " .loc 3 680 0\n"
7334 " ld.global.u8 %rh5, [%rd3+3];\n"
7335 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
7336 " add.u32 %r6, %r6, %r8;\n"
7337 " setp.lt.u32 %p2, %r6, %r7;\n"
7338 " @%p2 bra $Lt_70_15618;\n"
7339 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
7340 " bra.uni $Lt_70_15106;\n"
7341 "$Lt_70_17154:\n"
7342 " mov.f32 %f1, 0f00000000; // 0\n"
7343 "$Lt_70_15106:\n"
7344 " .loc 3 692 0\n"
7345 " mov.u64 %rd4, __smem;\n"
7346 " cvt.u64.u32 %rd5, %r3;\n"
7347 " mul.wide.u32 %rd6, %r3, 4;\n"
7348 " add.u64 %rd7, %rd4, %rd6;\n"
7349 " st.shared.f32 [%rd7+0], %f1;\n"
7350 " .loc 3 693 0\n"
7351 " bar.sync 0;\n"
7352 " mov.u32 %r10, 31;\n"
7353 " setp.gt.u32 %p3, %r3, %r10;\n"
7354 " @%p3 bra $Lt_70_16130;\n"
7355 " .loc 3 709 0\n"
7356 " ld.volatile.shared.f32 %f2, [%rd7+128];\n"
7357 " add.f32 %f3, %f2, %f1;\n"
7358 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
7359 " .loc 3 710 0\n"
7360 " ld.volatile.shared.f32 %f4, [%rd7+64];\n"
7361 " add.f32 %f5, %f4, %f3;\n"
7362 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
7363 " .loc 3 711 0\n"
7364 " ld.volatile.shared.f32 %f6, [%rd7+32];\n"
7365 " add.f32 %f7, %f6, %f5;\n"
7366 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
7367 " .loc 3 712 0\n"
7368 " ld.volatile.shared.f32 %f8, [%rd7+16];\n"
7369 " add.f32 %f9, %f8, %f7;\n"
7370 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
7371 " .loc 3 713 0\n"
7372 " ld.volatile.shared.f32 %f10, [%rd7+8];\n"
7373 " add.f32 %f11, %f10, %f9;\n"
7374 " st.volatile.shared.f32 [%rd7+0], %f11;\n"
7375 " .loc 3 714 0\n"
7376 " ld.volatile.shared.f32 %f12, [%rd7+4];\n"
7377 " add.f32 %f1, %f12, %f11;\n"
7378 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
7379 "$Lt_70_16130:\n"
7380 " mov.u32 %r11, 0;\n"
7381 " setp.ne.u32 %p4, %r3, %r11;\n"
7382 " @%p4 bra $Lt_70_16642;\n"
7383 " .loc 3 719 0\n"
7384 " ld.shared.f32 %f13, [__smem+0];\n"
7385 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_64_false_true_g_odata];\n"
7386 " cvt.u64.u32 %rd9, %r1;\n"
7387 " mul.wide.u32 %rd10, %r1, 4;\n"
7388 " add.u64 %rd11, %rd8, %rd10;\n"
7389 " st.global.f32 [%rd11+0], %f13;\n"
7390 "$Lt_70_16642:\n"
7391 " .loc 3 779 0\n"
7392 " exit;\n"
7393 "$LDWend_packed_float_reduce_64_false_true:\n"
7394 " } // packed_float_reduce_64_false_true\n"
7395 "\n"
7396 " .entry packed_float_reduce_64_true_false (\n"
7397 " .param .u64 __cudaparm_packed_float_reduce_64_true_false_g_idata,\n"
7398 " .param .u64 __cudaparm_packed_float_reduce_64_true_false_g_odata,\n"
7399 " .param .u32 __cudaparm_packed_float_reduce_64_true_false_n)\n"
7400 " {\n"
7401 " .reg .u16 %rh<7>;\n"
7402 " .reg .u32 %r<13>;\n"
7403 " .reg .u64 %rd<13>;\n"
7404 " .reg .f32 %f<15>;\n"
7405 " .reg .pred %p<6>;\n"
7406 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
7407 " .loc 3 780 0\n"
7408 "$LDWbegin_packed_float_reduce_64_true_false:\n"
7409 " .loc 3 637 0\n"
7410 " cvt.u32.u16 %r1, %ctaid.x;\n"
7411 " mul24.lo.u32 %r2, %r1, 128;\n"
7412 " cvt.u32.u16 %r3, %tid.x;\n"
7413 " add.u32 %r4, %r2, %r3;\n"
7414 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_64_true_false_n];\n"
7415 " setp.ge.u32 %p1, %r4, %r5;\n"
7416 " @%p1 bra $Lt_71_17154;\n"
7417 " mul.lo.u32 %r6, %r4, 4;\n"
7418 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_64_true_false_n];\n"
7419 " mul.lo.u32 %r7, %r5, 4;\n"
7420 " mov.u16 %rh1, %nctaid.x;\n"
7421 " mul.wide.u16 %r8, %rh1, 512;\n"
7422 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_64_true_false_g_idata];\n"
7423 "$Lt_71_15618:\n"
7424 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
7425 " .loc 3 677 0\n"
7426 " cvt.u8.u32 %r9, %r6;\n"
7427 " cvt.u64.u32 %rd2, %r9;\n"
7428 " .loc 3 637 0\n"
7429 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_64_true_false_g_idata];\n"
7430 " .loc 3 677 0\n"
7431 " add.u64 %rd3, %rd2, %rd1;\n"
7432 " ld.global.u8 %rh2, [%rd3+0];\n"
7433 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
7434 " .loc 3 678 0\n"
7435 " ld.global.u8 %rh3, [%rd3+1];\n"
7436 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
7437 " .loc 3 679 0\n"
7438 " ld.global.u8 %rh4, [%rd3+2];\n"
7439 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
7440 " .loc 3 680 0\n"
7441 " ld.global.u8 %rh5, [%rd3+3];\n"
7442 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
7443 " add.u32 %r6, %r6, %r8;\n"
7444 " setp.lt.u32 %p2, %r6, %r7;\n"
7445 " @%p2 bra $Lt_71_15618;\n"
7446 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
7447 " bra.uni $Lt_71_15106;\n"
7448 "$Lt_71_17154:\n"
7449 " mov.f32 %f1, 0f00000000; // 0\n"
7450 "$Lt_71_15106:\n"
7451 " .loc 3 692 0\n"
7452 " mov.u64 %rd4, __smem;\n"
7453 " cvt.u64.u32 %rd5, %r3;\n"
7454 " mul.wide.u32 %rd6, %r3, 4;\n"
7455 " add.u64 %rd7, %rd4, %rd6;\n"
7456 " st.shared.f32 [%rd7+0], %f1;\n"
7457 " .loc 3 693 0\n"
7458 " bar.sync 0;\n"
7459 " mov.u32 %r10, 31;\n"
7460 " setp.gt.u32 %p3, %r3, %r10;\n"
7461 " @%p3 bra $Lt_71_16130;\n"
7462 " .loc 3 709 0\n"
7463 " ld.volatile.shared.f32 %f2, [%rd7+128];\n"
7464 " add.f32 %f3, %f2, %f1;\n"
7465 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
7466 " .loc 3 710 0\n"
7467 " ld.volatile.shared.f32 %f4, [%rd7+64];\n"
7468 " add.f32 %f5, %f4, %f3;\n"
7469 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
7470 " .loc 3 711 0\n"
7471 " ld.volatile.shared.f32 %f6, [%rd7+32];\n"
7472 " add.f32 %f7, %f6, %f5;\n"
7473 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
7474 " .loc 3 712 0\n"
7475 " ld.volatile.shared.f32 %f8, [%rd7+16];\n"
7476 " add.f32 %f9, %f8, %f7;\n"
7477 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
7478 " .loc 3 713 0\n"
7479 " ld.volatile.shared.f32 %f10, [%rd7+8];\n"
7480 " add.f32 %f11, %f10, %f9;\n"
7481 " st.volatile.shared.f32 [%rd7+0], %f11;\n"
7482 " .loc 3 714 0\n"
7483 " ld.volatile.shared.f32 %f12, [%rd7+4];\n"
7484 " add.f32 %f1, %f12, %f11;\n"
7485 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
7486 "$Lt_71_16130:\n"
7487 " mov.u32 %r11, 0;\n"
7488 " setp.ne.u32 %p4, %r3, %r11;\n"
7489 " @%p4 bra $Lt_71_16642;\n"
7490 " .loc 3 719 0\n"
7491 " ld.shared.f32 %f13, [__smem+0];\n"
7492 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_64_true_false_g_odata];\n"
7493 " cvt.u64.u32 %rd9, %r1;\n"
7494 " mul.wide.u32 %rd10, %r1, 4;\n"
7495 " add.u64 %rd11, %rd8, %rd10;\n"
7496 " st.global.f32 [%rd11+0], %f13;\n"
7497 "$Lt_71_16642:\n"
7498 " .loc 3 781 0\n"
7499 " exit;\n"
7500 "$LDWend_packed_float_reduce_64_true_false:\n"
7501 " } // packed_float_reduce_64_true_false\n"
7502 "\n"
7503 " .entry packed_float_reduce_64_true_true (\n"
7504 " .param .u64 __cudaparm_packed_float_reduce_64_true_true_g_idata,\n"
7505 " .param .u64 __cudaparm_packed_float_reduce_64_true_true_g_odata,\n"
7506 " .param .u32 __cudaparm_packed_float_reduce_64_true_true_n)\n"
7507 " {\n"
7508 " .reg .u16 %rh<7>;\n"
7509 " .reg .u32 %r<13>;\n"
7510 " .reg .u64 %rd<13>;\n"
7511 " .reg .f32 %f<15>;\n"
7512 " .reg .pred %p<6>;\n"
7513 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
7514 " .loc 3 782 0\n"
7515 "$LDWbegin_packed_float_reduce_64_true_true:\n"
7516 " .loc 3 637 0\n"
7517 " cvt.u32.u16 %r1, %ctaid.x;\n"
7518 " mul24.lo.u32 %r2, %r1, 128;\n"
7519 " cvt.u32.u16 %r3, %tid.x;\n"
7520 " add.u32 %r4, %r2, %r3;\n"
7521 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_64_true_true_n];\n"
7522 " setp.ge.u32 %p1, %r4, %r5;\n"
7523 " @%p1 bra $Lt_72_17154;\n"
7524 " mul.lo.u32 %r6, %r4, 4;\n"
7525 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_64_true_true_n];\n"
7526 " mul.lo.u32 %r7, %r5, 4;\n"
7527 " mov.u16 %rh1, %nctaid.x;\n"
7528 " mul.wide.u16 %r8, %rh1, 512;\n"
7529 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_64_true_true_g_idata];\n"
7530 "$Lt_72_15618:\n"
7531 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
7532 " .loc 3 677 0\n"
7533 " cvt.u8.u32 %r9, %r6;\n"
7534 " cvt.u64.u32 %rd2, %r9;\n"
7535 " .loc 3 637 0\n"
7536 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_64_true_true_g_idata];\n"
7537 " .loc 3 677 0\n"
7538 " add.u64 %rd3, %rd2, %rd1;\n"
7539 " ld.global.u8 %rh2, [%rd3+0];\n"
7540 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
7541 " .loc 3 678 0\n"
7542 " ld.global.u8 %rh3, [%rd3+1];\n"
7543 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
7544 " .loc 3 679 0\n"
7545 " ld.global.u8 %rh4, [%rd3+2];\n"
7546 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
7547 " .loc 3 680 0\n"
7548 " ld.global.u8 %rh5, [%rd3+3];\n"
7549 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
7550 " add.u32 %r6, %r6, %r8;\n"
7551 " setp.lt.u32 %p2, %r6, %r7;\n"
7552 " @%p2 bra $Lt_72_15618;\n"
7553 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
7554 " bra.uni $Lt_72_15106;\n"
7555 "$Lt_72_17154:\n"
7556 " mov.f32 %f1, 0f00000000; // 0\n"
7557 "$Lt_72_15106:\n"
7558 " .loc 3 692 0\n"
7559 " mov.u64 %rd4, __smem;\n"
7560 " cvt.u64.u32 %rd5, %r3;\n"
7561 " mul.wide.u32 %rd6, %r3, 4;\n"
7562 " add.u64 %rd7, %rd4, %rd6;\n"
7563 " st.shared.f32 [%rd7+0], %f1;\n"
7564 " .loc 3 693 0\n"
7565 " bar.sync 0;\n"
7566 " mov.u32 %r10, 31;\n"
7567 " setp.gt.u32 %p3, %r3, %r10;\n"
7568 " @%p3 bra $Lt_72_16130;\n"
7569 " .loc 3 709 0\n"
7570 " ld.volatile.shared.f32 %f2, [%rd7+128];\n"
7571 " add.f32 %f3, %f2, %f1;\n"
7572 " st.volatile.shared.f32 [%rd7+0], %f3;\n"
7573 " .loc 3 710 0\n"
7574 " ld.volatile.shared.f32 %f4, [%rd7+64];\n"
7575 " add.f32 %f5, %f4, %f3;\n"
7576 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
7577 " .loc 3 711 0\n"
7578 " ld.volatile.shared.f32 %f6, [%rd7+32];\n"
7579 " add.f32 %f7, %f6, %f5;\n"
7580 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
7581 " .loc 3 712 0\n"
7582 " ld.volatile.shared.f32 %f8, [%rd7+16];\n"
7583 " add.f32 %f9, %f8, %f7;\n"
7584 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
7585 " .loc 3 713 0\n"
7586 " ld.volatile.shared.f32 %f10, [%rd7+8];\n"
7587 " add.f32 %f11, %f10, %f9;\n"
7588 " st.volatile.shared.f32 [%rd7+0], %f11;\n"
7589 " .loc 3 714 0\n"
7590 " ld.volatile.shared.f32 %f12, [%rd7+4];\n"
7591 " add.f32 %f1, %f12, %f11;\n"
7592 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
7593 "$Lt_72_16130:\n"
7594 " mov.u32 %r11, 0;\n"
7595 " setp.ne.u32 %p4, %r3, %r11;\n"
7596 " @%p4 bra $Lt_72_16642;\n"
7597 " .loc 3 719 0\n"
7598 " ld.shared.f32 %f13, [__smem+0];\n"
7599 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_64_true_true_g_odata];\n"
7600 " cvt.u64.u32 %rd9, %r1;\n"
7601 " mul.wide.u32 %rd10, %r1, 4;\n"
7602 " add.u64 %rd11, %rd8, %rd10;\n"
7603 " st.global.f32 [%rd11+0], %f13;\n"
7604 "$Lt_72_16642:\n"
7605 " .loc 3 783 0\n"
7606 " exit;\n"
7607 "$LDWend_packed_float_reduce_64_true_true:\n"
7608 " } // packed_float_reduce_64_true_true\n"
7609 "\n"
7610 " .entry packed_float_reduce_128_false_false (\n"
7611 " .param .u64 __cudaparm_packed_float_reduce_128_false_false_g_idata,\n"
7612 " .param .u64 __cudaparm_packed_float_reduce_128_false_false_g_odata,\n"
7613 " .param .u32 __cudaparm_packed_float_reduce_128_false_false_n)\n"
7614 " {\n"
7615 " .reg .u16 %rh<7>;\n"
7616 " .reg .u32 %r<14>;\n"
7617 " .reg .u64 %rd<13>;\n"
7618 " .reg .f32 %f<16>;\n"
7619 " .reg .pred %p<7>;\n"
7620 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
7621 " .loc 3 785 0\n"
7622 "$LDWbegin_packed_float_reduce_128_false_false:\n"
7623 " .loc 3 637 0\n"
7624 " cvt.u32.u16 %r1, %ctaid.x;\n"
7625 " mul.lo.u32 %r2, %r1, 256;\n"
7626 " cvt.u32.u16 %r3, %tid.x;\n"
7627 " add.u32 %r4, %r2, %r3;\n"
7628 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_128_false_false_n];\n"
7629 " setp.ge.u32 %p1, %r4, %r5;\n"
7630 " @%p1 bra $Lt_73_17410;\n"
7631 " mul.lo.u32 %r6, %r4, 4;\n"
7632 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_128_false_false_n];\n"
7633 " mul.lo.u32 %r7, %r5, 4;\n"
7634 " mov.u16 %rh1, %nctaid.x;\n"
7635 " mul.wide.u16 %r8, %rh1, 1024;\n"
7636 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_128_false_false_g_idata];\n"
7637 "$Lt_73_15362:\n"
7638 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
7639 " .loc 3 677 0\n"
7640 " cvt.u8.u32 %r9, %r6;\n"
7641 " cvt.u64.u32 %rd2, %r9;\n"
7642 " .loc 3 637 0\n"
7643 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_128_false_false_g_idata];\n"
7644 " .loc 3 677 0\n"
7645 " add.u64 %rd3, %rd2, %rd1;\n"
7646 " ld.global.u8 %rh2, [%rd3+0];\n"
7647 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
7648 " .loc 3 678 0\n"
7649 " ld.global.u8 %rh3, [%rd3+1];\n"
7650 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
7651 " .loc 3 679 0\n"
7652 " ld.global.u8 %rh4, [%rd3+2];\n"
7653 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
7654 " .loc 3 680 0\n"
7655 " ld.global.u8 %rh5, [%rd3+3];\n"
7656 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
7657 " add.u32 %r6, %r6, %r8;\n"
7658 " setp.lt.u32 %p2, %r6, %r7;\n"
7659 " @%p2 bra $Lt_73_15362;\n"
7660 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
7661 " bra.uni $Lt_73_14850;\n"
7662 "$Lt_73_17410:\n"
7663 " mov.f32 %f1, 0f00000000; // 0\n"
7664 "$Lt_73_14850:\n"
7665 " .loc 3 692 0\n"
7666 " mov.u64 %rd4, __smem;\n"
7667 " cvt.u64.u32 %rd5, %r3;\n"
7668 " mul.wide.u32 %rd6, %r3, 4;\n"
7669 " add.u64 %rd7, %rd4, %rd6;\n"
7670 " st.shared.f32 [%rd7+0], %f1;\n"
7671 " .loc 3 693 0\n"
7672 " bar.sync 0;\n"
7673 " mov.u32 %r10, 63;\n"
7674 " setp.gt.u32 %p3, %r3, %r10;\n"
7675 " @%p3 bra $Lt_73_15874;\n"
7676 " .loc 3 699 0\n"
7677 " ld.shared.f32 %f2, [%rd7+256];\n"
7678 " add.f32 %f1, %f2, %f1;\n"
7679 " st.shared.f32 [%rd7+0], %f1;\n"
7680 "$Lt_73_15874:\n"
7681 " bar.sync 0;\n"
7682 " mov.u32 %r11, 31;\n"
7683 " setp.gt.u32 %p4, %r3, %r11;\n"
7684 " @%p4 bra $Lt_73_16386;\n"
7685 " .loc 3 709 0\n"
7686 " ld.volatile.shared.f32 %f3, [%rd7+128];\n"
7687 " add.f32 %f4, %f3, %f1;\n"
7688 " st.volatile.shared.f32 [%rd7+0], %f4;\n"
7689 " .loc 3 710 0\n"
7690 " ld.volatile.shared.f32 %f5, [%rd7+64];\n"
7691 " add.f32 %f6, %f5, %f4;\n"
7692 " st.volatile.shared.f32 [%rd7+0], %f6;\n"
7693 " .loc 3 711 0\n"
7694 " ld.volatile.shared.f32 %f7, [%rd7+32];\n"
7695 " add.f32 %f8, %f7, %f6;\n"
7696 " st.volatile.shared.f32 [%rd7+0], %f8;\n"
7697 " .loc 3 712 0\n"
7698 " ld.volatile.shared.f32 %f9, [%rd7+16];\n"
7699 " add.f32 %f10, %f9, %f8;\n"
7700 " st.volatile.shared.f32 [%rd7+0], %f10;\n"
7701 " .loc 3 713 0\n"
7702 " ld.volatile.shared.f32 %f11, [%rd7+8];\n"
7703 " add.f32 %f12, %f11, %f10;\n"
7704 " st.volatile.shared.f32 [%rd7+0], %f12;\n"
7705 " .loc 3 714 0\n"
7706 " ld.volatile.shared.f32 %f13, [%rd7+4];\n"
7707 " add.f32 %f1, %f13, %f12;\n"
7708 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
7709 "$Lt_73_16386:\n"
7710 " mov.u32 %r12, 0;\n"
7711 " setp.ne.u32 %p5, %r3, %r12;\n"
7712 " @%p5 bra $Lt_73_16898;\n"
7713 " .loc 3 719 0\n"
7714 " ld.shared.f32 %f14, [__smem+0];\n"
7715 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_128_false_false_g_odata];\n"
7716 " cvt.u64.u32 %rd9, %r1;\n"
7717 " mul.wide.u32 %rd10, %r1, 4;\n"
7718 " add.u64 %rd11, %rd8, %rd10;\n"
7719 " st.global.f32 [%rd11+0], %f14;\n"
7720 "$Lt_73_16898:\n"
7721 " .loc 3 786 0\n"
7722 " exit;\n"
7723 "$LDWend_packed_float_reduce_128_false_false:\n"
7724 " } // packed_float_reduce_128_false_false\n"
7725 "\n"
/*
 * PTX entry packed_float_reduce_128_false_true(g_idata, g_odata, n).
 *
 * Each thread builds one f32 value from four bytes read from g_idata, the
 * block combines those values through the __smem shared buffer, and the
 * thread with tid.x == 0 stores __smem[0] to g_odata[ctaid.x] (4-byte slots).
 *
 * This text is compiler output (see the nvopencc banner at the start of the
 * string); lasting changes presumably belong in reduce.cu (.file 3).
 */
7726 " .entry packed_float_reduce_128_false_true (\n"
7727 " .param .u64 __cudaparm_packed_float_reduce_128_false_true_g_idata,\n"
7728 " .param .u64 __cudaparm_packed_float_reduce_128_false_true_g_odata,\n"
7729 " .param .u32 __cudaparm_packed_float_reduce_128_false_true_n)\n"
7730 " {\n"
7731 " .reg .u16 %rh<7>;\n"
7732 " .reg .u32 %r<14>;\n"
7733 " .reg .u64 %rd<13>;\n"
7734 " .reg .f32 %f<16>;\n"
7735 " .reg .pred %p<7>;\n"
/* 4-byte scratch buffer used to assemble one f32 from individual bytes. */
7736 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
7737 " .loc 3 787 0\n"
7738 "$LDWbegin_packed_float_reduce_128_false_true:\n"
7739 " .loc 3 637 0\n"
/* %r4 = ctaid.x * 256 + tid.x; when %r4 >= n the load loop is skipped. */
7740 " cvt.u32.u16 %r1, %ctaid.x;\n"
7741 " mul.lo.u32 %r2, %r1, 256;\n"
7742 " cvt.u32.u16 %r3, %tid.x;\n"
7743 " add.u32 %r4, %r2, %r3;\n"
7744 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_128_false_true_n];\n"
7745 " setp.ge.u32 %p1, %r4, %r5;\n"
7746 " @%p1 bra $Lt_74_17410;\n"
/* %r6 = %r4 * 4 (running byte offset), %r7 = n * 4 (loop bound),
   %r8 = nctaid.x * 1024 (byte stride added per iteration). */
7747 " mul.lo.u32 %r6, %r4, 4;\n"
7748 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_128_false_true_n];\n"
7749 " mul.lo.u32 %r7, %r5, 4;\n"
7750 " mov.u16 %rh1, %nctaid.x;\n"
7751 " mul.wide.u16 %r8, %rh1, 1024;\n"
7752 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_128_false_true_g_idata];\n"
/*
 * Load loop: runs while %r6 < %r7. Each pass reads four consecutive bytes
 * from g_idata and stores them into the local buffer in reverse order
 * (source byte 0 -> buffer byte 3, ..., source byte 3 -> buffer byte 0).
 *
 * NOTE(review): the offset is narrowed to 8 bits (cvt.u8.u32) before it is
 * added to g_idata, so only byte offsets 0..255 are ever addressed - confirm
 * this matches the intent of reduce.cu line 677.
 * NOTE(review): every pass overwrites the same 4-byte buffer and nothing is
 * accumulated inside the loop, so only the last pass reaches %f1 - confirm.
 */
7753 "$Lt_74_15362:\n"
7754 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
7755 " .loc 3 677 0\n"
7756 " cvt.u8.u32 %r9, %r6;\n"
7757 " cvt.u64.u32 %rd2, %r9;\n"
7758 " .loc 3 637 0\n"
7759 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_128_false_true_g_idata];\n"
7760 " .loc 3 677 0\n"
7761 " add.u64 %rd3, %rd2, %rd1;\n"
7762 " ld.global.u8 %rh2, [%rd3+0];\n"
7763 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
7764 " .loc 3 678 0\n"
7765 " ld.global.u8 %rh3, [%rd3+1];\n"
7766 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
7767 " .loc 3 679 0\n"
7768 " ld.global.u8 %rh4, [%rd3+2];\n"
7769 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
7770 " .loc 3 680 0\n"
7771 " ld.global.u8 %rh5, [%rd3+3];\n"
7772 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
7773 " add.u32 %r6, %r6, %r8;\n"
7774 " setp.lt.u32 %p2, %r6, %r7;\n"
7775 " @%p2 bra $Lt_74_15362;\n"
/* Read the four buffered bytes back as a single f32 value. */
7776 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
7777 " bra.uni $Lt_74_14850;\n"
/* Threads whose index is >= n contribute 0.0. */
7778 "$Lt_74_17410:\n"
7779 " mov.f32 %f1, 0f00000000; // 0\n"
7780 "$Lt_74_14850:\n"
7781 " .loc 3 692 0\n"
/* %rd7 = address of __smem[tid.x] (4-byte slots); publish %f1 there and
   synchronise before other threads read it. */
7782 " mov.u64 %rd4, __smem;\n"
7783 " cvt.u64.u32 %rd5, %r3;\n"
7784 " mul.wide.u32 %rd6, %r3, 4;\n"
7785 " add.u64 %rd7, %rd4, %rd6;\n"
7786 " st.shared.f32 [%rd7+0], %f1;\n"
7787 " .loc 3 693 0\n"
7788 " bar.sync 0;\n"
/* Threads with tid.x <= 63 add the slot 256 bytes (64 elements) above their
   own and store the sum back. */
7789 " mov.u32 %r10, 63;\n"
7790 " setp.gt.u32 %p3, %r3, %r10;\n"
7791 " @%p3 bra $Lt_74_15874;\n"
7792 " .loc 3 699 0\n"
7793 " ld.shared.f32 %f2, [%rd7+256];\n"
7794 " add.f32 %f1, %f2, %f1;\n"
7795 " st.shared.f32 [%rd7+0], %f1;\n"
7796 "$Lt_74_15874:\n"
7797 " bar.sync 0;\n"
/*
 * Threads with tid.x <= 31 fold in the slots at byte offsets 128, 64, 32, 16,
 * 8 and 4 above their own (32, 16, 8, 4, 2 and 1 elements), storing the
 * running sum back after each step. The accesses are volatile and there is
 * no bar.sync between the steps.
 * NOTE(review): presumably relies on these 32 threads executing in lockstep
 * - confirm for the targeted hardware.
 */
7798 " mov.u32 %r11, 31;\n"
7799 " setp.gt.u32 %p4, %r3, %r11;\n"
7800 " @%p4 bra $Lt_74_16386;\n"
7801 " .loc 3 709 0\n"
7802 " ld.volatile.shared.f32 %f3, [%rd7+128];\n"
7803 " add.f32 %f4, %f3, %f1;\n"
7804 " st.volatile.shared.f32 [%rd7+0], %f4;\n"
7805 " .loc 3 710 0\n"
7806 " ld.volatile.shared.f32 %f5, [%rd7+64];\n"
7807 " add.f32 %f6, %f5, %f4;\n"
7808 " st.volatile.shared.f32 [%rd7+0], %f6;\n"
7809 " .loc 3 711 0\n"
7810 " ld.volatile.shared.f32 %f7, [%rd7+32];\n"
7811 " add.f32 %f8, %f7, %f6;\n"
7812 " st.volatile.shared.f32 [%rd7+0], %f8;\n"
7813 " .loc 3 712 0\n"
7814 " ld.volatile.shared.f32 %f9, [%rd7+16];\n"
7815 " add.f32 %f10, %f9, %f8;\n"
7816 " st.volatile.shared.f32 [%rd7+0], %f10;\n"
7817 " .loc 3 713 0\n"
7818 " ld.volatile.shared.f32 %f11, [%rd7+8];\n"
7819 " add.f32 %f12, %f11, %f10;\n"
7820 " st.volatile.shared.f32 [%rd7+0], %f12;\n"
7821 " .loc 3 714 0\n"
7822 " ld.volatile.shared.f32 %f13, [%rd7+4];\n"
7823 " add.f32 %f1, %f13, %f12;\n"
7824 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
/* Only the thread with tid.x == 0 writes the result: __smem[0] is stored to
   g_odata + ctaid.x * 4. */
7825 "$Lt_74_16386:\n"
7826 " mov.u32 %r12, 0;\n"
7827 " setp.ne.u32 %p5, %r3, %r12;\n"
7828 " @%p5 bra $Lt_74_16898;\n"
7829 " .loc 3 719 0\n"
7830 " ld.shared.f32 %f14, [__smem+0];\n"
7831 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_128_false_true_g_odata];\n"
7832 " cvt.u64.u32 %rd9, %r1;\n"
7833 " mul.wide.u32 %rd10, %r1, 4;\n"
7834 " add.u64 %rd11, %rd8, %rd10;\n"
7835 " st.global.f32 [%rd11+0], %f14;\n"
7836 "$Lt_74_16898:\n"
7837 " .loc 3 788 0\n"
7838 " exit;\n"
7839 "$LDWend_packed_float_reduce_128_false_true:\n"
7840 " } // packed_float_reduce_128_false_true\n"
7841 "\n"
/*
 * PTX entry packed_float_reduce_128_true_false(g_idata, g_odata, n).
 *
 * Each thread builds one f32 value from four bytes read from g_idata, the
 * block combines those values through the __smem shared buffer, and the
 * thread with tid.x == 0 stores __smem[0] to g_odata[ctaid.x] (4-byte slots).
 *
 * This text is compiler output (see the nvopencc banner at the start of the
 * string); lasting changes presumably belong in reduce.cu (.file 3).
 */
7842 " .entry packed_float_reduce_128_true_false (\n"
7843 " .param .u64 __cudaparm_packed_float_reduce_128_true_false_g_idata,\n"
7844 " .param .u64 __cudaparm_packed_float_reduce_128_true_false_g_odata,\n"
7845 " .param .u32 __cudaparm_packed_float_reduce_128_true_false_n)\n"
7846 " {\n"
7847 " .reg .u16 %rh<7>;\n"
7848 " .reg .u32 %r<14>;\n"
7849 " .reg .u64 %rd<13>;\n"
7850 " .reg .f32 %f<16>;\n"
7851 " .reg .pred %p<7>;\n"
/* 4-byte scratch buffer used to assemble one f32 from individual bytes. */
7852 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
7853 " .loc 3 789 0\n"
7854 "$LDWbegin_packed_float_reduce_128_true_false:\n"
7855 " .loc 3 637 0\n"
/* %r4 = ctaid.x * 256 + tid.x; when %r4 >= n the load loop is skipped. */
7856 " cvt.u32.u16 %r1, %ctaid.x;\n"
7857 " mul.lo.u32 %r2, %r1, 256;\n"
7858 " cvt.u32.u16 %r3, %tid.x;\n"
7859 " add.u32 %r4, %r2, %r3;\n"
7860 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_128_true_false_n];\n"
7861 " setp.ge.u32 %p1, %r4, %r5;\n"
7862 " @%p1 bra $Lt_75_17410;\n"
/* %r6 = %r4 * 4 (running byte offset), %r7 = n * 4 (loop bound),
   %r8 = nctaid.x * 1024 (byte stride added per iteration). */
7863 " mul.lo.u32 %r6, %r4, 4;\n"
7864 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_128_true_false_n];\n"
7865 " mul.lo.u32 %r7, %r5, 4;\n"
7866 " mov.u16 %rh1, %nctaid.x;\n"
7867 " mul.wide.u16 %r8, %rh1, 1024;\n"
7868 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_128_true_false_g_idata];\n"
/*
 * Load loop: runs while %r6 < %r7. Each pass reads four consecutive bytes
 * from g_idata and stores them into the local buffer in reverse order
 * (source byte 0 -> buffer byte 3, ..., source byte 3 -> buffer byte 0).
 *
 * NOTE(review): the offset is narrowed to 8 bits (cvt.u8.u32) before it is
 * added to g_idata, so only byte offsets 0..255 are ever addressed - confirm
 * this matches the intent of reduce.cu line 677.
 * NOTE(review): every pass overwrites the same 4-byte buffer and nothing is
 * accumulated inside the loop, so only the last pass reaches %f1 - confirm.
 */
7869 "$Lt_75_15362:\n"
7870 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
7871 " .loc 3 677 0\n"
7872 " cvt.u8.u32 %r9, %r6;\n"
7873 " cvt.u64.u32 %rd2, %r9;\n"
7874 " .loc 3 637 0\n"
7875 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_128_true_false_g_idata];\n"
7876 " .loc 3 677 0\n"
7877 " add.u64 %rd3, %rd2, %rd1;\n"
7878 " ld.global.u8 %rh2, [%rd3+0];\n"
7879 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
7880 " .loc 3 678 0\n"
7881 " ld.global.u8 %rh3, [%rd3+1];\n"
7882 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
7883 " .loc 3 679 0\n"
7884 " ld.global.u8 %rh4, [%rd3+2];\n"
7885 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
7886 " .loc 3 680 0\n"
7887 " ld.global.u8 %rh5, [%rd3+3];\n"
7888 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
7889 " add.u32 %r6, %r6, %r8;\n"
7890 " setp.lt.u32 %p2, %r6, %r7;\n"
7891 " @%p2 bra $Lt_75_15362;\n"
/* Read the four buffered bytes back as a single f32 value. */
7892 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
7893 " bra.uni $Lt_75_14850;\n"
/* Threads whose index is >= n contribute 0.0. */
7894 "$Lt_75_17410:\n"
7895 " mov.f32 %f1, 0f00000000; // 0\n"
7896 "$Lt_75_14850:\n"
7897 " .loc 3 692 0\n"
/* %rd7 = address of __smem[tid.x] (4-byte slots); publish %f1 there and
   synchronise before other threads read it. */
7898 " mov.u64 %rd4, __smem;\n"
7899 " cvt.u64.u32 %rd5, %r3;\n"
7900 " mul.wide.u32 %rd6, %r3, 4;\n"
7901 " add.u64 %rd7, %rd4, %rd6;\n"
7902 " st.shared.f32 [%rd7+0], %f1;\n"
7903 " .loc 3 693 0\n"
7904 " bar.sync 0;\n"
/* Threads with tid.x <= 63 add the slot 256 bytes (64 elements) above their
   own and store the sum back. */
7905 " mov.u32 %r10, 63;\n"
7906 " setp.gt.u32 %p3, %r3, %r10;\n"
7907 " @%p3 bra $Lt_75_15874;\n"
7908 " .loc 3 699 0\n"
7909 " ld.shared.f32 %f2, [%rd7+256];\n"
7910 " add.f32 %f1, %f2, %f1;\n"
7911 " st.shared.f32 [%rd7+0], %f1;\n"
7912 "$Lt_75_15874:\n"
7913 " bar.sync 0;\n"
/*
 * Threads with tid.x <= 31 fold in the slots at byte offsets 128, 64, 32, 16,
 * 8 and 4 above their own (32, 16, 8, 4, 2 and 1 elements), storing the
 * running sum back after each step. The accesses are volatile and there is
 * no bar.sync between the steps.
 * NOTE(review): presumably relies on these 32 threads executing in lockstep
 * - confirm for the targeted hardware.
 */
7914 " mov.u32 %r11, 31;\n"
7915 " setp.gt.u32 %p4, %r3, %r11;\n"
7916 " @%p4 bra $Lt_75_16386;\n"
7917 " .loc 3 709 0\n"
7918 " ld.volatile.shared.f32 %f3, [%rd7+128];\n"
7919 " add.f32 %f4, %f3, %f1;\n"
7920 " st.volatile.shared.f32 [%rd7+0], %f4;\n"
7921 " .loc 3 710 0\n"
7922 " ld.volatile.shared.f32 %f5, [%rd7+64];\n"
7923 " add.f32 %f6, %f5, %f4;\n"
7924 " st.volatile.shared.f32 [%rd7+0], %f6;\n"
7925 " .loc 3 711 0\n"
7926 " ld.volatile.shared.f32 %f7, [%rd7+32];\n"
7927 " add.f32 %f8, %f7, %f6;\n"
7928 " st.volatile.shared.f32 [%rd7+0], %f8;\n"
7929 " .loc 3 712 0\n"
7930 " ld.volatile.shared.f32 %f9, [%rd7+16];\n"
7931 " add.f32 %f10, %f9, %f8;\n"
7932 " st.volatile.shared.f32 [%rd7+0], %f10;\n"
7933 " .loc 3 713 0\n"
7934 " ld.volatile.shared.f32 %f11, [%rd7+8];\n"
7935 " add.f32 %f12, %f11, %f10;\n"
7936 " st.volatile.shared.f32 [%rd7+0], %f12;\n"
7937 " .loc 3 714 0\n"
7938 " ld.volatile.shared.f32 %f13, [%rd7+4];\n"
7939 " add.f32 %f1, %f13, %f12;\n"
7940 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
/* Only the thread with tid.x == 0 writes the result: __smem[0] is stored to
   g_odata + ctaid.x * 4. */
7941 "$Lt_75_16386:\n"
7942 " mov.u32 %r12, 0;\n"
7943 " setp.ne.u32 %p5, %r3, %r12;\n"
7944 " @%p5 bra $Lt_75_16898;\n"
7945 " .loc 3 719 0\n"
7946 " ld.shared.f32 %f14, [__smem+0];\n"
7947 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_128_true_false_g_odata];\n"
7948 " cvt.u64.u32 %rd9, %r1;\n"
7949 " mul.wide.u32 %rd10, %r1, 4;\n"
7950 " add.u64 %rd11, %rd8, %rd10;\n"
7951 " st.global.f32 [%rd11+0], %f14;\n"
7952 "$Lt_75_16898:\n"
7953 " .loc 3 790 0\n"
7954 " exit;\n"
7955 "$LDWend_packed_float_reduce_128_true_false:\n"
7956 " } // packed_float_reduce_128_true_false\n"
7957 "\n"
/*
 * PTX entry packed_float_reduce_128_true_true(g_idata, g_odata, n).
 *
 * Each thread builds one f32 value from four bytes read from g_idata, the
 * block combines those values through the __smem shared buffer, and the
 * thread with tid.x == 0 stores __smem[0] to g_odata[ctaid.x] (4-byte slots).
 *
 * This text is compiler output (see the nvopencc banner at the start of the
 * string); lasting changes presumably belong in reduce.cu (.file 3).
 */
7958 " .entry packed_float_reduce_128_true_true (\n"
7959 " .param .u64 __cudaparm_packed_float_reduce_128_true_true_g_idata,\n"
7960 " .param .u64 __cudaparm_packed_float_reduce_128_true_true_g_odata,\n"
7961 " .param .u32 __cudaparm_packed_float_reduce_128_true_true_n)\n"
7962 " {\n"
7963 " .reg .u16 %rh<7>;\n"
7964 " .reg .u32 %r<14>;\n"
7965 " .reg .u64 %rd<13>;\n"
7966 " .reg .f32 %f<16>;\n"
7967 " .reg .pred %p<7>;\n"
/* 4-byte scratch buffer used to assemble one f32 from individual bytes. */
7968 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
7969 " .loc 3 791 0\n"
7970 "$LDWbegin_packed_float_reduce_128_true_true:\n"
7971 " .loc 3 637 0\n"
/* %r4 = ctaid.x * 256 + tid.x; when %r4 >= n the load loop is skipped. */
7972 " cvt.u32.u16 %r1, %ctaid.x;\n"
7973 " mul.lo.u32 %r2, %r1, 256;\n"
7974 " cvt.u32.u16 %r3, %tid.x;\n"
7975 " add.u32 %r4, %r2, %r3;\n"
7976 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_128_true_true_n];\n"
7977 " setp.ge.u32 %p1, %r4, %r5;\n"
7978 " @%p1 bra $Lt_76_17410;\n"
/* %r6 = %r4 * 4 (running byte offset), %r7 = n * 4 (loop bound),
   %r8 = nctaid.x * 1024 (byte stride added per iteration). */
7979 " mul.lo.u32 %r6, %r4, 4;\n"
7980 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_128_true_true_n];\n"
7981 " mul.lo.u32 %r7, %r5, 4;\n"
7982 " mov.u16 %rh1, %nctaid.x;\n"
7983 " mul.wide.u16 %r8, %rh1, 1024;\n"
7984 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_128_true_true_g_idata];\n"
/*
 * Load loop: runs while %r6 < %r7. Each pass reads four consecutive bytes
 * from g_idata and stores them into the local buffer in reverse order
 * (source byte 0 -> buffer byte 3, ..., source byte 3 -> buffer byte 0).
 *
 * NOTE(review): the offset is narrowed to 8 bits (cvt.u8.u32) before it is
 * added to g_idata, so only byte offsets 0..255 are ever addressed - confirm
 * this matches the intent of reduce.cu line 677.
 * NOTE(review): every pass overwrites the same 4-byte buffer and nothing is
 * accumulated inside the loop, so only the last pass reaches %f1 - confirm.
 */
7985 "$Lt_76_15362:\n"
7986 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
7987 " .loc 3 677 0\n"
7988 " cvt.u8.u32 %r9, %r6;\n"
7989 " cvt.u64.u32 %rd2, %r9;\n"
7990 " .loc 3 637 0\n"
7991 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_128_true_true_g_idata];\n"
7992 " .loc 3 677 0\n"
7993 " add.u64 %rd3, %rd2, %rd1;\n"
7994 " ld.global.u8 %rh2, [%rd3+0];\n"
7995 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
7996 " .loc 3 678 0\n"
7997 " ld.global.u8 %rh3, [%rd3+1];\n"
7998 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
7999 " .loc 3 679 0\n"
8000 " ld.global.u8 %rh4, [%rd3+2];\n"
8001 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
8002 " .loc 3 680 0\n"
8003 " ld.global.u8 %rh5, [%rd3+3];\n"
8004 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
8005 " add.u32 %r6, %r6, %r8;\n"
8006 " setp.lt.u32 %p2, %r6, %r7;\n"
8007 " @%p2 bra $Lt_76_15362;\n"
/* Read the four buffered bytes back as a single f32 value. */
8008 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
8009 " bra.uni $Lt_76_14850;\n"
/* Threads whose index is >= n contribute 0.0. */
8010 "$Lt_76_17410:\n"
8011 " mov.f32 %f1, 0f00000000; // 0\n"
8012 "$Lt_76_14850:\n"
8013 " .loc 3 692 0\n"
/* %rd7 = address of __smem[tid.x] (4-byte slots); publish %f1 there and
   synchronise before other threads read it. */
8014 " mov.u64 %rd4, __smem;\n"
8015 " cvt.u64.u32 %rd5, %r3;\n"
8016 " mul.wide.u32 %rd6, %r3, 4;\n"
8017 " add.u64 %rd7, %rd4, %rd6;\n"
8018 " st.shared.f32 [%rd7+0], %f1;\n"
8019 " .loc 3 693 0\n"
8020 " bar.sync 0;\n"
/* Threads with tid.x <= 63 add the slot 256 bytes (64 elements) above their
   own and store the sum back. */
8021 " mov.u32 %r10, 63;\n"
8022 " setp.gt.u32 %p3, %r3, %r10;\n"
8023 " @%p3 bra $Lt_76_15874;\n"
8024 " .loc 3 699 0\n"
8025 " ld.shared.f32 %f2, [%rd7+256];\n"
8026 " add.f32 %f1, %f2, %f1;\n"
8027 " st.shared.f32 [%rd7+0], %f1;\n"
8028 "$Lt_76_15874:\n"
8029 " bar.sync 0;\n"
/*
 * Threads with tid.x <= 31 fold in the slots at byte offsets 128, 64, 32, 16,
 * 8 and 4 above their own (32, 16, 8, 4, 2 and 1 elements), storing the
 * running sum back after each step. The accesses are volatile and there is
 * no bar.sync between the steps.
 * NOTE(review): presumably relies on these 32 threads executing in lockstep
 * - confirm for the targeted hardware.
 */
8030 " mov.u32 %r11, 31;\n"
8031 " setp.gt.u32 %p4, %r3, %r11;\n"
8032 " @%p4 bra $Lt_76_16386;\n"
8033 " .loc 3 709 0\n"
8034 " ld.volatile.shared.f32 %f3, [%rd7+128];\n"
8035 " add.f32 %f4, %f3, %f1;\n"
8036 " st.volatile.shared.f32 [%rd7+0], %f4;\n"
8037 " .loc 3 710 0\n"
8038 " ld.volatile.shared.f32 %f5, [%rd7+64];\n"
8039 " add.f32 %f6, %f5, %f4;\n"
8040 " st.volatile.shared.f32 [%rd7+0], %f6;\n"
8041 " .loc 3 711 0\n"
8042 " ld.volatile.shared.f32 %f7, [%rd7+32];\n"
8043 " add.f32 %f8, %f7, %f6;\n"
8044 " st.volatile.shared.f32 [%rd7+0], %f8;\n"
8045 " .loc 3 712 0\n"
8046 " ld.volatile.shared.f32 %f9, [%rd7+16];\n"
8047 " add.f32 %f10, %f9, %f8;\n"
8048 " st.volatile.shared.f32 [%rd7+0], %f10;\n"
8049 " .loc 3 713 0\n"
8050 " ld.volatile.shared.f32 %f11, [%rd7+8];\n"
8051 " add.f32 %f12, %f11, %f10;\n"
8052 " st.volatile.shared.f32 [%rd7+0], %f12;\n"
8053 " .loc 3 714 0\n"
8054 " ld.volatile.shared.f32 %f13, [%rd7+4];\n"
8055 " add.f32 %f1, %f13, %f12;\n"
8056 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
/* Only the thread with tid.x == 0 writes the result: __smem[0] is stored to
   g_odata + ctaid.x * 4. */
8057 "$Lt_76_16386:\n"
8058 " mov.u32 %r12, 0;\n"
8059 " setp.ne.u32 %p5, %r3, %r12;\n"
8060 " @%p5 bra $Lt_76_16898;\n"
8061 " .loc 3 719 0\n"
8062 " ld.shared.f32 %f14, [__smem+0];\n"
8063 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_128_true_true_g_odata];\n"
8064 " cvt.u64.u32 %rd9, %r1;\n"
8065 " mul.wide.u32 %rd10, %r1, 4;\n"
8066 " add.u64 %rd11, %rd8, %rd10;\n"
8067 " st.global.f32 [%rd11+0], %f14;\n"
8068 "$Lt_76_16898:\n"
8069 " .loc 3 792 0\n"
8070 " exit;\n"
8071 "$LDWend_packed_float_reduce_128_true_true:\n"
8072 " } // packed_float_reduce_128_true_true\n"
8073 "\n"
/*
 * PTX entry packed_float_reduce_256_false_false(g_idata, g_odata, n).
 *
 * Each thread builds one f32 value from four bytes read from g_idata, the
 * block combines those values through the __smem shared buffer, and the
 * thread with tid.x == 0 stores __smem[0] to g_odata[ctaid.x] (4-byte slots).
 *
 * This text is compiler output (see the nvopencc banner at the start of the
 * string); lasting changes presumably belong in reduce.cu (.file 3).
 */
8074 " .entry packed_float_reduce_256_false_false (\n"
8075 " .param .u64 __cudaparm_packed_float_reduce_256_false_false_g_idata,\n"
8076 " .param .u64 __cudaparm_packed_float_reduce_256_false_false_g_odata,\n"
8077 " .param .u32 __cudaparm_packed_float_reduce_256_false_false_n)\n"
8078 " {\n"
8079 " .reg .u16 %rh<7>;\n"
8080 " .reg .u32 %r<15>;\n"
8081 " .reg .u64 %rd<13>;\n"
8082 " .reg .f32 %f<17>;\n"
8083 " .reg .pred %p<8>;\n"
/* 4-byte scratch buffer used to assemble one f32 from individual bytes. */
8084 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
8085 " .loc 3 794 0\n"
8086 "$LDWbegin_packed_float_reduce_256_false_false:\n"
8087 " .loc 3 637 0\n"
/* %r4 = ctaid.x * 512 + tid.x; when %r4 >= n the load loop is skipped. */
8088 " cvt.u32.u16 %r1, %ctaid.x;\n"
8089 " mul.lo.u32 %r2, %r1, 512;\n"
8090 " cvt.u32.u16 %r3, %tid.x;\n"
8091 " add.u32 %r4, %r2, %r3;\n"
8092 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_256_false_false_n];\n"
8093 " setp.ge.u32 %p1, %r4, %r5;\n"
8094 " @%p1 bra $Lt_77_17666;\n"
/* %r6 = %r4 * 4 (running byte offset), %r7 = n * 4 (loop bound),
   %r8 = nctaid.x * 2048 (byte stride added per iteration). */
8095 " mul.lo.u32 %r6, %r4, 4;\n"
8096 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_256_false_false_n];\n"
8097 " mul.lo.u32 %r7, %r5, 4;\n"
8098 " mov.u16 %rh1, %nctaid.x;\n"
8099 " mul.wide.u16 %r8, %rh1, 2048;\n"
8100 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_256_false_false_g_idata];\n"
/*
 * Load loop: runs while %r6 < %r7. Each pass reads four consecutive bytes
 * from g_idata and stores them into the local buffer in reverse order
 * (source byte 0 -> buffer byte 3, ..., source byte 3 -> buffer byte 0).
 *
 * NOTE(review): the offset is narrowed to 8 bits (cvt.u8.u32) before it is
 * added to g_idata, so only byte offsets 0..255 are ever addressed - confirm
 * this matches the intent of reduce.cu line 677.
 * NOTE(review): every pass overwrites the same 4-byte buffer and nothing is
 * accumulated inside the loop, so only the last pass reaches %f1 - confirm.
 */
8101 "$Lt_77_15106:\n"
8102 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
8103 " .loc 3 677 0\n"
8104 " cvt.u8.u32 %r9, %r6;\n"
8105 " cvt.u64.u32 %rd2, %r9;\n"
8106 " .loc 3 637 0\n"
8107 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_256_false_false_g_idata];\n"
8108 " .loc 3 677 0\n"
8109 " add.u64 %rd3, %rd2, %rd1;\n"
8110 " ld.global.u8 %rh2, [%rd3+0];\n"
8111 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
8112 " .loc 3 678 0\n"
8113 " ld.global.u8 %rh3, [%rd3+1];\n"
8114 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
8115 " .loc 3 679 0\n"
8116 " ld.global.u8 %rh4, [%rd3+2];\n"
8117 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
8118 " .loc 3 680 0\n"
8119 " ld.global.u8 %rh5, [%rd3+3];\n"
8120 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
8121 " add.u32 %r6, %r6, %r8;\n"
8122 " setp.lt.u32 %p2, %r6, %r7;\n"
8123 " @%p2 bra $Lt_77_15106;\n"
/* Read the four buffered bytes back as a single f32 value. */
8124 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
8125 " bra.uni $Lt_77_14594;\n"
/* Threads whose index is >= n contribute 0.0. */
8126 "$Lt_77_17666:\n"
8127 " mov.f32 %f1, 0f00000000; // 0\n"
8128 "$Lt_77_14594:\n"
8129 " .loc 3 692 0\n"
/* %rd7 = address of __smem[tid.x] (4-byte slots); publish %f1 there and
   synchronise before other threads read it. */
8130 " mov.u64 %rd4, __smem;\n"
8131 " cvt.u64.u32 %rd5, %r3;\n"
8132 " mul.wide.u32 %rd6, %r3, 4;\n"
8133 " add.u64 %rd7, %rd4, %rd6;\n"
8134 " st.shared.f32 [%rd7+0], %f1;\n"
8135 " .loc 3 693 0\n"
8136 " bar.sync 0;\n"
/* Threads with tid.x <= 127 add the slot 512 bytes (128 elements) above
   their own and store the sum back. */
8137 " mov.u32 %r10, 127;\n"
8138 " setp.gt.u32 %p3, %r3, %r10;\n"
8139 " @%p3 bra $Lt_77_15618;\n"
8140 " .loc 3 698 0\n"
8141 " ld.shared.f32 %f2, [%rd7+512];\n"
8142 " add.f32 %f1, %f2, %f1;\n"
8143 " st.shared.f32 [%rd7+0], %f1;\n"
8144 "$Lt_77_15618:\n"
8145 " bar.sync 0;\n"
/* Threads with tid.x <= 63 add the slot 256 bytes (64 elements) above their
   own and store the sum back. */
8146 " mov.u32 %r11, 63;\n"
8147 " setp.gt.u32 %p4, %r3, %r11;\n"
8148 " @%p4 bra $Lt_77_16130;\n"
8149 " .loc 3 699 0\n"
8150 " ld.shared.f32 %f3, [%rd7+256];\n"
8151 " add.f32 %f1, %f3, %f1;\n"
8152 " st.shared.f32 [%rd7+0], %f1;\n"
8153 "$Lt_77_16130:\n"
8154 " bar.sync 0;\n"
/*
 * Threads with tid.x <= 31 fold in the slots at byte offsets 128, 64, 32, 16,
 * 8 and 4 above their own (32, 16, 8, 4, 2 and 1 elements), storing the
 * running sum back after each step. The accesses are volatile and there is
 * no bar.sync between the steps.
 * NOTE(review): presumably relies on these 32 threads executing in lockstep
 * - confirm for the targeted hardware.
 */
8155 " mov.u32 %r12, 31;\n"
8156 " setp.gt.u32 %p5, %r3, %r12;\n"
8157 " @%p5 bra $Lt_77_16642;\n"
8158 " .loc 3 709 0\n"
8159 " ld.volatile.shared.f32 %f4, [%rd7+128];\n"
8160 " add.f32 %f5, %f4, %f1;\n"
8161 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
8162 " .loc 3 710 0\n"
8163 " ld.volatile.shared.f32 %f6, [%rd7+64];\n"
8164 " add.f32 %f7, %f6, %f5;\n"
8165 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
8166 " .loc 3 711 0\n"
8167 " ld.volatile.shared.f32 %f8, [%rd7+32];\n"
8168 " add.f32 %f9, %f8, %f7;\n"
8169 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
8170 " .loc 3 712 0\n"
8171 " ld.volatile.shared.f32 %f10, [%rd7+16];\n"
8172 " add.f32 %f11, %f10, %f9;\n"
8173 " st.volatile.shared.f32 [%rd7+0], %f11;\n"
8174 " .loc 3 713 0\n"
8175 " ld.volatile.shared.f32 %f12, [%rd7+8];\n"
8176 " add.f32 %f13, %f12, %f11;\n"
8177 " st.volatile.shared.f32 [%rd7+0], %f13;\n"
8178 " .loc 3 714 0\n"
8179 " ld.volatile.shared.f32 %f14, [%rd7+4];\n"
8180 " add.f32 %f1, %f14, %f13;\n"
8181 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
/* Only the thread with tid.x == 0 writes the result: __smem[0] is stored to
   g_odata + ctaid.x * 4. */
8182 "$Lt_77_16642:\n"
8183 " mov.u32 %r13, 0;\n"
8184 " setp.ne.u32 %p6, %r3, %r13;\n"
8185 " @%p6 bra $Lt_77_17154;\n"
8186 " .loc 3 719 0\n"
8187 " ld.shared.f32 %f15, [__smem+0];\n"
8188 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_256_false_false_g_odata];\n"
8189 " cvt.u64.u32 %rd9, %r1;\n"
8190 " mul.wide.u32 %rd10, %r1, 4;\n"
8191 " add.u64 %rd11, %rd8, %rd10;\n"
8192 " st.global.f32 [%rd11+0], %f15;\n"
8193 "$Lt_77_17154:\n"
8194 " .loc 3 795 0\n"
8195 " exit;\n"
8196 "$LDWend_packed_float_reduce_256_false_false:\n"
8197 " } // packed_float_reduce_256_false_false\n"
8198 "\n"
/*
 * PTX entry packed_float_reduce_256_false_true(g_idata, g_odata, n).
 *
 * Each thread builds one f32 value from four bytes read from g_idata, the
 * block combines those values through the __smem shared buffer, and the
 * thread with tid.x == 0 stores __smem[0] to g_odata[ctaid.x] (4-byte slots).
 *
 * This text is compiler output (see the nvopencc banner at the start of the
 * string); lasting changes presumably belong in reduce.cu (.file 3).
 */
8199 " .entry packed_float_reduce_256_false_true (\n"
8200 " .param .u64 __cudaparm_packed_float_reduce_256_false_true_g_idata,\n"
8201 " .param .u64 __cudaparm_packed_float_reduce_256_false_true_g_odata,\n"
8202 " .param .u32 __cudaparm_packed_float_reduce_256_false_true_n)\n"
8203 " {\n"
8204 " .reg .u16 %rh<7>;\n"
8205 " .reg .u32 %r<15>;\n"
8206 " .reg .u64 %rd<13>;\n"
8207 " .reg .f32 %f<17>;\n"
8208 " .reg .pred %p<8>;\n"
/* 4-byte scratch buffer used to assemble one f32 from individual bytes. */
8209 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
8210 " .loc 3 796 0\n"
8211 "$LDWbegin_packed_float_reduce_256_false_true:\n"
8212 " .loc 3 637 0\n"
/* %r4 = ctaid.x * 512 + tid.x; when %r4 >= n the load loop is skipped. */
8213 " cvt.u32.u16 %r1, %ctaid.x;\n"
8214 " mul.lo.u32 %r2, %r1, 512;\n"
8215 " cvt.u32.u16 %r3, %tid.x;\n"
8216 " add.u32 %r4, %r2, %r3;\n"
8217 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_256_false_true_n];\n"
8218 " setp.ge.u32 %p1, %r4, %r5;\n"
8219 " @%p1 bra $Lt_78_17666;\n"
/* %r6 = %r4 * 4 (running byte offset), %r7 = n * 4 (loop bound),
   %r8 = nctaid.x * 2048 (byte stride added per iteration). */
8220 " mul.lo.u32 %r6, %r4, 4;\n"
8221 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_256_false_true_n];\n"
8222 " mul.lo.u32 %r7, %r5, 4;\n"
8223 " mov.u16 %rh1, %nctaid.x;\n"
8224 " mul.wide.u16 %r8, %rh1, 2048;\n"
8225 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_256_false_true_g_idata];\n"
/*
 * Load loop: runs while %r6 < %r7. Each pass reads four consecutive bytes
 * from g_idata and stores them into the local buffer in reverse order
 * (source byte 0 -> buffer byte 3, ..., source byte 3 -> buffer byte 0).
 *
 * NOTE(review): the offset is narrowed to 8 bits (cvt.u8.u32) before it is
 * added to g_idata, so only byte offsets 0..255 are ever addressed - confirm
 * this matches the intent of reduce.cu line 677.
 * NOTE(review): every pass overwrites the same 4-byte buffer and nothing is
 * accumulated inside the loop, so only the last pass reaches %f1 - confirm.
 */
8226 "$Lt_78_15106:\n"
8227 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
8228 " .loc 3 677 0\n"
8229 " cvt.u8.u32 %r9, %r6;\n"
8230 " cvt.u64.u32 %rd2, %r9;\n"
8231 " .loc 3 637 0\n"
8232 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_256_false_true_g_idata];\n"
8233 " .loc 3 677 0\n"
8234 " add.u64 %rd3, %rd2, %rd1;\n"
8235 " ld.global.u8 %rh2, [%rd3+0];\n"
8236 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
8237 " .loc 3 678 0\n"
8238 " ld.global.u8 %rh3, [%rd3+1];\n"
8239 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
8240 " .loc 3 679 0\n"
8241 " ld.global.u8 %rh4, [%rd3+2];\n"
8242 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
8243 " .loc 3 680 0\n"
8244 " ld.global.u8 %rh5, [%rd3+3];\n"
8245 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
8246 " add.u32 %r6, %r6, %r8;\n"
8247 " setp.lt.u32 %p2, %r6, %r7;\n"
8248 " @%p2 bra $Lt_78_15106;\n"
/* Read the four buffered bytes back as a single f32 value. */
8249 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
8250 " bra.uni $Lt_78_14594;\n"
/* Threads whose index is >= n contribute 0.0. */
8251 "$Lt_78_17666:\n"
8252 " mov.f32 %f1, 0f00000000; // 0\n"
8253 "$Lt_78_14594:\n"
8254 " .loc 3 692 0\n"
/* %rd7 = address of __smem[tid.x] (4-byte slots); publish %f1 there and
   synchronise before other threads read it. */
8255 " mov.u64 %rd4, __smem;\n"
8256 " cvt.u64.u32 %rd5, %r3;\n"
8257 " mul.wide.u32 %rd6, %r3, 4;\n"
8258 " add.u64 %rd7, %rd4, %rd6;\n"
8259 " st.shared.f32 [%rd7+0], %f1;\n"
8260 " .loc 3 693 0\n"
8261 " bar.sync 0;\n"
/* Threads with tid.x <= 127 add the slot 512 bytes (128 elements) above
   their own and store the sum back. */
8262 " mov.u32 %r10, 127;\n"
8263 " setp.gt.u32 %p3, %r3, %r10;\n"
8264 " @%p3 bra $Lt_78_15618;\n"
8265 " .loc 3 698 0\n"
8266 " ld.shared.f32 %f2, [%rd7+512];\n"
8267 " add.f32 %f1, %f2, %f1;\n"
8268 " st.shared.f32 [%rd7+0], %f1;\n"
8269 "$Lt_78_15618:\n"
8270 " bar.sync 0;\n"
/* Threads with tid.x <= 63 add the slot 256 bytes (64 elements) above their
   own and store the sum back. */
8271 " mov.u32 %r11, 63;\n"
8272 " setp.gt.u32 %p4, %r3, %r11;\n"
8273 " @%p4 bra $Lt_78_16130;\n"
8274 " .loc 3 699 0\n"
8275 " ld.shared.f32 %f3, [%rd7+256];\n"
8276 " add.f32 %f1, %f3, %f1;\n"
8277 " st.shared.f32 [%rd7+0], %f1;\n"
8278 "$Lt_78_16130:\n"
8279 " bar.sync 0;\n"
/*
 * Threads with tid.x <= 31 fold in the slots at byte offsets 128, 64, 32, 16,
 * 8 and 4 above their own (32, 16, 8, 4, 2 and 1 elements), storing the
 * running sum back after each step. The accesses are volatile and there is
 * no bar.sync between the steps.
 * NOTE(review): presumably relies on these 32 threads executing in lockstep
 * - confirm for the targeted hardware.
 */
8280 " mov.u32 %r12, 31;\n"
8281 " setp.gt.u32 %p5, %r3, %r12;\n"
8282 " @%p5 bra $Lt_78_16642;\n"
8283 " .loc 3 709 0\n"
8284 " ld.volatile.shared.f32 %f4, [%rd7+128];\n"
8285 " add.f32 %f5, %f4, %f1;\n"
8286 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
8287 " .loc 3 710 0\n"
8288 " ld.volatile.shared.f32 %f6, [%rd7+64];\n"
8289 " add.f32 %f7, %f6, %f5;\n"
8290 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
8291 " .loc 3 711 0\n"
8292 " ld.volatile.shared.f32 %f8, [%rd7+32];\n"
8293 " add.f32 %f9, %f8, %f7;\n"
8294 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
8295 " .loc 3 712 0\n"
8296 " ld.volatile.shared.f32 %f10, [%rd7+16];\n"
8297 " add.f32 %f11, %f10, %f9;\n"
8298 " st.volatile.shared.f32 [%rd7+0], %f11;\n"
8299 " .loc 3 713 0\n"
8300 " ld.volatile.shared.f32 %f12, [%rd7+8];\n"
8301 " add.f32 %f13, %f12, %f11;\n"
8302 " st.volatile.shared.f32 [%rd7+0], %f13;\n"
8303 " .loc 3 714 0\n"
8304 " ld.volatile.shared.f32 %f14, [%rd7+4];\n"
8305 " add.f32 %f1, %f14, %f13;\n"
8306 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
/* Only the thread with tid.x == 0 writes the result: __smem[0] is stored to
   g_odata + ctaid.x * 4. */
8307 "$Lt_78_16642:\n"
8308 " mov.u32 %r13, 0;\n"
8309 " setp.ne.u32 %p6, %r3, %r13;\n"
8310 " @%p6 bra $Lt_78_17154;\n"
8311 " .loc 3 719 0\n"
8312 " ld.shared.f32 %f15, [__smem+0];\n"
8313 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_256_false_true_g_odata];\n"
8314 " cvt.u64.u32 %rd9, %r1;\n"
8315 " mul.wide.u32 %rd10, %r1, 4;\n"
8316 " add.u64 %rd11, %rd8, %rd10;\n"
8317 " st.global.f32 [%rd11+0], %f15;\n"
8318 "$Lt_78_17154:\n"
8319 " .loc 3 797 0\n"
8320 " exit;\n"
8321 "$LDWend_packed_float_reduce_256_false_true:\n"
8322 " } // packed_float_reduce_256_false_true\n"
8323 "\n"
/*
 * PTX entry packed_float_reduce_256_true_false(g_idata, g_odata, n).
 *
 * Each thread builds one f32 value from four bytes read from g_idata, the
 * block combines those values through the __smem shared buffer, and the
 * thread with tid.x == 0 stores __smem[0] to g_odata[ctaid.x] (4-byte slots).
 *
 * This text is compiler output (see the nvopencc banner at the start of the
 * string); lasting changes presumably belong in reduce.cu (.file 3).
 */
8324 " .entry packed_float_reduce_256_true_false (\n"
8325 " .param .u64 __cudaparm_packed_float_reduce_256_true_false_g_idata,\n"
8326 " .param .u64 __cudaparm_packed_float_reduce_256_true_false_g_odata,\n"
8327 " .param .u32 __cudaparm_packed_float_reduce_256_true_false_n)\n"
8328 " {\n"
8329 " .reg .u16 %rh<7>;\n"
8330 " .reg .u32 %r<15>;\n"
8331 " .reg .u64 %rd<13>;\n"
8332 " .reg .f32 %f<17>;\n"
8333 " .reg .pred %p<8>;\n"
/* 4-byte scratch buffer used to assemble one f32 from individual bytes. */
8334 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
8335 " .loc 3 798 0\n"
8336 "$LDWbegin_packed_float_reduce_256_true_false:\n"
8337 " .loc 3 637 0\n"
/* %r4 = ctaid.x * 512 + tid.x; when %r4 >= n the load loop is skipped. */
8338 " cvt.u32.u16 %r1, %ctaid.x;\n"
8339 " mul.lo.u32 %r2, %r1, 512;\n"
8340 " cvt.u32.u16 %r3, %tid.x;\n"
8341 " add.u32 %r4, %r2, %r3;\n"
8342 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_256_true_false_n];\n"
8343 " setp.ge.u32 %p1, %r4, %r5;\n"
8344 " @%p1 bra $Lt_79_17666;\n"
/* %r6 = %r4 * 4 (running byte offset), %r7 = n * 4 (loop bound),
   %r8 = nctaid.x * 2048 (byte stride added per iteration). */
8345 " mul.lo.u32 %r6, %r4, 4;\n"
8346 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_256_true_false_n];\n"
8347 " mul.lo.u32 %r7, %r5, 4;\n"
8348 " mov.u16 %rh1, %nctaid.x;\n"
8349 " mul.wide.u16 %r8, %rh1, 2048;\n"
8350 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_256_true_false_g_idata];\n"
/*
 * Load loop: runs while %r6 < %r7. Each pass reads four consecutive bytes
 * from g_idata and stores them into the local buffer in reverse order
 * (source byte 0 -> buffer byte 3, ..., source byte 3 -> buffer byte 0).
 *
 * NOTE(review): the offset is narrowed to 8 bits (cvt.u8.u32) before it is
 * added to g_idata, so only byte offsets 0..255 are ever addressed - confirm
 * this matches the intent of reduce.cu line 677.
 * NOTE(review): every pass overwrites the same 4-byte buffer and nothing is
 * accumulated inside the loop, so only the last pass reaches %f1 - confirm.
 */
8351 "$Lt_79_15106:\n"
8352 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
8353 " .loc 3 677 0\n"
8354 " cvt.u8.u32 %r9, %r6;\n"
8355 " cvt.u64.u32 %rd2, %r9;\n"
8356 " .loc 3 637 0\n"
8357 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_256_true_false_g_idata];\n"
8358 " .loc 3 677 0\n"
8359 " add.u64 %rd3, %rd2, %rd1;\n"
8360 " ld.global.u8 %rh2, [%rd3+0];\n"
8361 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
8362 " .loc 3 678 0\n"
8363 " ld.global.u8 %rh3, [%rd3+1];\n"
8364 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
8365 " .loc 3 679 0\n"
8366 " ld.global.u8 %rh4, [%rd3+2];\n"
8367 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
8368 " .loc 3 680 0\n"
8369 " ld.global.u8 %rh5, [%rd3+3];\n"
8370 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
8371 " add.u32 %r6, %r6, %r8;\n"
8372 " setp.lt.u32 %p2, %r6, %r7;\n"
8373 " @%p2 bra $Lt_79_15106;\n"
/* Read the four buffered bytes back as a single f32 value. */
8374 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
8375 " bra.uni $Lt_79_14594;\n"
/* Threads whose index is >= n contribute 0.0. */
8376 "$Lt_79_17666:\n"
8377 " mov.f32 %f1, 0f00000000; // 0\n"
8378 "$Lt_79_14594:\n"
8379 " .loc 3 692 0\n"
/* %rd7 = address of __smem[tid.x] (4-byte slots); publish %f1 there and
   synchronise before other threads read it. */
8380 " mov.u64 %rd4, __smem;\n"
8381 " cvt.u64.u32 %rd5, %r3;\n"
8382 " mul.wide.u32 %rd6, %r3, 4;\n"
8383 " add.u64 %rd7, %rd4, %rd6;\n"
8384 " st.shared.f32 [%rd7+0], %f1;\n"
8385 " .loc 3 693 0\n"
8386 " bar.sync 0;\n"
/* Threads with tid.x <= 127 add the slot 512 bytes (128 elements) above
   their own and store the sum back. */
8387 " mov.u32 %r10, 127;\n"
8388 " setp.gt.u32 %p3, %r3, %r10;\n"
8389 " @%p3 bra $Lt_79_15618;\n"
8390 " .loc 3 698 0\n"
8391 " ld.shared.f32 %f2, [%rd7+512];\n"
8392 " add.f32 %f1, %f2, %f1;\n"
8393 " st.shared.f32 [%rd7+0], %f1;\n"
8394 "$Lt_79_15618:\n"
8395 " bar.sync 0;\n"
/* Threads with tid.x <= 63 add the slot 256 bytes (64 elements) above their
   own and store the sum back. */
8396 " mov.u32 %r11, 63;\n"
8397 " setp.gt.u32 %p4, %r3, %r11;\n"
8398 " @%p4 bra $Lt_79_16130;\n"
8399 " .loc 3 699 0\n"
8400 " ld.shared.f32 %f3, [%rd7+256];\n"
8401 " add.f32 %f1, %f3, %f1;\n"
8402 " st.shared.f32 [%rd7+0], %f1;\n"
8403 "$Lt_79_16130:\n"
8404 " bar.sync 0;\n"
/*
 * Threads with tid.x <= 31 fold in the slots at byte offsets 128, 64, 32, 16,
 * 8 and 4 above their own (32, 16, 8, 4, 2 and 1 elements), storing the
 * running sum back after each step. The accesses are volatile and there is
 * no bar.sync between the steps.
 * NOTE(review): presumably relies on these 32 threads executing in lockstep
 * - confirm for the targeted hardware.
 */
8405 " mov.u32 %r12, 31;\n"
8406 " setp.gt.u32 %p5, %r3, %r12;\n"
8407 " @%p5 bra $Lt_79_16642;\n"
8408 " .loc 3 709 0\n"
8409 " ld.volatile.shared.f32 %f4, [%rd7+128];\n"
8410 " add.f32 %f5, %f4, %f1;\n"
8411 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
8412 " .loc 3 710 0\n"
8413 " ld.volatile.shared.f32 %f6, [%rd7+64];\n"
8414 " add.f32 %f7, %f6, %f5;\n"
8415 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
8416 " .loc 3 711 0\n"
8417 " ld.volatile.shared.f32 %f8, [%rd7+32];\n"
8418 " add.f32 %f9, %f8, %f7;\n"
8419 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
8420 " .loc 3 712 0\n"
8421 " ld.volatile.shared.f32 %f10, [%rd7+16];\n"
8422 " add.f32 %f11, %f10, %f9;\n"
8423 " st.volatile.shared.f32 [%rd7+0], %f11;\n"
8424 " .loc 3 713 0\n"
8425 " ld.volatile.shared.f32 %f12, [%rd7+8];\n"
8426 " add.f32 %f13, %f12, %f11;\n"
8427 " st.volatile.shared.f32 [%rd7+0], %f13;\n"
8428 " .loc 3 714 0\n"
8429 " ld.volatile.shared.f32 %f14, [%rd7+4];\n"
8430 " add.f32 %f1, %f14, %f13;\n"
8431 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
/* Only the thread with tid.x == 0 writes the result: __smem[0] is stored to
   g_odata + ctaid.x * 4. */
8432 "$Lt_79_16642:\n"
8433 " mov.u32 %r13, 0;\n"
8434 " setp.ne.u32 %p6, %r3, %r13;\n"
8435 " @%p6 bra $Lt_79_17154;\n"
8436 " .loc 3 719 0\n"
8437 " ld.shared.f32 %f15, [__smem+0];\n"
8438 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_256_true_false_g_odata];\n"
8439 " cvt.u64.u32 %rd9, %r1;\n"
8440 " mul.wide.u32 %rd10, %r1, 4;\n"
8441 " add.u64 %rd11, %rd8, %rd10;\n"
8442 " st.global.f32 [%rd11+0], %f15;\n"
8443 "$Lt_79_17154:\n"
8444 " .loc 3 799 0\n"
8445 " exit;\n"
8446 "$LDWend_packed_float_reduce_256_true_false:\n"
8447 " } // packed_float_reduce_256_true_false\n"
8448 "\n"
/*
 * PTX entry packed_float_reduce_256_true_true(g_idata, g_odata, n).
 *
 * Each thread builds one f32 value from four bytes read from g_idata, the
 * block combines those values through the __smem shared buffer, and the
 * thread with tid.x == 0 stores __smem[0] to g_odata[ctaid.x] (4-byte slots).
 *
 * This text is compiler output (see the nvopencc banner at the start of the
 * string); lasting changes presumably belong in reduce.cu (.file 3).
 */
8449 " .entry packed_float_reduce_256_true_true (\n"
8450 " .param .u64 __cudaparm_packed_float_reduce_256_true_true_g_idata,\n"
8451 " .param .u64 __cudaparm_packed_float_reduce_256_true_true_g_odata,\n"
8452 " .param .u32 __cudaparm_packed_float_reduce_256_true_true_n)\n"
8453 " {\n"
8454 " .reg .u16 %rh<7>;\n"
8455 " .reg .u32 %r<15>;\n"
8456 " .reg .u64 %rd<13>;\n"
8457 " .reg .f32 %f<17>;\n"
8458 " .reg .pred %p<8>;\n"
/* 4-byte scratch buffer used to assemble one f32 from individual bytes. */
8459 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
8460 " .loc 3 800 0\n"
8461 "$LDWbegin_packed_float_reduce_256_true_true:\n"
8462 " .loc 3 637 0\n"
/* %r4 = ctaid.x * 512 + tid.x; when %r4 >= n the load loop is skipped. */
8463 " cvt.u32.u16 %r1, %ctaid.x;\n"
8464 " mul.lo.u32 %r2, %r1, 512;\n"
8465 " cvt.u32.u16 %r3, %tid.x;\n"
8466 " add.u32 %r4, %r2, %r3;\n"
8467 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_256_true_true_n];\n"
8468 " setp.ge.u32 %p1, %r4, %r5;\n"
8469 " @%p1 bra $Lt_80_17666;\n"
/* %r6 = %r4 * 4 (running byte offset), %r7 = n * 4 (loop bound),
   %r8 = nctaid.x * 2048 (byte stride added per iteration). */
8470 " mul.lo.u32 %r6, %r4, 4;\n"
8471 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_256_true_true_n];\n"
8472 " mul.lo.u32 %r7, %r5, 4;\n"
8473 " mov.u16 %rh1, %nctaid.x;\n"
8474 " mul.wide.u16 %r8, %rh1, 2048;\n"
8475 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_256_true_true_g_idata];\n"
/*
 * Load loop: runs while %r6 < %r7. Each pass reads four consecutive bytes
 * from g_idata and stores them into the local buffer in reverse order
 * (source byte 0 -> buffer byte 3, ..., source byte 3 -> buffer byte 0).
 *
 * NOTE(review): the offset is narrowed to 8 bits (cvt.u8.u32) before it is
 * added to g_idata, so only byte offsets 0..255 are ever addressed - confirm
 * this matches the intent of reduce.cu line 677.
 * NOTE(review): every pass overwrites the same 4-byte buffer and nothing is
 * accumulated inside the loop, so only the last pass reaches %f1 - confirm.
 */
8476 "$Lt_80_15106:\n"
8477 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
8478 " .loc 3 677 0\n"
8479 " cvt.u8.u32 %r9, %r6;\n"
8480 " cvt.u64.u32 %rd2, %r9;\n"
8481 " .loc 3 637 0\n"
8482 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_256_true_true_g_idata];\n"
8483 " .loc 3 677 0\n"
8484 " add.u64 %rd3, %rd2, %rd1;\n"
8485 " ld.global.u8 %rh2, [%rd3+0];\n"
8486 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
8487 " .loc 3 678 0\n"
8488 " ld.global.u8 %rh3, [%rd3+1];\n"
8489 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
8490 " .loc 3 679 0\n"
8491 " ld.global.u8 %rh4, [%rd3+2];\n"
8492 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
8493 " .loc 3 680 0\n"
8494 " ld.global.u8 %rh5, [%rd3+3];\n"
8495 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
8496 " add.u32 %r6, %r6, %r8;\n"
8497 " setp.lt.u32 %p2, %r6, %r7;\n"
8498 " @%p2 bra $Lt_80_15106;\n"
/* Read the four buffered bytes back as a single f32 value. */
8499 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
8500 " bra.uni $Lt_80_14594;\n"
/* Threads whose index is >= n contribute 0.0. */
8501 "$Lt_80_17666:\n"
8502 " mov.f32 %f1, 0f00000000; // 0\n"
8503 "$Lt_80_14594:\n"
8504 " .loc 3 692 0\n"
/* %rd7 = address of __smem[tid.x] (4-byte slots); publish %f1 there and
   synchronise before other threads read it. */
8505 " mov.u64 %rd4, __smem;\n"
8506 " cvt.u64.u32 %rd5, %r3;\n"
8507 " mul.wide.u32 %rd6, %r3, 4;\n"
8508 " add.u64 %rd7, %rd4, %rd6;\n"
8509 " st.shared.f32 [%rd7+0], %f1;\n"
8510 " .loc 3 693 0\n"
8511 " bar.sync 0;\n"
/* Threads with tid.x <= 127 add the slot 512 bytes (128 elements) above
   their own and store the sum back. */
8512 " mov.u32 %r10, 127;\n"
8513 " setp.gt.u32 %p3, %r3, %r10;\n"
8514 " @%p3 bra $Lt_80_15618;\n"
8515 " .loc 3 698 0\n"
8516 " ld.shared.f32 %f2, [%rd7+512];\n"
8517 " add.f32 %f1, %f2, %f1;\n"
8518 " st.shared.f32 [%rd7+0], %f1;\n"
8519 "$Lt_80_15618:\n"
8520 " bar.sync 0;\n"
/* Threads with tid.x <= 63 add the slot 256 bytes (64 elements) above their
   own and store the sum back. */
8521 " mov.u32 %r11, 63;\n"
8522 " setp.gt.u32 %p4, %r3, %r11;\n"
8523 " @%p4 bra $Lt_80_16130;\n"
8524 " .loc 3 699 0\n"
8525 " ld.shared.f32 %f3, [%rd7+256];\n"
8526 " add.f32 %f1, %f3, %f1;\n"
8527 " st.shared.f32 [%rd7+0], %f1;\n"
8528 "$Lt_80_16130:\n"
8529 " bar.sync 0;\n"
/*
 * Threads with tid.x <= 31 fold in the slots at byte offsets 128, 64, 32, 16,
 * 8 and 4 above their own (32, 16, 8, 4, 2 and 1 elements), storing the
 * running sum back after each step. The accesses are volatile and there is
 * no bar.sync between the steps.
 * NOTE(review): presumably relies on these 32 threads executing in lockstep
 * - confirm for the targeted hardware.
 */
8530 " mov.u32 %r12, 31;\n"
8531 " setp.gt.u32 %p5, %r3, %r12;\n"
8532 " @%p5 bra $Lt_80_16642;\n"
8533 " .loc 3 709 0\n"
8534 " ld.volatile.shared.f32 %f4, [%rd7+128];\n"
8535 " add.f32 %f5, %f4, %f1;\n"
8536 " st.volatile.shared.f32 [%rd7+0], %f5;\n"
8537 " .loc 3 710 0\n"
8538 " ld.volatile.shared.f32 %f6, [%rd7+64];\n"
8539 " add.f32 %f7, %f6, %f5;\n"
8540 " st.volatile.shared.f32 [%rd7+0], %f7;\n"
8541 " .loc 3 711 0\n"
8542 " ld.volatile.shared.f32 %f8, [%rd7+32];\n"
8543 " add.f32 %f9, %f8, %f7;\n"
8544 " st.volatile.shared.f32 [%rd7+0], %f9;\n"
8545 " .loc 3 712 0\n"
8546 " ld.volatile.shared.f32 %f10, [%rd7+16];\n"
8547 " add.f32 %f11, %f10, %f9;\n"
8548 " st.volatile.shared.f32 [%rd7+0], %f11;\n"
8549 " .loc 3 713 0\n"
8550 " ld.volatile.shared.f32 %f12, [%rd7+8];\n"
8551 " add.f32 %f13, %f12, %f11;\n"
8552 " st.volatile.shared.f32 [%rd7+0], %f13;\n"
8553 " .loc 3 714 0\n"
8554 " ld.volatile.shared.f32 %f14, [%rd7+4];\n"
8555 " add.f32 %f1, %f14, %f13;\n"
8556 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
/* Only the thread with tid.x == 0 writes the result: __smem[0] is stored to
   g_odata + ctaid.x * 4. */
8557 "$Lt_80_16642:\n"
8558 " mov.u32 %r13, 0;\n"
8559 " setp.ne.u32 %p6, %r3, %r13;\n"
8560 " @%p6 bra $Lt_80_17154;\n"
8561 " .loc 3 719 0\n"
8562 " ld.shared.f32 %f15, [__smem+0];\n"
8563 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_256_true_true_g_odata];\n"
8564 " cvt.u64.u32 %rd9, %r1;\n"
8565 " mul.wide.u32 %rd10, %r1, 4;\n"
8566 " add.u64 %rd11, %rd8, %rd10;\n"
8567 " st.global.f32 [%rd11+0], %f15;\n"
8568 "$Lt_80_17154:\n"
8569 " .loc 3 801 0\n"
8570 " exit;\n"
8571 "$LDWend_packed_float_reduce_256_true_true:\n"
8572 " } // packed_float_reduce_256_true_true\n"
8573 "\n"
8574 " .entry packed_float_reduce_512_false_false (\n"
8575 " .param .u64 __cudaparm_packed_float_reduce_512_false_false_g_idata,\n"
8576 " .param .u64 __cudaparm_packed_float_reduce_512_false_false_g_odata,\n"
8577 " .param .u32 __cudaparm_packed_float_reduce_512_false_false_n)\n"
8578 " {\n"
8579 " .reg .u16 %rh<7>;\n"
8580 " .reg .u32 %r<16>;\n"
8581 " .reg .u64 %rd<13>;\n"
8582 " .reg .f32 %f<18>;\n"
8583 " .reg .pred %p<9>;\n"
8584 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
8585 " .loc 3 803 0\n"
8586 "$LDWbegin_packed_float_reduce_512_false_false:\n"
8587 " .loc 3 637 0\n"
8588 " cvt.u32.u16 %r1, %ctaid.x;\n"
8589 " mul.lo.u32 %r2, %r1, 1024;\n"
8590 " cvt.u32.u16 %r3, %tid.x;\n"
8591 " add.u32 %r4, %r2, %r3;\n"
8592 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_512_false_false_n];\n"
8593 " setp.ge.u32 %p1, %r4, %r5;\n"
8594 " @%p1 bra $Lt_81_17922;\n"
8595 " mul.lo.u32 %r6, %r4, 4;\n"
8596 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_512_false_false_n];\n"
8597 " mul.lo.u32 %r7, %r5, 4;\n"
8598 " mov.u16 %rh1, %nctaid.x;\n"
8599 " mul.wide.u16 %r8, %rh1, 4096;\n"
8600 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_512_false_false_g_idata];\n"
8601 "$Lt_81_14850:\n"
8602 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
8603 " .loc 3 677 0\n"
8604 " cvt.u8.u32 %r9, %r6;\n"
8605 " cvt.u64.u32 %rd2, %r9;\n"
8606 " .loc 3 637 0\n"
8607 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_512_false_false_g_idata];\n"
8608 " .loc 3 677 0\n"
8609 " add.u64 %rd3, %rd2, %rd1;\n"
8610 " ld.global.u8 %rh2, [%rd3+0];\n"
8611 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
8612 " .loc 3 678 0\n"
8613 " ld.global.u8 %rh3, [%rd3+1];\n"
8614 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
8615 " .loc 3 679 0\n"
8616 " ld.global.u8 %rh4, [%rd3+2];\n"
8617 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
8618 " .loc 3 680 0\n"
8619 " ld.global.u8 %rh5, [%rd3+3];\n"
8620 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
8621 " add.u32 %r6, %r6, %r8;\n"
8622 " setp.lt.u32 %p2, %r6, %r7;\n"
8623 " @%p2 bra $Lt_81_14850;\n"
8624 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
8625 " bra.uni $Lt_81_14338;\n"
8626 "$Lt_81_17922:\n"
8627 " mov.f32 %f1, 0f00000000; // 0\n"
8628 "$Lt_81_14338:\n"
8629 " .loc 3 692 0\n"
8630 " mov.u64 %rd4, __smem;\n"
8631 " cvt.u64.u32 %rd5, %r3;\n"
8632 " mul.wide.u32 %rd6, %r3, 4;\n"
8633 " add.u64 %rd7, %rd4, %rd6;\n"
8634 " st.shared.f32 [%rd7+0], %f1;\n"
8635 " .loc 3 693 0\n"
8636 " bar.sync 0;\n"
8637 " mov.u32 %r10, 255;\n"
8638 " setp.gt.u32 %p3, %r3, %r10;\n"
8639 " @%p3 bra $Lt_81_15362;\n"
8640 " .loc 3 697 0\n"
8641 " ld.shared.f32 %f2, [%rd7+1024];\n"
8642 " add.f32 %f1, %f2, %f1;\n"
8643 " st.shared.f32 [%rd7+0], %f1;\n"
8644 "$Lt_81_15362:\n"
8645 " bar.sync 0;\n"
8646 " mov.u32 %r11, 127;\n"
8647 " setp.gt.u32 %p4, %r3, %r11;\n"
8648 " @%p4 bra $Lt_81_15874;\n"
8649 " .loc 3 698 0\n"
8650 " ld.shared.f32 %f3, [%rd7+512];\n"
8651 " add.f32 %f1, %f3, %f1;\n"
8652 " st.shared.f32 [%rd7+0], %f1;\n"
8653 "$Lt_81_15874:\n"
8654 " bar.sync 0;\n"
8655 " mov.u32 %r12, 63;\n"
8656 " setp.gt.u32 %p5, %r3, %r12;\n"
8657 " @%p5 bra $Lt_81_16386;\n"
8658 " .loc 3 699 0\n"
8659 " ld.shared.f32 %f4, [%rd7+256];\n"
8660 " add.f32 %f1, %f4, %f1;\n"
8661 " st.shared.f32 [%rd7+0], %f1;\n"
8662 "$Lt_81_16386:\n"
8663 " bar.sync 0;\n"
8664 " mov.u32 %r13, 31;\n"
8665 " setp.gt.u32 %p6, %r3, %r13;\n"
8666 " @%p6 bra $Lt_81_16898;\n"
8667 " .loc 3 709 0\n"
8668 " ld.volatile.shared.f32 %f5, [%rd7+128];\n"
8669 " add.f32 %f6, %f5, %f1;\n"
8670 " st.volatile.shared.f32 [%rd7+0], %f6;\n"
8671 " .loc 3 710 0\n"
8672 " ld.volatile.shared.f32 %f7, [%rd7+64];\n"
8673 " add.f32 %f8, %f7, %f6;\n"
8674 " st.volatile.shared.f32 [%rd7+0], %f8;\n"
8675 " .loc 3 711 0\n"
8676 " ld.volatile.shared.f32 %f9, [%rd7+32];\n"
8677 " add.f32 %f10, %f9, %f8;\n"
8678 " st.volatile.shared.f32 [%rd7+0], %f10;\n"
8679 " .loc 3 712 0\n"
8680 " ld.volatile.shared.f32 %f11, [%rd7+16];\n"
8681 " add.f32 %f12, %f11, %f10;\n"
8682 " st.volatile.shared.f32 [%rd7+0], %f12;\n"
8683 " .loc 3 713 0\n"
8684 " ld.volatile.shared.f32 %f13, [%rd7+8];\n"
8685 " add.f32 %f14, %f13, %f12;\n"
8686 " st.volatile.shared.f32 [%rd7+0], %f14;\n"
8687 " .loc 3 714 0\n"
8688 " ld.volatile.shared.f32 %f15, [%rd7+4];\n"
8689 " add.f32 %f1, %f15, %f14;\n"
8690 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
8691 "$Lt_81_16898:\n"
8692 " mov.u32 %r14, 0;\n"
8693 " setp.ne.u32 %p7, %r3, %r14;\n"
8694 " @%p7 bra $Lt_81_17410;\n"
8695 " .loc 3 719 0\n"
8696 " ld.shared.f32 %f16, [__smem+0];\n"
8697 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_512_false_false_g_odata];\n"
8698 " cvt.u64.u32 %rd9, %r1;\n"
8699 " mul.wide.u32 %rd10, %r1, 4;\n"
8700 " add.u64 %rd11, %rd8, %rd10;\n"
8701 " st.global.f32 [%rd11+0], %f16;\n"
8702 "$Lt_81_17410:\n"
8703 " .loc 3 804 0\n"
8704 " exit;\n"
8705 "$LDWend_packed_float_reduce_512_false_false:\n"
8706 " } // packed_float_reduce_512_false_false\n"
8707 "\n"
8708 " .entry packed_float_reduce_512_false_true (\n"
8709 " .param .u64 __cudaparm_packed_float_reduce_512_false_true_g_idata,\n"
8710 " .param .u64 __cudaparm_packed_float_reduce_512_false_true_g_odata,\n"
8711 " .param .u32 __cudaparm_packed_float_reduce_512_false_true_n)\n"
8712 " {\n"
8713 " .reg .u16 %rh<7>;\n"
8714 " .reg .u32 %r<16>;\n"
8715 " .reg .u64 %rd<13>;\n"
8716 " .reg .f32 %f<18>;\n"
8717 " .reg .pred %p<9>;\n"
8718 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
8719 " .loc 3 805 0\n"
8720 "$LDWbegin_packed_float_reduce_512_false_true:\n"
8721 " .loc 3 637 0\n"
8722 " cvt.u32.u16 %r1, %ctaid.x;\n"
8723 " mul.lo.u32 %r2, %r1, 1024;\n"
8724 " cvt.u32.u16 %r3, %tid.x;\n"
8725 " add.u32 %r4, %r2, %r3;\n"
8726 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_512_false_true_n];\n"
8727 " setp.ge.u32 %p1, %r4, %r5;\n"
8728 " @%p1 bra $Lt_82_17922;\n"
8729 " mul.lo.u32 %r6, %r4, 4;\n"
8730 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_512_false_true_n];\n"
8731 " mul.lo.u32 %r7, %r5, 4;\n"
8732 " mov.u16 %rh1, %nctaid.x;\n"
8733 " mul.wide.u16 %r8, %rh1, 4096;\n"
8734 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_512_false_true_g_idata];\n"
8735 "$Lt_82_14850:\n"
8736 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
8737 " .loc 3 677 0\n"
8738 " cvt.u8.u32 %r9, %r6;\n"
8739 " cvt.u64.u32 %rd2, %r9;\n"
8740 " .loc 3 637 0\n"
8741 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_512_false_true_g_idata];\n"
8742 " .loc 3 677 0\n"
8743 " add.u64 %rd3, %rd2, %rd1;\n"
8744 " ld.global.u8 %rh2, [%rd3+0];\n"
8745 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
8746 " .loc 3 678 0\n"
8747 " ld.global.u8 %rh3, [%rd3+1];\n"
8748 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
8749 " .loc 3 679 0\n"
8750 " ld.global.u8 %rh4, [%rd3+2];\n"
8751 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
8752 " .loc 3 680 0\n"
8753 " ld.global.u8 %rh5, [%rd3+3];\n"
8754 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
8755 " add.u32 %r6, %r6, %r8;\n"
8756 " setp.lt.u32 %p2, %r6, %r7;\n"
8757 " @%p2 bra $Lt_82_14850;\n"
8758 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
8759 " bra.uni $Lt_82_14338;\n"
8760 "$Lt_82_17922:\n"
8761 " mov.f32 %f1, 0f00000000; // 0\n"
8762 "$Lt_82_14338:\n"
8763 " .loc 3 692 0\n"
8764 " mov.u64 %rd4, __smem;\n"
8765 " cvt.u64.u32 %rd5, %r3;\n"
8766 " mul.wide.u32 %rd6, %r3, 4;\n"
8767 " add.u64 %rd7, %rd4, %rd6;\n"
8768 " st.shared.f32 [%rd7+0], %f1;\n"
8769 " .loc 3 693 0\n"
8770 " bar.sync 0;\n"
8771 " mov.u32 %r10, 255;\n"
8772 " setp.gt.u32 %p3, %r3, %r10;\n"
8773 " @%p3 bra $Lt_82_15362;\n"
8774 " .loc 3 697 0\n"
8775 " ld.shared.f32 %f2, [%rd7+1024];\n"
8776 " add.f32 %f1, %f2, %f1;\n"
8777 " st.shared.f32 [%rd7+0], %f1;\n"
8778 "$Lt_82_15362:\n"
8779 " bar.sync 0;\n"
8780 " mov.u32 %r11, 127;\n"
8781 " setp.gt.u32 %p4, %r3, %r11;\n"
8782 " @%p4 bra $Lt_82_15874;\n"
8783 " .loc 3 698 0\n"
8784 " ld.shared.f32 %f3, [%rd7+512];\n"
8785 " add.f32 %f1, %f3, %f1;\n"
8786 " st.shared.f32 [%rd7+0], %f1;\n"
8787 "$Lt_82_15874:\n"
8788 " bar.sync 0;\n"
8789 " mov.u32 %r12, 63;\n"
8790 " setp.gt.u32 %p5, %r3, %r12;\n"
8791 " @%p5 bra $Lt_82_16386;\n"
8792 " .loc 3 699 0\n"
8793 " ld.shared.f32 %f4, [%rd7+256];\n"
8794 " add.f32 %f1, %f4, %f1;\n"
8795 " st.shared.f32 [%rd7+0], %f1;\n"
8796 "$Lt_82_16386:\n"
8797 " bar.sync 0;\n"
8798 " mov.u32 %r13, 31;\n"
8799 " setp.gt.u32 %p6, %r3, %r13;\n"
8800 " @%p6 bra $Lt_82_16898;\n"
8801 " .loc 3 709 0\n"
8802 " ld.volatile.shared.f32 %f5, [%rd7+128];\n"
8803 " add.f32 %f6, %f5, %f1;\n"
8804 " st.volatile.shared.f32 [%rd7+0], %f6;\n"
8805 " .loc 3 710 0\n"
8806 " ld.volatile.shared.f32 %f7, [%rd7+64];\n"
8807 " add.f32 %f8, %f7, %f6;\n"
8808 " st.volatile.shared.f32 [%rd7+0], %f8;\n"
8809 " .loc 3 711 0\n"
8810 " ld.volatile.shared.f32 %f9, [%rd7+32];\n"
8811 " add.f32 %f10, %f9, %f8;\n"
8812 " st.volatile.shared.f32 [%rd7+0], %f10;\n"
8813 " .loc 3 712 0\n"
8814 " ld.volatile.shared.f32 %f11, [%rd7+16];\n"
8815 " add.f32 %f12, %f11, %f10;\n"
8816 " st.volatile.shared.f32 [%rd7+0], %f12;\n"
8817 " .loc 3 713 0\n"
8818 " ld.volatile.shared.f32 %f13, [%rd7+8];\n"
8819 " add.f32 %f14, %f13, %f12;\n"
8820 " st.volatile.shared.f32 [%rd7+0], %f14;\n"
8821 " .loc 3 714 0\n"
8822 " ld.volatile.shared.f32 %f15, [%rd7+4];\n"
8823 " add.f32 %f1, %f15, %f14;\n"
8824 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
8825 "$Lt_82_16898:\n"
8826 " mov.u32 %r14, 0;\n"
8827 " setp.ne.u32 %p7, %r3, %r14;\n"
8828 " @%p7 bra $Lt_82_17410;\n"
8829 " .loc 3 719 0\n"
8830 " ld.shared.f32 %f16, [__smem+0];\n"
8831 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_512_false_true_g_odata];\n"
8832 " cvt.u64.u32 %rd9, %r1;\n"
8833 " mul.wide.u32 %rd10, %r1, 4;\n"
8834 " add.u64 %rd11, %rd8, %rd10;\n"
8835 " st.global.f32 [%rd11+0], %f16;\n"
8836 "$Lt_82_17410:\n"
8837 " .loc 3 806 0\n"
8838 " exit;\n"
8839 "$LDWend_packed_float_reduce_512_false_true:\n"
8840 " } // packed_float_reduce_512_false_true\n"
8841 "\n"
8842 " .entry packed_float_reduce_512_true_false (\n"
8843 " .param .u64 __cudaparm_packed_float_reduce_512_true_false_g_idata,\n"
8844 " .param .u64 __cudaparm_packed_float_reduce_512_true_false_g_odata,\n"
8845 " .param .u32 __cudaparm_packed_float_reduce_512_true_false_n)\n"
8846 " {\n"
8847 " .reg .u16 %rh<7>;\n"
8848 " .reg .u32 %r<16>;\n"
8849 " .reg .u64 %rd<13>;\n"
8850 " .reg .f32 %f<18>;\n"
8851 " .reg .pred %p<9>;\n"
8852 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
8853 " .loc 3 807 0\n"
8854 "$LDWbegin_packed_float_reduce_512_true_false:\n"
8855 " .loc 3 637 0\n"
8856 " cvt.u32.u16 %r1, %ctaid.x;\n"
8857 " mul.lo.u32 %r2, %r1, 1024;\n"
8858 " cvt.u32.u16 %r3, %tid.x;\n"
8859 " add.u32 %r4, %r2, %r3;\n"
8860 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_512_true_false_n];\n"
8861 " setp.ge.u32 %p1, %r4, %r5;\n"
8862 " @%p1 bra $Lt_83_17922;\n"
8863 " mul.lo.u32 %r6, %r4, 4;\n"
8864 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_512_true_false_n];\n"
8865 " mul.lo.u32 %r7, %r5, 4;\n"
8866 " mov.u16 %rh1, %nctaid.x;\n"
8867 " mul.wide.u16 %r8, %rh1, 4096;\n"
8868 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_512_true_false_g_idata];\n"
8869 "$Lt_83_14850:\n"
8870 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
8871 " .loc 3 677 0\n"
8872 " cvt.u8.u32 %r9, %r6;\n"
8873 " cvt.u64.u32 %rd2, %r9;\n"
8874 " .loc 3 637 0\n"
8875 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_512_true_false_g_idata];\n"
8876 " .loc 3 677 0\n"
8877 " add.u64 %rd3, %rd2, %rd1;\n"
8878 " ld.global.u8 %rh2, [%rd3+0];\n"
8879 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
8880 " .loc 3 678 0\n"
8881 " ld.global.u8 %rh3, [%rd3+1];\n"
8882 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
8883 " .loc 3 679 0\n"
8884 " ld.global.u8 %rh4, [%rd3+2];\n"
8885 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
8886 " .loc 3 680 0\n"
8887 " ld.global.u8 %rh5, [%rd3+3];\n"
8888 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
8889 " add.u32 %r6, %r6, %r8;\n"
8890 " setp.lt.u32 %p2, %r6, %r7;\n"
8891 " @%p2 bra $Lt_83_14850;\n"
8892 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
8893 " bra.uni $Lt_83_14338;\n"
8894 "$Lt_83_17922:\n"
8895 " mov.f32 %f1, 0f00000000; // 0\n"
8896 "$Lt_83_14338:\n"
8897 " .loc 3 692 0\n"
8898 " mov.u64 %rd4, __smem;\n"
8899 " cvt.u64.u32 %rd5, %r3;\n"
8900 " mul.wide.u32 %rd6, %r3, 4;\n"
8901 " add.u64 %rd7, %rd4, %rd6;\n"
8902 " st.shared.f32 [%rd7+0], %f1;\n"
8903 " .loc 3 693 0\n"
8904 " bar.sync 0;\n"
8905 " mov.u32 %r10, 255;\n"
8906 " setp.gt.u32 %p3, %r3, %r10;\n"
8907 " @%p3 bra $Lt_83_15362;\n"
8908 " .loc 3 697 0\n"
8909 " ld.shared.f32 %f2, [%rd7+1024];\n"
8910 " add.f32 %f1, %f2, %f1;\n"
8911 " st.shared.f32 [%rd7+0], %f1;\n"
8912 "$Lt_83_15362:\n"
8913 " bar.sync 0;\n"
8914 " mov.u32 %r11, 127;\n"
8915 " setp.gt.u32 %p4, %r3, %r11;\n"
8916 " @%p4 bra $Lt_83_15874;\n"
8917 " .loc 3 698 0\n"
8918 " ld.shared.f32 %f3, [%rd7+512];\n"
8919 " add.f32 %f1, %f3, %f1;\n"
8920 " st.shared.f32 [%rd7+0], %f1;\n"
8921 "$Lt_83_15874:\n"
8922 " bar.sync 0;\n"
8923 " mov.u32 %r12, 63;\n"
8924 " setp.gt.u32 %p5, %r3, %r12;\n"
8925 " @%p5 bra $Lt_83_16386;\n"
8926 " .loc 3 699 0\n"
8927 " ld.shared.f32 %f4, [%rd7+256];\n"
8928 " add.f32 %f1, %f4, %f1;\n"
8929 " st.shared.f32 [%rd7+0], %f1;\n"
8930 "$Lt_83_16386:\n"
8931 " bar.sync 0;\n"
8932 " mov.u32 %r13, 31;\n"
8933 " setp.gt.u32 %p6, %r3, %r13;\n"
8934 " @%p6 bra $Lt_83_16898;\n"
8935 " .loc 3 709 0\n"
8936 " ld.volatile.shared.f32 %f5, [%rd7+128];\n"
8937 " add.f32 %f6, %f5, %f1;\n"
8938 " st.volatile.shared.f32 [%rd7+0], %f6;\n"
8939 " .loc 3 710 0\n"
8940 " ld.volatile.shared.f32 %f7, [%rd7+64];\n"
8941 " add.f32 %f8, %f7, %f6;\n"
8942 " st.volatile.shared.f32 [%rd7+0], %f8;\n"
8943 " .loc 3 711 0\n"
8944 " ld.volatile.shared.f32 %f9, [%rd7+32];\n"
8945 " add.f32 %f10, %f9, %f8;\n"
8946 " st.volatile.shared.f32 [%rd7+0], %f10;\n"
8947 " .loc 3 712 0\n"
8948 " ld.volatile.shared.f32 %f11, [%rd7+16];\n"
8949 " add.f32 %f12, %f11, %f10;\n"
8950 " st.volatile.shared.f32 [%rd7+0], %f12;\n"
8951 " .loc 3 713 0\n"
8952 " ld.volatile.shared.f32 %f13, [%rd7+8];\n"
8953 " add.f32 %f14, %f13, %f12;\n"
8954 " st.volatile.shared.f32 [%rd7+0], %f14;\n"
8955 " .loc 3 714 0\n"
8956 " ld.volatile.shared.f32 %f15, [%rd7+4];\n"
8957 " add.f32 %f1, %f15, %f14;\n"
8958 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
8959 "$Lt_83_16898:\n"
8960 " mov.u32 %r14, 0;\n"
8961 " setp.ne.u32 %p7, %r3, %r14;\n"
8962 " @%p7 bra $Lt_83_17410;\n"
8963 " .loc 3 719 0\n"
8964 " ld.shared.f32 %f16, [__smem+0];\n"
8965 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_512_true_false_g_odata];\n"
8966 " cvt.u64.u32 %rd9, %r1;\n"
8967 " mul.wide.u32 %rd10, %r1, 4;\n"
8968 " add.u64 %rd11, %rd8, %rd10;\n"
8969 " st.global.f32 [%rd11+0], %f16;\n"
8970 "$Lt_83_17410:\n"
8971 " .loc 3 808 0\n"
8972 " exit;\n"
8973 "$LDWend_packed_float_reduce_512_true_false:\n"
8974 " } // packed_float_reduce_512_true_false\n"
8975 "\n"
8976 " .entry packed_float_reduce_512_true_true (\n"
8977 " .param .u64 __cudaparm_packed_float_reduce_512_true_true_g_idata,\n"
8978 " .param .u64 __cudaparm_packed_float_reduce_512_true_true_g_odata,\n"
8979 " .param .u32 __cudaparm_packed_float_reduce_512_true_true_n)\n"
8980 " {\n"
8981 " .reg .u16 %rh<7>;\n"
8982 " .reg .u32 %r<16>;\n"
8983 " .reg .u64 %rd<13>;\n"
8984 " .reg .f32 %f<18>;\n"
8985 " .reg .pred %p<9>;\n"
8986 " .local .align 4 .b8 __cuda_local_var_22761_19_pack_0[4];\n"
8987 " .loc 3 809 0\n"
8988 "$LDWbegin_packed_float_reduce_512_true_true:\n"
8989 " .loc 3 637 0\n"
8990 " cvt.u32.u16 %r1, %ctaid.x;\n"
8991 " mul.lo.u32 %r2, %r1, 1024;\n"
8992 " cvt.u32.u16 %r3, %tid.x;\n"
8993 " add.u32 %r4, %r2, %r3;\n"
8994 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_512_true_true_n];\n"
8995 " setp.ge.u32 %p1, %r4, %r5;\n"
8996 " @%p1 bra $Lt_84_17922;\n"
8997 " mul.lo.u32 %r6, %r4, 4;\n"
8998 " ld.param.u32 %r5, [__cudaparm_packed_float_reduce_512_true_true_n];\n"
8999 " mul.lo.u32 %r7, %r5, 4;\n"
9000 " mov.u16 %rh1, %nctaid.x;\n"
9001 " mul.wide.u16 %r8, %rh1, 4096;\n"
9002 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_512_true_true_g_idata];\n"
9003 "$Lt_84_14850:\n"
9004 " //<loop> Loop body line 637, nesting depth: 1, estimated iterations: unknown\n"
9005 " .loc 3 677 0\n"
9006 " cvt.u8.u32 %r9, %r6;\n"
9007 " cvt.u64.u32 %rd2, %r9;\n"
9008 " .loc 3 637 0\n"
9009 " ld.param.u64 %rd1, [__cudaparm_packed_float_reduce_512_true_true_g_idata];\n"
9010 " .loc 3 677 0\n"
9011 " add.u64 %rd3, %rd2, %rd1;\n"
9012 " ld.global.u8 %rh2, [%rd3+0];\n"
9013 " st.local.u8 [__cuda_local_var_22761_19_pack_0+3], %rh2;\n"
9014 " .loc 3 678 0\n"
9015 " ld.global.u8 %rh3, [%rd3+1];\n"
9016 " st.local.u8 [__cuda_local_var_22761_19_pack_0+2], %rh3;\n"
9017 " .loc 3 679 0\n"
9018 " ld.global.u8 %rh4, [%rd3+2];\n"
9019 " st.local.u8 [__cuda_local_var_22761_19_pack_0+1], %rh4;\n"
9020 " .loc 3 680 0\n"
9021 " ld.global.u8 %rh5, [%rd3+3];\n"
9022 " st.local.u8 [__cuda_local_var_22761_19_pack_0+0], %rh5;\n"
9023 " add.u32 %r6, %r6, %r8;\n"
9024 " setp.lt.u32 %p2, %r6, %r7;\n"
9025 " @%p2 bra $Lt_84_14850;\n"
9026 " ld.local.f32 %f1, [__cuda_local_var_22761_19_pack_0+0];\n"
9027 " bra.uni $Lt_84_14338;\n"
9028 "$Lt_84_17922:\n"
9029 " mov.f32 %f1, 0f00000000; // 0\n"
9030 "$Lt_84_14338:\n"
9031 " .loc 3 692 0\n"
9032 " mov.u64 %rd4, __smem;\n"
9033 " cvt.u64.u32 %rd5, %r3;\n"
9034 " mul.wide.u32 %rd6, %r3, 4;\n"
9035 " add.u64 %rd7, %rd4, %rd6;\n"
9036 " st.shared.f32 [%rd7+0], %f1;\n"
9037 " .loc 3 693 0\n"
9038 " bar.sync 0;\n"
9039 " mov.u32 %r10, 255;\n"
9040 " setp.gt.u32 %p3, %r3, %r10;\n"
9041 " @%p3 bra $Lt_84_15362;\n"
9042 " .loc 3 697 0\n"
9043 " ld.shared.f32 %f2, [%rd7+1024];\n"
9044 " add.f32 %f1, %f2, %f1;\n"
9045 " st.shared.f32 [%rd7+0], %f1;\n"
9046 "$Lt_84_15362:\n"
9047 " bar.sync 0;\n"
9048 " mov.u32 %r11, 127;\n"
9049 " setp.gt.u32 %p4, %r3, %r11;\n"
9050 " @%p4 bra $Lt_84_15874;\n"
9051 " .loc 3 698 0\n"
9052 " ld.shared.f32 %f3, [%rd7+512];\n"
9053 " add.f32 %f1, %f3, %f1;\n"
9054 " st.shared.f32 [%rd7+0], %f1;\n"
9055 "$Lt_84_15874:\n"
9056 " bar.sync 0;\n"
9057 " mov.u32 %r12, 63;\n"
9058 " setp.gt.u32 %p5, %r3, %r12;\n"
9059 " @%p5 bra $Lt_84_16386;\n"
9060 " .loc 3 699 0\n"
9061 " ld.shared.f32 %f4, [%rd7+256];\n"
9062 " add.f32 %f1, %f4, %f1;\n"
9063 " st.shared.f32 [%rd7+0], %f1;\n"
9064 "$Lt_84_16386:\n"
9065 " bar.sync 0;\n"
9066 " mov.u32 %r13, 31;\n"
9067 " setp.gt.u32 %p6, %r3, %r13;\n"
9068 " @%p6 bra $Lt_84_16898;\n"
9069 " .loc 3 709 0\n"
9070 " ld.volatile.shared.f32 %f5, [%rd7+128];\n"
9071 " add.f32 %f6, %f5, %f1;\n"
9072 " st.volatile.shared.f32 [%rd7+0], %f6;\n"
9073 " .loc 3 710 0\n"
9074 " ld.volatile.shared.f32 %f7, [%rd7+64];\n"
9075 " add.f32 %f8, %f7, %f6;\n"
9076 " st.volatile.shared.f32 [%rd7+0], %f8;\n"
9077 " .loc 3 711 0\n"
9078 " ld.volatile.shared.f32 %f9, [%rd7+32];\n"
9079 " add.f32 %f10, %f9, %f8;\n"
9080 " st.volatile.shared.f32 [%rd7+0], %f10;\n"
9081 " .loc 3 712 0\n"
9082 " ld.volatile.shared.f32 %f11, [%rd7+16];\n"
9083 " add.f32 %f12, %f11, %f10;\n"
9084 " st.volatile.shared.f32 [%rd7+0], %f12;\n"
9085 " .loc 3 713 0\n"
9086 " ld.volatile.shared.f32 %f13, [%rd7+8];\n"
9087 " add.f32 %f14, %f13, %f12;\n"
9088 " st.volatile.shared.f32 [%rd7+0], %f14;\n"
9089 " .loc 3 714 0\n"
9090 " ld.volatile.shared.f32 %f15, [%rd7+4];\n"
9091 " add.f32 %f1, %f15, %f14;\n"
9092 " st.volatile.shared.f32 [%rd7+0], %f1;\n"
9093 "$Lt_84_16898:\n"
9094 " mov.u32 %r14, 0;\n"
9095 " setp.ne.u32 %p7, %r3, %r14;\n"
9096 " @%p7 bra $Lt_84_17410;\n"
9097 " .loc 3 719 0\n"
9098 " ld.shared.f32 %f16, [__smem+0];\n"
9099 " ld.param.u64 %rd8, [__cudaparm_packed_float_reduce_512_true_true_g_odata];\n"
9100 " cvt.u64.u32 %rd9, %r1;\n"
9101 " mul.wide.u32 %rd10, %r1, 4;\n"
9102 " add.u64 %rd11, %rd8, %rd10;\n"
9103 " st.global.f32 [%rd11+0], %f16;\n"
9104 "$Lt_84_17410:\n"
9105 " .loc 3 810 0\n"
9106 " exit;\n"
9107 "$LDWend_packed_float_reduce_512_true_true:\n"
9108 " } // packed_float_reduce_512_true_true\n"
9109 "\n"
9110 ;