.version 3.1
.target sm_35
.address_size 64
// Cray Fortran : Version 8.2.2 (u82103f82232i82242p82428a82022z82428)
.file 1 "vecAdd_reduction.f90"

//-----------------------------------------------------------------------------
// OpenACC kernel generated from vecAdd_reduction.f90 lines 29-32.
//
// Per thread: computes one element  c(i) = a(i) + b(i)  for global index
// i = ctaid.x*ntid.x + tid.x, guarded by i < 100000, and keeps the computed
// value as this thread's contribution to a sum reduction (0.0 if out of range).
//
// Reduction protocol (two stages):
//   1. Each block folds its (up to) 128 per-thread values in the shared array
//      $$_redfold_t38 via a tree: threads < 64 add element [tid+64], then
//      threads < 32 do an unrolled fold of strides 32,16,8,4,2,1 on volatile
//      shared memory. Thread 0 stores the block partial to reduc_val[ctaid.x].
//   2. After membar.gl, thread 0 of every block atomically bumps the global
//      counter reduc_ctl: block 0 adds (1 - nctaid.x), other blocks add 1.
//      The counter reaches 0 exactly when all blocks have deposited their
//      partial; the one block that observes 0 re-loads all partials from
//      reduc_val (grid of nctaid.x values, strided by ntid.x per thread),
//      repeats the shared-memory tree fold, and its thread 0 accumulates the
//      grand total into *accshare (read-modify-write, no atomic needed since
//      exactly one thread survives to this point).
//
// NOTE(review): floating-point sums use add.rn.f64; result ordering differs
// from a serial Fortran sum — presumably acceptable per OpenACC reduction
// semantics. Assumes blockDim.x <= 128 (shared fold array size) — the launch
// configuration is outside this file; confirm against the host code.
//-----------------------------------------------------------------------------
.entry main_$ck_L29_2(
	.param .s64 $$arg_ptr_acc_b_t6_t55,     // device pointer: array b
	.param .s64 $$arg_ptr_acc_a_t8_t56,     // device pointer: array a
	.param .s64 $$arg_ptr_acc_c_t10_t57,    // device pointer: array c (output)
	.param .s64 $$arg_ptr_reduc_ctl_t15_t58,// global reduction control counter
	.param .s64 $$arg_ptr_reduc_val_t18_t59,// per-block partial sums (one f64 per block)
	.param .s64 $$arg_ptr_accshare_t21_t60  // final accumulator for the reduced sum
)
{
	// Per-block scratch: copy of the control-counter result, and the
	// 128-slot f64 fold buffer used by both tree reductions below.
	.shared .s32 reduc_share_ctl_c2__f1s125;
	.shared .u64 $$_redfold_t38__f1s221[128];

	.reg .pred %p<13>;
	.reg .b64 %bd<10>;
	.reg .s32 %s<8>;
	.reg .s64 %sd<34>;
	.reg .u32 %u<20>;
	.reg .f64 %fd<29>;

	//--- global thread index and reduction seed ------------------------------
	.loc 1 29 0
	.loc 1 29 0
	mov.u32 %u7, %ntid.x;
	mov.u32 %u8, %ctaid.x;
	mov.u32 %u9, %tid.x;
	mad.lo.u32 %u10, %u7, %u8, %u9;         // flat id = ctaid*ntid + tid
	cvt.s64.u32 %sd1, %u10;                 // $$id_t14
	mov.f64 %fd1, 0d0000000000000000;       // $$_sum_t12 = 0.0 (identity for +)
	setp.ge.s64 %p1, %sd1, 100000;          // loop trip count N = 100000
	@%p1 bra $main_$ck_L29_2__l8__;         // out-of-range threads skip the body

	//--- c(i) = a(i) + b(i); keep the value for the sum reduction ------------
	.loc 1 30 0
	shl.b64 %bd1, %sd1, 3;                  // byte offset = i * sizeof(f64)
	ld.param.s64 %sd2, [$$arg_ptr_acc_b_t6_t55];
	add.s64 %sd3, %sd2, %bd1;               // &b(i)
	ld.param.s64 %sd4, [$$arg_ptr_acc_a_t8_t56];
	add.s64 %sd5, %sd4, %bd1;               // &a(i)
	ld.global.nc.f64 %fd7, [%sd5];          // a(i) (non-coherent: read-only data)
	ld.global.nc.f64 %fd8, [%sd3];          // b(i)
	add.rn.f64 %fd9, %fd7, %fd8;
	ld.param.s64 %sd6, [$$arg_ptr_acc_c_t10_t57];
	add.s64 %sd7, %sd6, %bd1;               // &c(i)
	st.global.f64 [%sd7], %fd9;
	.loc 1 31 0
	mov.f64 %fd1, %fd9;                     // contribution to the sum

//--- stage 1: intra-block tree reduction into shared memory ------------------
$main_$ck_L29_2__l8__:
	.loc 1 32 0
	mov.u32 %u11, %tid.x;
	mov.u32 %u1, %tid.x;                    // $$lcs_9_t49
	cvt.s64.u32 %sd8, %tid.x;
	shl.b64 %bd2, %sd8, 3;
	mov.s64 %sd9, $$_redfold_t38__f1s221;
	add.s64 %sd10, %bd2, %sd9;              // &redfold[tid]
	st.volatile.shared.f64 [%sd10], %fd1;   // publish this thread's value
	bar.sync 0;                             // all values visible before folding
	ld.volatile.shared.f64 %fd2, [%sd10];   // $redfold_left_temp_t39
	setp.ge.u32 %p2, %u11, 64;
	@%p2 bra $main_$ck_L29_2__l11__;
	// threads 0..63: redfold[tid] += redfold[tid + 64] (+512 bytes)
	.loc 1 32 0
	cvt.s64.u32 %sd11, %u1;
	shl.b64 %bd3, %sd11, 3;
	mov.s64 %sd12, $$_redfold_t38__f1s221;
	add.s64 %sd13, %bd3, %sd12;
	ld.volatile.shared.f64 %fd10, [%sd13 + 512];
	add.rn.f64 %fd2, %fd2, %fd10;
	st.volatile.shared.f64 [%sd13], %fd2;
$main_$ck_L29_2__l11__:
	.loc 1 32 0
	bar.sync 0;
	mov.u32 %u12, %tid.x;
	mov.u32 %u2, %tid.x;                    // $$lcs_10_t50
	setp.ge.u32 %p3, %u12, 32;
	@%p3 bra $main_$ck_L29_2__l14__;
	// threads 0..31: unrolled fold, strides 32,16,8,4,2,1 elements
	// (byte offsets 256..8). Volatile shared accesses keep the loads/stores
	// ordered; no bar.sync between steps — relies on single-warp execution.
	.loc 1 32 0
	cvt.s64.u32 %sd14, %u2;
	shl.b64 %bd4, %sd14, 3;
	mov.s64 %sd15, $$_redfold_t38__f1s221;
	add.s64 %sd16, %bd4, %sd15;
	ld.volatile.shared.f64 %fd11, [%sd16 + 256];
	add.rn.f64 %fd2, %fd2, %fd11;
	st.volatile.shared.f64 [%sd16], %fd2;
	ld.volatile.shared.f64 %fd12, [%sd16 + 128];
	add.rn.f64 %fd2, %fd2, %fd12;
	st.volatile.shared.f64 [%sd16], %fd2;
	ld.volatile.shared.f64 %fd13, [%sd16 + 64];
	add.rn.f64 %fd2, %fd2, %fd13;
	st.volatile.shared.f64 [%sd16], %fd2;
	ld.volatile.shared.f64 %fd14, [%sd16 + 32];
	add.rn.f64 %fd2, %fd2, %fd14;
	st.volatile.shared.f64 [%sd16], %fd2;
	ld.volatile.shared.f64 %fd15, [%sd16 + 16];
	add.rn.f64 %fd2, %fd2, %fd15;
	st.volatile.shared.f64 [%sd16], %fd2;
	ld.volatile.shared.f64 %fd16, [%sd16 + 8];
	add.rn.f64 %fd17, %fd2, %fd16;
	st.volatile.shared.f64 [%sd16], %fd17;
	mov.f64 %fd2, %fd17;                    // $redfold_left_temp_t39
$main_$ck_L29_2__l14__:
	.loc 1 32 0
	bar.sync 0;
	// Block partial sum now lives in redfold[0]; every thread reads it.
	ld.volatile.shared.f64 %fd4, [$$_redfold_t38__f1s221];
	mov.f64 %fd3, %fd4;                     // $redfinal_t41
	bar.sync 0;
	mov.u32 %u13, %tid.x;
	setp.ne.u32 %p4, %u13, 0;
	@%p4 bra $main_$ck_L29_2__l17__;
	// thread 0 only: reduc_val[ctaid.x] = block partial
	.loc 1 32 0
	cvt.s64.u32 %sd17, %ctaid.x;
	shl.b64 %bd5, %sd17, 3;
	ld.param.s64 %sd18, [$$arg_ptr_reduc_val_t18_t59];
	add.s64 %sd19, %sd18, %bd5;
	st.global.f64 [%sd19], %fd3;
$main_$ck_L29_2__l17__:
	//--- elect the last-arriving block via the global control counter --------
	.loc 1 32 0
	membar.gl;                              // partial store visible grid-wide first
	mov.u32 %u14, %tid.x;
	setp.ne.u32 %p5, %u14, 0;
	@%p5 bra $main_$ck_L29_2__l20__;
	// thread 0: block 0 adds (1 - nctaid), other blocks add 1; the counter
	// sums to 0 exactly once all blocks (incl. block 0) have contributed.
	.loc 1 32 0
	cvt.s32.u32 %s2, %nctaid.x;
	sub.s32 %s3, 1, %s2;                    // 1 - gridDim.x
	mov.u32 %u15, %ctaid.x;
	setp.eq.u32 %p6, %u15, 0;
	selp.s32 %s4, %s3, 1, %p6;              // increment for this block
	ld.param.s64 %sd20, [$$arg_ptr_reduc_ctl_t15_t58];
	atom.global.add.s32 %s5, [%sd20], %s4;  // returns the OLD counter value
	add.s32 %s6, %s5, %s4;                  // new counter value
	st.volatile.shared.s32 [reduc_share_ctl_c2__f1s125], %s6; // reduc_share_ctl_c2
$main_$ck_L29_2__l20__:
	.loc 1 32 0
	bar.sync 0;
	ld.volatile.shared.s32 %s7, [reduc_share_ctl_c2__f1s125];
	setp.ne.s32 %p7, %s7, 0;
	@%p7 bra $main_$ck_L29_2__l102__;       // not the last block: done
	//--- stage 2 (last block only): fold all per-block partials --------------
	// Each thread accumulates reduc_val[tid, tid+ntid, ...] up to nctaid.
	.loc 1 32 0
	mov.f64 %fd4, 0d0000000000000000;       // $$reduc_pvt_t17
	cvt.s32.u32 %s1, %tid.x;                // $$induc_p13_t35
	mov.u32 %u16, %nctaid.x;
	mov.u32 %u4, %nctaid.x;                 // $$lis_b2_t37 (loop bound)
	setp.ge.u32 %p8, %s1, %u16;
	@%p8 bra $main_$ck_L29_2__l31__;        // fewer partials than threads
	.loc 1 32 0
	mov.u32 %u3, %ntid.x;                   // $$lis_b1_t36 (stride)
$main_$ck_L29_2__l27__:
	.loc 1 32 0
	cvt.s64.s32 %sd21, %s1;
	shl.b64 %bd6, %sd21, 3;
	ld.param.s64 %sd22, [$$arg_ptr_reduc_val_t18_t59];
	add.s64 %sd23, %sd22, %bd6;
	ld.global.f64 %fd18, [%sd23];           // reduc_val[p]
	add.rn.f64 %fd4, %fd18, %fd4;
	add.u32 %s1, %s1, %u3;                  // p += ntid.x
	setp.lt.u32 %p9, %s1, %u4;
	@%p9 bra $main_$ck_L29_2__l27__;
$main_$ck_L29_2__l31__:
	// Second shared-memory tree fold, identical in shape to stage 1.
	.loc 1 32 0
	mov.u32 %u17, %tid.x;
	mov.u32 %u5, %tid.x;                    // $$lcs_13_t53
	cvt.s64.u32 %sd24, %tid.x;
	shl.b64 %bd7, %sd24, 3;
	mov.s64 %sd25, $$_redfold_t38__f1s221;
	add.s64 %sd26, %bd7, %sd25;
	st.volatile.shared.f64 [%sd26], %fd4;
	bar.sync 0;
	ld.volatile.shared.f64 %fd5, [%sd26];   // $redfold_left_temp_t42
	setp.ge.u32 %p10, %u17, 64;
	@%p10 bra $main_$ck_L29_2__l34__;
	// threads 0..63: fold in element [tid + 64]
	.loc 1 32 0
	cvt.s64.u32 %sd27, %u5;
	shl.b64 %bd8, %sd27, 3;
	mov.s64 %sd28, $$_redfold_t38__f1s221;
	add.s64 %sd29, %bd8, %sd28;
	ld.volatile.shared.f64 %fd19, [%sd29 + 512];
	add.rn.f64 %fd5, %fd5, %fd19;
	st.volatile.shared.f64 [%sd29], %fd5;
$main_$ck_L29_2__l34__:
	.loc 1 32 0
	bar.sync 0;
	mov.u32 %u18, %tid.x;
	mov.u32 %u6, %tid.x;                    // $$lcs_14_t54
	setp.ge.u32 %p11, %u18, 32;
	@%p11 bra $main_$ck_L29_2__l37__;
	// threads 0..31: unrolled warp fold, strides 32,16,8,4,2,1 elements
	.loc 1 32 0
	cvt.s64.u32 %sd30, %u6;
	shl.b64 %bd9, %sd30, 3;
	mov.s64 %sd31, $$_redfold_t38__f1s221;
	add.s64 %sd32, %bd9, %sd31;
	ld.volatile.shared.f64 %fd20, [%sd32 + 256];
	add.rn.f64 %fd5, %fd5, %fd20;
	st.volatile.shared.f64 [%sd32], %fd5;
	ld.volatile.shared.f64 %fd21, [%sd32 + 128];
	add.rn.f64 %fd5, %fd5, %fd21;
	st.volatile.shared.f64 [%sd32], %fd5;
	ld.volatile.shared.f64 %fd22, [%sd32 + 64];
	add.rn.f64 %fd5, %fd5, %fd22;
	st.volatile.shared.f64 [%sd32], %fd5;
	ld.volatile.shared.f64 %fd23, [%sd32 + 32];
	add.rn.f64 %fd5, %fd5, %fd23;
	st.volatile.shared.f64 [%sd32], %fd5;
	ld.volatile.shared.f64 %fd24, [%sd32 + 16];
	add.rn.f64 %fd5, %fd5, %fd24;
	st.volatile.shared.f64 [%sd32], %fd5;
	ld.volatile.shared.f64 %fd25, [%sd32 + 8];
	add.rn.f64 %fd26, %fd5, %fd25;
	st.volatile.shared.f64 [%sd32], %fd26;
	mov.f64 %fd5, %fd26;                    // $redfold_left_temp_t42
$main_$ck_L29_2__l37__:
	.loc 1 32 0
	bar.sync 0;
	// Grand total in redfold[0]; thread 0 accumulates it into *accshare.
	ld.volatile.shared.f64 %fd4, [$$_redfold_t38__f1s221];
	mov.f64 %fd6, %fd4;                     // $redfinal_t44
	bar.sync 0;
	mov.u32 %u19, %tid.x;
	setp.ne.u32 %p12, %u19, 0;
	@%p12 bra $main_$ck_L29_2__l102__;
	.loc 1 32 0
	ld.param.s64 %sd33, [$$arg_ptr_accshare_t21_t60];
	ld.global.f64 %fd27, [%sd33];
	add.rn.f64 %fd28, %fd27, %fd6;          // *accshare += grand total
	st.global.f64 [%sd33], %fd28;
$main_$ck_L29_2__l102__:
	exit;
} // main_$ck_L29_2