//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19856038
// Cuda compilation tools, release 7.5, V7.5.17
// Based on LLVM 3.4svn
//

.version 3.0
.target sm_20
.address_size 64

	// .globl	_Z8distanceiiii
// elt_prod_conj$__cuda_local_var_44209_45_non_const_sfc has been demoted
// elt_prod_conj$__cuda_local_var_44210_45_non_const_sc1 has been demoted
// elt_prod_conj$__cuda_local_var_44211_45_non_const_sc2 has been demoted
// elt_prod_conj_v2$__cuda_local_var_44242_45_non_const_sfc has been demoted
// reduce_max_final$__cuda_local_var_44307_33_non_const_sdata has been demoted
// reduce_max_final$__cuda_local_var_44308_30_non_const_idxData has been demoted
// reduce_max_main$__cuda_local_var_44442_33_non_const_sdata has been demoted
// reduce_max_main$__cuda_local_var_44443_30_non_const_idxData has been demoted
// reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow has been demoted
// reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol has been demoted
// reduce_max_filter_final$__cuda_local_var_44585_30_non_const_smaxesVal has been demoted
// reduce_max_filter_final$__cuda_local_var_44586_33_non_const_sdata has been demoted
// reduce_max_filter_final$__cuda_local_var_44587_30_non_const_idxData has been demoted
// reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow has been demoted
// reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol has been demoted
// reduce_max_filter_main$__cuda_local_var_44797_30_non_const_smaxesVal has been demoted
// reduce_max_filter_main$__cuda_local_var_44798_33_non_const_sdata has been demoted
// reduce_max_filter_main$__cuda_local_var_44799_30_non_const_idxData has been demoted
// elt_prod_conjf$__cuda_local_var_45052_39_non_const_sfc has been demoted
// elt_prod_conjf$__cuda_local_var_45053_39_non_const_sc1 has been demoted
// elt_prod_conjf$__cuda_local_var_45054_39_non_const_sc2 has been demoted
// elt_prod_conj_v2f$__cuda_local_var_45085_39_non_const_sfc has been demoted
// reduce_max_finalf$__cuda_local_var_45150_32_non_const_sdata has been demoted
// reduce_max_finalf$__cuda_local_var_45151_30_non_const_idxData has been demoted
// reduce_max_mainf$__cuda_local_var_45285_32_non_const_sdata has been demoted
// reduce_max_mainf$__cuda_local_var_45286_30_non_const_idxData has been demoted
// reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow has been demoted
// reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol has been demoted
// reduce_max_filter_finalf$__cuda_local_var_45427_30_non_const_smaxesVal has been demoted
// reduce_max_filter_finalf$__cuda_local_var_45428_32_non_const_sdata has been demoted
// reduce_max_filter_finalf$__cuda_local_var_45429_30_non_const_idxData has been demoted
// reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow has been demoted
// reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol has been demoted
// reduce_max_filter_mainf$__cuda_local_var_45638_30_non_const_smaxesVal has been demoted
// reduce_max_filter_mainf$__cuda_local_var_45639_32_non_const_sdata has been demoted
// reduce_max_filter_mainf$__cuda_local_var_45640_30_non_const_idxData has been demoted

.visible .func  (.param .b64 func_retval0) _Z8distanceiiii(
	.param .b32 _Z8distanceiiii_param_0,
	.param .b32 _Z8distanceiiii_param_1,
	.param .b32 _Z8distanceiiii_param_2,
	.param .b32 _Z8distanceiiii_param_3
)
{
	// Returns (param_0 - param_1)^2 + (param_2 - param_3)^2 as an f64.
	// No square root is taken; the result is the squared sum.
	.reg .b32 	%a0, %a1, %b0, %b1, %da, %db;
	.reg .f64 	%fa, %fb, %fbsq, %total;

	// First pair: signed 32-bit difference, converted to double.
	ld.param.u32 	%a0, [_Z8distanceiiii_param_0];
	ld.param.u32 	%a1, [_Z8distanceiiii_param_1];
	sub.s32 	%da, %a0, %a1;
	cvt.rn.f64.s32	%fa, %da;

	// Second pair: same treatment.
	ld.param.u32 	%b0, [_Z8distanceiiii_param_2];
	ld.param.u32 	%b1, [_Z8distanceiiii_param_3];
	sub.s32 	%db, %b0, %b1;
	cvt.rn.f64.s32	%fb, %db;

	// The second square is formed first, then the first square is added
	// to it with a fused multiply-add.
	mul.f64 	%fbsq, %fb, %fb;
	fma.rn.f64 	%total, %fa, %fa, %fbsq;
	st.param.f64	[func_retval0+0], %total;
	ret;
}

	// .globl	_Z13checkDistancePiS_iii
.visible .func  (.param .b32 func_retval0) _Z13checkDistancePiS_iii(
	.param .b64 _Z13checkDistancePiS_iii_param_0,
	.param .b64 _Z13checkDistancePiS_iii_param_1,
	.param .b32 _Z13checkDistancePiS_iii_param_2,
	.param .b32 _Z13checkDistancePiS_iii_param_3,
	.param .b32 _Z13checkDistancePiS_iii_param_4
)
{
	// Scans i = 0 .. param_2-1 over two arrays of 32-bit integers and
	// returns 0 as soon as
	//     param_0[i] == param_3 / param_4   and   param_1[i] == param_3 % param_4
	// (signed division and remainder). Returns 1 when no such i exists,
	// which includes param_2 < 1.
	.reg .pred 	%p_none, %p_qdiff, %p_hit, %p_more;
	.reg .b16 	%flag;
	.reg .b32 	%cnt, %num, %den, %quo, %rmd, %i, %va, %vb, %ret;
	.reg .b64 	%pa, %pb, %off, %ea, %eb;

	ld.param.u64 	%pa, [_Z13checkDistancePiS_iii_param_0];
	ld.param.u64 	%pb, [_Z13checkDistancePiS_iii_param_1];
	ld.param.u32 	%cnt, [_Z13checkDistancePiS_iii_param_2];
	ld.param.u32 	%num, [_Z13checkDistancePiS_iii_param_3];
	ld.param.u32 	%den, [_Z13checkDistancePiS_iii_param_4];
	div.s32 	%quo, %num, %den;
	rem.s32 	%rmd, %num, %den;
	mov.u32 	%i, 0;
	mov.u16 	%flag, 1;			// result when nothing matches
	setp.lt.s32	%p_none, %cnt, 1;
	@%p_none bra 	chk_pi_done;

chk_pi_scan:
	mul.wide.s32 	%off, %i, 4;		// byte offset of element i
	add.s64 	%ea, %pa, %off;
	ld.u32 	%va, [%ea];
	setp.ne.s32	%p_qdiff, %va, %quo;
	@%p_qdiff bra 	chk_pi_next;

	// First array equals the quotient; compare the second array
	// against the remainder.
	add.s64 	%eb, %pb, %off;
	ld.u32 	%vb, [%eb];
	setp.eq.s32	%p_hit, %vb, %rmd;
	mov.u16 	%flag, 0;			// tentative; kept only if the branch is taken
	@%p_hit bra 	chk_pi_done;

chk_pi_next:
	add.s32 	%i, %i, 1;
	mov.u16 	%flag, 1;
	setp.lt.s32	%p_more, %i, %cnt;
	@%p_more bra 	chk_pi_scan;

chk_pi_done:
	cvt.u32.u16	%ret, %flag;
	st.param.b32	[func_retval0+0], %ret;
	ret;
}

	// .globl	_Z13checkDistancePViS0_iii
.visible .func  (.param .b32 func_retval0) _Z13checkDistancePViS0_iii(
	.param .b64 _Z13checkDistancePViS0_iii_param_0,
	.param .b64 _Z13checkDistancePViS0_iii_param_1,
	.param .b32 _Z13checkDistancePViS0_iii_param_2,
	.param .b32 _Z13checkDistancePViS0_iii_param_3,
	.param .b32 _Z13checkDistancePViS0_iii_param_4
)
{
	// Scans i = 0 .. param_2-1 over two arrays of 32-bit integers, read
	// with volatile loads, and returns 0 as soon as
	//     param_0[i] == param_3 / param_4   and   param_1[i] == param_3 % param_4
	// (signed division and remainder). Returns 1 when no such i exists,
	// which includes param_2 < 1.
	.reg .pred 	%p_none, %p_qdiff, %p_hit, %p_more;
	.reg .b16 	%flag;
	.reg .b32 	%cnt, %num, %den, %quo, %rmd, %i, %va, %vb, %ret;
	.reg .b64 	%pa, %pb, %off, %ea, %eb;

	ld.param.u64 	%pa, [_Z13checkDistancePViS0_iii_param_0];
	ld.param.u64 	%pb, [_Z13checkDistancePViS0_iii_param_1];
	ld.param.u32 	%cnt, [_Z13checkDistancePViS0_iii_param_2];
	ld.param.u32 	%num, [_Z13checkDistancePViS0_iii_param_3];
	ld.param.u32 	%den, [_Z13checkDistancePViS0_iii_param_4];
	div.s32 	%quo, %num, %den;
	rem.s32 	%rmd, %num, %den;
	mov.u32 	%i, 0;
	mov.u16 	%flag, 1;			// result when nothing matches
	setp.lt.s32	%p_none, %cnt, 1;
	@%p_none bra 	chk_pvi_done;

chk_pvi_scan:
	mul.wide.s32 	%off, %i, 4;		// byte offset of element i
	add.s64 	%ea, %pa, %off;
	ld.volatile.u32 	%va, [%ea];
	setp.ne.s32	%p_qdiff, %va, %quo;
	@%p_qdiff bra 	chk_pvi_next;

	// First array equals the quotient; compare the second array
	// against the remainder.
	add.s64 	%eb, %pb, %off;
	ld.volatile.u32 	%vb, [%eb];
	setp.eq.s32	%p_hit, %vb, %rmd;
	mov.u16 	%flag, 0;			// tentative; kept only if the branch is taken
	@%p_hit bra 	chk_pvi_done;

chk_pvi_next:
	add.s32 	%i, %i, 1;
	mov.u16 	%flag, 1;
	setp.lt.s32	%p_more, %i, %cnt;
	@%p_more bra 	chk_pvi_scan;

chk_pvi_done:
	cvt.u32.u16	%ret, %flag;
	st.param.b32	[func_retval0+0], %ret;
	ret;
}

	// .globl	_Z9distancefiiii
.visible .func  (.param .b32 func_retval0) _Z9distancefiiii(
	.param .b32 _Z9distancefiiii_param_0,
	.param .b32 _Z9distancefiiii_param_1,
	.param .b32 _Z9distancefiiii_param_2,
	.param .b32 _Z9distancefiiii_param_3
)
{
	// Returns (param_0 - param_1)^2 + (param_2 - param_3)^2 as an f32.
	// No square root is taken; the result is the squared sum.
	.reg .f32 	%fa, %fb, %fbsq, %total;
	.reg .b32 	%a0, %a1, %b0, %b1, %da, %db;

	// First pair: signed 32-bit difference, converted to float.
	ld.param.u32 	%a0, [_Z9distancefiiii_param_0];
	ld.param.u32 	%a1, [_Z9distancefiiii_param_1];
	sub.s32 	%da, %a0, %a1;
	cvt.rn.f32.s32	%fa, %da;

	// Second pair: same treatment.
	ld.param.u32 	%b0, [_Z9distancefiiii_param_2];
	ld.param.u32 	%b1, [_Z9distancefiiii_param_3];
	sub.s32 	%db, %b0, %b1;
	cvt.rn.f32.s32	%fb, %db;

	// The second square is formed first, then the first square is added
	// to it with a fused multiply-add.
	mul.f32 	%fbsq, %fb, %fb;
	fma.rn.f32 	%total, %fa, %fa, %fbsq;
	st.param.f32	[func_retval0+0], %total;
	ret;
}

	// .globl	_Z14checkDistancefPiS_iii
.visible .func  (.param .b32 func_retval0) _Z14checkDistancefPiS_iii(
	.param .b64 _Z14checkDistancefPiS_iii_param_0,
	.param .b64 _Z14checkDistancefPiS_iii_param_1,
	.param .b32 _Z14checkDistancefPiS_iii_param_2,
	.param .b32 _Z14checkDistancefPiS_iii_param_3,
	.param .b32 _Z14checkDistancefPiS_iii_param_4
)
{
	// Scans i = 0 .. param_2-1 over two arrays of 32-bit integers and
	// returns 0 as soon as
	//     param_0[i] == param_3 / param_4   and   param_1[i] == param_3 % param_4
	// (signed division and remainder). Returns 1 when no such i exists,
	// which includes param_2 < 1.
	.reg .pred 	%p_none, %p_qdiff, %p_hit, %p_more;
	.reg .b16 	%flag;
	.reg .b32 	%cnt, %num, %den, %quo, %rmd, %i, %va, %vb, %ret;
	.reg .b64 	%pa, %pb, %off, %ea, %eb;

	ld.param.u64 	%pa, [_Z14checkDistancefPiS_iii_param_0];
	ld.param.u64 	%pb, [_Z14checkDistancefPiS_iii_param_1];
	ld.param.u32 	%cnt, [_Z14checkDistancefPiS_iii_param_2];
	ld.param.u32 	%num, [_Z14checkDistancefPiS_iii_param_3];
	ld.param.u32 	%den, [_Z14checkDistancefPiS_iii_param_4];
	div.s32 	%quo, %num, %den;
	rem.s32 	%rmd, %num, %den;
	mov.u32 	%i, 0;
	mov.u16 	%flag, 1;			// result when nothing matches
	setp.lt.s32	%p_none, %cnt, 1;
	@%p_none bra 	chkf_pi_done;

chkf_pi_scan:
	mul.wide.s32 	%off, %i, 4;		// byte offset of element i
	add.s64 	%ea, %pa, %off;
	ld.u32 	%va, [%ea];
	setp.ne.s32	%p_qdiff, %va, %quo;
	@%p_qdiff bra 	chkf_pi_next;

	// First array equals the quotient; compare the second array
	// against the remainder.
	add.s64 	%eb, %pb, %off;
	ld.u32 	%vb, [%eb];
	setp.eq.s32	%p_hit, %vb, %rmd;
	mov.u16 	%flag, 0;			// tentative; kept only if the branch is taken
	@%p_hit bra 	chkf_pi_done;

chkf_pi_next:
	add.s32 	%i, %i, 1;
	mov.u16 	%flag, 1;
	setp.lt.s32	%p_more, %i, %cnt;
	@%p_more bra 	chkf_pi_scan;

chkf_pi_done:
	cvt.u32.u16	%ret, %flag;
	st.param.b32	[func_retval0+0], %ret;
	ret;
}

	// .globl	_Z14checkDistancefPViS0_iii
.visible .func  (.param .b32 func_retval0) _Z14checkDistancefPViS0_iii(
	.param .b64 _Z14checkDistancefPViS0_iii_param_0,
	.param .b64 _Z14checkDistancefPViS0_iii_param_1,
	.param .b32 _Z14checkDistancefPViS0_iii_param_2,
	.param .b32 _Z14checkDistancefPViS0_iii_param_3,
	.param .b32 _Z14checkDistancefPViS0_iii_param_4
)
{
	// Scans i = 0 .. param_2-1 over two arrays of 32-bit integers, read
	// with volatile loads, and returns 0 as soon as
	//     param_0[i] == param_3 / param_4   and   param_1[i] == param_3 % param_4
	// (signed division and remainder). Returns 1 when no such i exists,
	// which includes param_2 < 1.
	.reg .pred 	%p_none, %p_qdiff, %p_hit, %p_more;
	.reg .b16 	%flag;
	.reg .b32 	%cnt, %num, %den, %quo, %rmd, %i, %va, %vb, %ret;
	.reg .b64 	%pa, %pb, %off, %ea, %eb;

	ld.param.u64 	%pa, [_Z14checkDistancefPViS0_iii_param_0];
	ld.param.u64 	%pb, [_Z14checkDistancefPViS0_iii_param_1];
	ld.param.u32 	%cnt, [_Z14checkDistancefPViS0_iii_param_2];
	ld.param.u32 	%num, [_Z14checkDistancefPViS0_iii_param_3];
	ld.param.u32 	%den, [_Z14checkDistancefPViS0_iii_param_4];
	div.s32 	%quo, %num, %den;
	rem.s32 	%rmd, %num, %den;
	mov.u32 	%i, 0;
	mov.u16 	%flag, 1;			// result when nothing matches
	setp.lt.s32	%p_none, %cnt, 1;
	@%p_none bra 	chkf_pvi_done;

chkf_pvi_scan:
	mul.wide.s32 	%off, %i, 4;		// byte offset of element i
	add.s64 	%ea, %pa, %off;
	ld.volatile.u32 	%va, [%ea];
	setp.ne.s32	%p_qdiff, %va, %quo;
	@%p_qdiff bra 	chkf_pvi_next;

	// First array equals the quotient; compare the second array
	// against the remainder.
	add.s64 	%eb, %pb, %off;
	ld.volatile.u32 	%vb, [%eb];
	setp.eq.s32	%p_hit, %vb, %rmd;
	mov.u16 	%flag, 0;			// tentative; kept only if the branch is taken
	@%p_hit bra 	chkf_pvi_done;

chkf_pvi_next:
	add.s32 	%i, %i, 1;
	mov.u16 	%flag, 1;
	setp.lt.s32	%p_more, %i, %cnt;
	@%p_more bra 	chkf_pvi_scan;

chkf_pvi_done:
	cvt.u32.u16	%ret, %flag;
	st.param.b32	[func_retval0+0], %ret;
	ret;
}

	// .globl	elt_prod_conj
.visible .entry elt_prod_conj(
	.param .u64 elt_prod_conj_param_0,
	.param .u64 elt_prod_conj_param_1,
	.param .u64 elt_prod_conj_param_2,
	.param .u32 elt_prod_conj_param_3
)
{
	// Element-wise a * conj(b) on pairs of f64 (re, im), divided by its own
	// magnitude.
	//   param_0 : output, 16-byte (f64, f64) elements
	//   param_1 : input a, 16-byte (f64, f64) elements
	//   param_2 : input b, 16-byte (f64, f64) elements
	//   param_3 : element count (signed compare)
	// Element index is %ctaid.x * 256 + %tid.x; threads at or past the
	// count return without touching memory.
	//
	// Change from the generated code: the bar.sync that followed the
	// staging stores has been removed. It sat after the early-exit branch,
	// so in a partially filled CTA only some threads reached it, and every
	// thread reads back only the shared slots it wrote itself.
	.reg .pred 	%p<10>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<41>;
	.reg .b64 	%rd<23>;
	// demoted variable
	.shared .align 16 .b8 elt_prod_conj$__cuda_local_var_44209_45_non_const_sfc[4096];
	// demoted variable
	.shared .align 16 .b8 elt_prod_conj$__cuda_local_var_44210_45_non_const_sc1[4096];
	// demoted variable
	.shared .align 16 .b8 elt_prod_conj$__cuda_local_var_44211_45_non_const_sc2[4096];

	ld.param.u64 	%rd6, [elt_prod_conj_param_0];
	ld.param.u64 	%rd7, [elt_prod_conj_param_1];
	ld.param.u64 	%rd8, [elt_prod_conj_param_2];
	ld.param.u32 	%r3, [elt_prod_conj_param_3];
	mov.u32 	%r4, %ctaid.x;
	shl.b32 	%r5, %r4, 8;			// ctaid.x * 256
	mov.u32 	%r1, %tid.x;
	add.s32 	%r2, %r5, %r1;			// element index
	setp.ge.s32	%p1, %r2, %r3;
	@%p1 bra 	BB6_5;

	// Stage a into sc1[tid] and b into sc2[tid].
	cvta.to.global.u64 	%rd9, %rd7;
	cvt.u64.u32	%rd1, %r1;
	mul.wide.u32 	%rd10, %r1, 16;			// tid * 16 bytes
	mov.u64 	%rd11, elt_prod_conj$__cuda_local_var_44210_45_non_const_sc1;
	add.s64 	%rd2, %rd11, %rd10;
	cvt.s64.s32	%rd3, %r2;
	mul.wide.s32 	%rd12, %r2, 16;			// index * 16 bytes
	add.s64 	%rd13, %rd9, %rd12;
	ld.global.v2.f64 	{%fd6, %fd7}, [%rd13];
	st.shared.v2.f64 	[%rd2], {%fd6, %fd7};
	mov.u64 	%rd14, elt_prod_conj$__cuda_local_var_44211_45_non_const_sc2;
	add.s64 	%rd4, %rd14, %rd10;
	cvta.to.global.u64 	%rd15, %rd8;
	add.s64 	%rd16, %rd15, %rd12;
	ld.global.v2.f64 	{%fd10, %fd11}, [%rd16];
	st.shared.v2.f64 	[%rd4], {%fd10, %fd11};

	// Read back this thread's own slots: b = (fd14, fd15), a = (fd18, fd19).
	ld.shared.v2.f64 	{%fd14, %fd15}, [%rd4];
	ld.shared.v2.f64 	{%fd18, %fd19}, [%rd2];

	// re = a.re*b.re + a.im*b.im ; im = a.im*b.re - a.re*b.im
	mul.f64 	%fd22, %fd19, %fd15;
	fma.rn.f64 	%fd39, %fd18, %fd14, %fd22;
	mul.f64 	%fd23, %fd18, %fd15;
	mul.f64 	%fd24, %fd19, %fd14;
	sub.f64 	%fd2, %fd24, %fd23;
	shl.b64 	%rd17, %rd1, 4;
	mov.u64 	%rd18, elt_prod_conj$__cuda_local_var_44209_45_non_const_sfc;
	add.s64 	%rd5, %rd18, %rd17;
	st.shared.v2.f64 	[%rd5], {%fd39, %fd2};		// sfc[tid] = (re, im)

	// Magnitude by scaling: big * sqrt(1 + (small/big)^2), where big and
	// small are the larger and smaller of |re| and |im|.
	abs.f64 	%fd25, %fd39;
	abs.f64 	%fd26, %fd2;
	setp.gt.f64	%p2, %fd25, %fd26;
	selp.f64	%fd27, %fd25, %fd26, %p2;		// big
	selp.f64	%fd28, %fd26, %fd25, %p2;		// small
	div.rn.f64 	%fd29, %fd28, %fd27;
	fma.rn.f64 	%fd30, %fd29, %fd29, 0d3FF0000000000000;
	sqrt.rn.f64 	%fd31, %fd30;
	mul.f64 	%fd32, %fd27, %fd31;
	// When big is 0, or either operand exceeds the largest finite double
	// (0d7FEFFFFFFFFFFFFF), use big + small instead of the scaled form.
	setp.eq.f64	%p3, %fd27, 0d0000000000000000;
	setp.gt.f64	%p4, %fd27, 0d7FEFFFFFFFFFFFFF;
	or.pred  	%p5, %p3, %p4;
	setp.gt.f64	%p6, %fd28, 0d7FEFFFFFFFFFFFFF;
	or.pred  	%p7, %p5, %p6;
	add.f64 	%fd33, %fd27, %fd28;
	selp.f64	%fd40, %fd33, %fd32, %p7;		// magnitude
	setp.eq.f64	%p8, %fd40, 0d0000000000000000;
	@%p8 bra 	BB6_3;

	// An ordered compare against +inf fails only for NaN, so NaN falls
	// through to BB6_3 and every other nonzero magnitude goes to BB6_4.
	abs.f64 	%fd34, %fd40;
	setp.le.f64	%p9, %fd34, 0d7FF0000000000000;
	@%p9 bra 	BB6_4;

BB6_3:
	// Magnitude is zero or NaN: replace both the magnitude and the real
	// part with 2^-52. 4372995238176751616 is the same bit pattern as
	// 0d3CB0000000000000, written over the first f64 of sfc[tid].
	mov.u64 	%rd19, 4372995238176751616;
	st.shared.u64 	[%rd5], %rd19;
	mov.f64 	%fd40, 0d3CB0000000000000;
	mov.f64 	%fd39, %fd40;

BB6_4:
	// out[index] = (re / magnitude, im / magnitude)
	cvta.to.global.u64 	%rd20, %rd6;
	shl.b64 	%rd21, %rd3, 4;
	add.s64 	%rd22, %rd20, %rd21;
	div.rn.f64 	%fd37, %fd2, %fd40;
	div.rn.f64 	%fd38, %fd39, %fd40;
	st.global.v2.f64 	[%rd22], {%fd38, %fd37};

BB6_5:
	ret;
}

	// .globl	elt_prod_conj_v2
.visible .entry elt_prod_conj_v2(
	.param .u64 elt_prod_conj_v2_param_0,
	.param .u64 elt_prod_conj_v2_param_1,
	.param .u64 elt_prod_conj_v2_param_2,
	.param .u32 elt_prod_conj_v2_param_3
)
{
	// Element-wise a * conj(b) on pairs of f64 (re, im), divided by
	// sqrt(re^2 + im^2).
	//   param_0 : output, 16-byte (f64, f64) elements
	//   param_1 : input a, 16-byte (f64, f64) elements
	//   param_2 : input b, 16-byte (f64, f64) elements
	//   param_3 : element count (signed compare)
	// Element index is %ctaid.x * 256 + %tid.x; threads at or past the
	// count return without touching memory.
	//
	// Changes from the generated code: the bar.sync between the shared
	// store and the shared load has been removed (it sat after the
	// early-exit branch, so only some threads of a partially filled CTA
	// reached it, and each thread reads back only its own slot), and a
	// redundant bra.uni to the immediately following label was dropped.
	.reg .pred 	%p<5>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<30>;
	.reg .b64 	%rd<17>;
	// demoted variable
	.shared .align 16 .b8 elt_prod_conj_v2$__cuda_local_var_44242_45_non_const_sfc[4096];

	ld.param.u64 	%rd3, [elt_prod_conj_v2_param_0];
	ld.param.u64 	%rd4, [elt_prod_conj_v2_param_1];
	ld.param.u64 	%rd5, [elt_prod_conj_v2_param_2];
	ld.param.u32 	%r3, [elt_prod_conj_v2_param_3];
	mov.u32 	%r4, %ctaid.x;
	shl.b32 	%r5, %r4, 8;			// ctaid.x * 256
	mov.u32 	%r1, %tid.x;
	add.s32 	%r2, %r5, %r1;			// element index
	setp.ge.s32	%p1, %r2, %r3;
	@%p1 bra 	BB7_4;

	// Load b = (fd6, fd7) and a = (fd10, fd11).
	cvta.to.global.u64 	%rd6, %rd4;
	cvt.s64.s32	%rd1, %r2;
	mul.wide.s32 	%rd7, %r2, 16;			// index * 16 bytes
	add.s64 	%rd8, %rd6, %rd7;
	cvta.to.global.u64 	%rd9, %rd5;
	add.s64 	%rd10, %rd9, %rd7;
	ld.global.v2.f64 	{%fd6, %fd7}, [%rd10];
	ld.global.v2.f64 	{%fd10, %fd11}, [%rd8];

	// re = a.re*b.re + a.im*b.im ; im = a.im*b.re - a.re*b.im
	mul.f64 	%fd14, %fd11, %fd7;
	mul.f64 	%fd15, %fd10, %fd7;
	mul.f64 	%fd16, %fd11, %fd6;
	mul.wide.u32 	%rd11, %r1, 16;			// tid * 16 bytes
	mov.u64 	%rd12, elt_prod_conj_v2$__cuda_local_var_44242_45_non_const_sfc;
	add.s64 	%rd2, %rd12, %rd11;
	sub.f64 	%fd17, %fd16, %fd15;
	fma.rn.f64 	%fd18, %fd10, %fd6, %fd14;
	st.shared.v2.f64 	[%rd2], {%fd18, %fd17};		// sfc[tid] = (re, im)

	// Read back this thread's own slot and form the magnitude.
	ld.shared.v2.f64 	{%fd19, %fd20}, [%rd2];
	mov.f64 	%fd28, %fd19;
	mul.f64 	%fd21, %fd20, %fd20;
	fma.rn.f64 	%fd22, %fd19, %fd19, %fd21;
	sqrt.rn.f64 	%fd29, %fd22;
	// gtu against +inf is true only for NaN; also catch an exact zero.
	abs.f64 	%fd23, %fd29;
	setp.gtu.f64	%p2, %fd23, 0d7FF0000000000000;
	setp.eq.f64	%p3, %fd29, 0d0000000000000000;
	or.pred  	%p4, %p2, %p3;
	@!%p4 bra 	BB7_3;

BB7_2:
	// Magnitude is zero or NaN: replace both the magnitude and the real
	// part with 2^-52. 4372995238176751616 is the same bit pattern as
	// 0d3CB0000000000000, written over the first f64 of sfc[tid].
	mov.u64 	%rd13, 4372995238176751616;
	st.shared.u64 	[%rd2], %rd13;
	mov.f64 	%fd29, 0d3CB0000000000000;
	mov.f64 	%fd28, %fd29;

BB7_3:
	// out[index] = (re / magnitude, im / magnitude)
	cvta.to.global.u64 	%rd14, %rd3;
	shl.b64 	%rd15, %rd1, 4;
	add.s64 	%rd16, %rd14, %rd15;
	div.rn.f64 	%fd26, %fd20, %fd29;
	div.rn.f64 	%fd27, %fd28, %fd29;
	st.global.v2.f64 	[%rd16], {%fd27, %fd26};

BB7_4:
	ret;
}

	// .globl	elt_prod_conj_v3
.visible .entry elt_prod_conj_v3(
	.param .u64 elt_prod_conj_v3_param_0,
	.param .u64 elt_prod_conj_v3_param_1,
	.param .u64 elt_prod_conj_v3_param_2,
	.param .u32 elt_prod_conj_v3_param_3
)
{
	// Element-wise a * conj(b) on pairs of f64 (re, im), divided by its own
	// magnitude. Works entirely in registers.
	//   param_0 : output, 16-byte (f64, f64) elements
	//   param_1 : input a, 16-byte (f64, f64) elements
	//   param_2 : input b, 16-byte (f64, f64) elements
	//   param_3 : element count (signed compare)
	// Element index is %ctaid.x * 256 + %tid.x.
	.reg .pred 	%p_oob, %p_nan, %p_zero, %p_bad, %p_regt, %p_big0;
	.reg .pred 	%p_bigovf, %p_t, %p_smallovf, %p_usesum, %p_mag0;
	.reg .b32 	%cnt, %cta, %base, %t, %idx;
	.reg .f64 	%ax, %ay, %bx, %by, %ayby, %axby, %aybx, %re, %im;
	.reg .f64 	%imsq, %sumsq, %mag, %amag, %are, %aim, %big, %small;
	.reg .f64 	%q, %q2p1, %rt, %scaled, %bsum, %den, %ore, %oim;
	.reg .b64 	%pout, %pa, %pb, %ga, %gb, %gout, %boff, %ea, %eb, %eout;

	ld.param.u64 	%pout, [elt_prod_conj_v3_param_0];
	ld.param.u64 	%pa, [elt_prod_conj_v3_param_1];
	ld.param.u64 	%pb, [elt_prod_conj_v3_param_2];
	ld.param.u32 	%cnt, [elt_prod_conj_v3_param_3];
	mov.u32 	%t, %tid.x;
	mov.u32 	%cta, %ctaid.x;
	shl.b32 	%base, %cta, 8;			// ctaid.x * 256
	add.s32 	%idx, %base, %t;
	setp.ge.s32	%p_oob, %idx, %cnt;
	@%p_oob bra 	epc3_exit;

	// One byte offset serves both inputs and the output (16 bytes each).
	mul.wide.s32 	%boff, %idx, 16;
	cvta.to.global.u64 	%ga, %pa;
	add.s64 	%ea, %ga, %boff;
	cvta.to.global.u64 	%gb, %pb;
	add.s64 	%eb, %gb, %boff;
	ld.global.v2.f64 	{%bx, %by}, [%eb];
	ld.global.v2.f64 	{%ax, %ay}, [%ea];

	// re = ax*bx + ay*by ; im = ay*bx - ax*by
	mul.f64 	%ayby, %ay, %by;
	fma.rn.f64 	%re, %ax, %bx, %ayby;
	mul.f64 	%axby, %ax, %by;
	mul.f64 	%aybx, %ay, %bx;
	sub.f64 	%im, %aybx, %axby;

	// Direct magnitude: sqrt(re^2 + im^2).
	mul.f64 	%imsq, %im, %im;
	fma.rn.f64 	%sumsq, %re, %re, %imsq;
	sqrt.rn.f64 	%mag, %sumsq;
	// gtu against +inf is true only for NaN; also catch an exact zero.
	abs.f64 	%amag, %mag;
	setp.gtu.f64	%p_nan, %amag, 0d7FF0000000000000;
	setp.eq.f64	%p_zero, %mag, 0d0000000000000000;
	or.pred  	%p_bad, %p_nan, %p_zero;
	@!%p_bad bra 	epc3_norm;

	// Direct form gave 0 or NaN: recompute as big * sqrt(1 + (small/big)^2)
	// with big/small the larger/smaller of |re| and |im|.
	abs.f64 	%are, %re;
	abs.f64 	%aim, %im;
	setp.gt.f64	%p_regt, %are, %aim;
	selp.f64	%big, %are, %aim, %p_regt;
	selp.f64	%small, %aim, %are, %p_regt;
	div.rn.f64 	%q, %small, %big;
	fma.rn.f64 	%q2p1, %q, %q, 0d3FF0000000000000;
	sqrt.rn.f64 	%rt, %q2p1;
	mul.f64 	%scaled, %big, %rt;
	// big == 0, or either operand above the largest finite double:
	// fall back to big + small.
	setp.eq.f64	%p_big0, %big, 0d0000000000000000;
	setp.gt.f64	%p_bigovf, %big, 0d7FEFFFFFFFFFFFFF;
	or.pred  	%p_t, %p_big0, %p_bigovf;
	setp.gt.f64	%p_smallovf, %small, 0d7FEFFFFFFFFFFFFF;
	or.pred  	%p_usesum, %p_t, %p_smallovf;
	add.f64 	%bsum, %big, %small;
	selp.f64	%mag, %bsum, %scaled, %p_usesum;

epc3_norm:
	// Divisor is the magnitude, or 2^-52 (0d3CB0000000000000) when the
	// magnitude is exactly zero.
	cvta.to.global.u64 	%gout, %pout;
	setp.eq.f64	%p_mag0, %mag, 0d0000000000000000;
	selp.f64	%den, 0d3CB0000000000000, %mag, %p_mag0;
	add.s64 	%eout, %gout, %boff;
	div.rn.f64 	%oim, %im, %den;
	div.rn.f64 	%ore, %re, %den;
	st.global.v2.f64 	[%eout], {%ore, %oim};

epc3_exit:
	ret;
}

	// .globl	reduce_max_final
.visible .entry reduce_max_final(
	.param .u64 reduce_max_final_param_0,
	.param .u64 reduce_max_final_param_1,
	.param .u64 reduce_max_final_param_2,
	.param .u32 reduce_max_final_param_3,
	.param .u32 reduce_max_final_param_4
)
{
	// Per-CTA maximum of f64 values, carrying a u32 index with each maximum.
	//   param_0 : f64 input values
	//   param_1 : f64 output, one value per CTA, written at [%ctaid.x]
	//   param_2 : u32 index array; read at the element position of each
	//             new maximum and written at [%ctaid.x] with the winner
	//   param_3 : element count (unsigned compares)
	//   param_4 : block size; sets the strides and which reduction steps run
	// The running maximum starts at 0.0 and is replaced only by strictly
	// greater (ordered) values.
	//
	// Changes from the generated code:
	//   * %r37 (the running index) is initialised to 0. It was previously
	//     stored to shared memory without ever being written when no
	//     element exceeded 0.0 or the loop did not execute.
	//   * The final bar.sync is now executed by every thread. It used to
	//     sit inside the tid <= 31 region, so only part of the CTA
	//     reached it.
	// NOTE(review): the ">= 512" step reads [%rd3+2048] and [%rd4+1024],
	// which lie beyond the 2048- and 1024-byte shared arrays declared
	// below. Confirm that param_4 is never 512 or more.
	// NOTE(review): param_2 is read by element index and written by CTA
	// index. With more than one CTA those ranges can overlap; confirm the
	// launch configuration.
	.reg .pred 	%p<29>;
	.reg .b32 	%r<38>;
	.reg .f64 	%fd<78>;
	.reg .b64 	%rd<30>;
	// demoted variable
	.shared .align 8 .b8 reduce_max_final$__cuda_local_var_44307_33_non_const_sdata[2048];
	// demoted variable
	.shared .align 4 .b8 reduce_max_final$__cuda_local_var_44308_30_non_const_idxData[1024];

	ld.param.u64 	%rd5, [reduce_max_final_param_0];
	ld.param.u64 	%rd6, [reduce_max_final_param_1];
	ld.param.u64 	%rd7, [reduce_max_final_param_2];
	ld.param.u32 	%r11, [reduce_max_final_param_3];
	ld.param.u32 	%r12, [reduce_max_final_param_4];
	mov.u32 	%r14, %tid.x;
	shl.b32 	%r15, %r12, 1;			// 2 * block size
	mov.u32 	%r16, %ctaid.x;
	mad.lo.s32 	%r36, %r16, %r15, %r14;		// first element for this thread
	mov.f64 	%fd74, 0d0000000000000000;
	mov.f64 	%fd77, %fd74;
	mov.u32 	%r37, 0;			// defined index even if nothing beats 0.0
	setp.ge.u32	%p1, %r36, %r11;
	@%p1 bra 	BB9_7;

BB9_1:
	// Loop body: examine element %r36, then element %r36 + block size.
	mov.f64 	%fd60, %fd77;
	mov.f64 	%fd1, %fd60;
	cvta.to.global.u64 	%rd8, %rd5;
	cvt.u64.u32	%rd1, %r36;
	mul.wide.u32 	%rd9, %r36, 8;
	add.s64 	%rd10, %rd8, %rd9;
	ld.global.f64 	%fd2, [%rd10];
	setp.geu.f64	%p2, %fd1, %fd2;		// keep current max unless max < value
	mov.f64 	%fd75, %fd1;
	@%p2 bra 	BB9_3;

	// New maximum: take the index stored alongside this element.
	cvta.to.global.u64 	%rd11, %rd7;
	shl.b64 	%rd12, %rd1, 2;
	add.s64 	%rd13, %rd11, %rd12;
	ld.global.u32 	%r37, [%rd13];
	mov.f64 	%fd75, %fd2;

BB9_3:
	mov.f64 	%fd3, %fd75;
	add.s32 	%r6, %r36, %r12;
	setp.ge.u32	%p3, %r6, %r11;
	mov.f64 	%fd76, %fd3;
	@%p3 bra 	BB9_6;

	cvt.u64.u32	%rd2, %r6;
	mul.wide.u32 	%rd15, %r6, 8;
	add.s64 	%rd16, %rd8, %rd15;
	ld.global.f64 	%fd4, [%rd16];
	setp.geu.f64	%p4, %fd3, %fd4;
	mov.f64 	%fd59, %fd3;
	mov.f64 	%fd76, %fd59;
	@%p4 bra 	BB9_6;

	cvta.to.global.u64 	%rd17, %rd7;
	shl.b64 	%rd18, %rd2, 2;
	add.s64 	%rd19, %rd17, %rd18;
	ld.global.u32 	%r37, [%rd19];
	mov.f64 	%fd76, %fd4;

BB9_6:
	// Advance by (number of CTAs) * 2 * block size.
	mov.f64 	%fd77, %fd76;
	mov.u32 	%r18, %nctaid.x;
	mad.lo.s32 	%r36, %r18, %r15, %r36;
	setp.lt.u32	%p5, %r36, %r11;
	mov.f64 	%fd74, %fd77;
	@%p5 bra 	BB9_1;

BB9_7:
	// Publish this thread's result: sdata[tid] = max, idxData[tid] = index.
	mov.f64 	%fd72, %fd74;
	mul.wide.u32 	%rd20, %r14, 8;
	mov.u64 	%rd21, reduce_max_final$__cuda_local_var_44307_33_non_const_sdata;
	add.s64 	%rd3, %rd21, %rd20;
	st.shared.f64 	[%rd3], %fd72;
	mul.wide.u32 	%rd22, %r14, 4;
	mov.u64 	%rd23, reduce_max_final$__cuda_local_var_44308_30_non_const_idxData;
	add.s64 	%rd4, %rd23, %rd22;
	st.shared.u32 	[%rd4], %r37;
	bar.sync 	0;

	// Step for block size >= 512: threads 0..255 fold in slot tid + 256.
	setp.lt.s32	%p6, %r12, 512;
	@%p6 bra 	BB9_12;

	setp.gt.u32	%p7, %r14, 255;
	mov.f64 	%fd73, %fd72;
	@%p7 bra 	BB9_11;

	ld.shared.f64 	%fd7, [%rd3+2048];
	setp.geu.f64	%p8, %fd72, %fd7;
	mov.f64 	%fd57, %fd72;
	mov.f64 	%fd73, %fd57;
	@%p8 bra 	BB9_11;

	st.shared.f64 	[%rd3], %fd7;
	ld.shared.u32 	%r21, [%rd4+1024];
	st.shared.u32 	[%rd4], %r21;
	mov.f64 	%fd73, %fd7;

BB9_11:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB9_12:
	// Step for block size >= 256: threads 0..127 fold in slot tid + 128.
	mov.f64 	%fd70, %fd72;
	setp.lt.s32	%p9, %r12, 256;
	@%p9 bra 	BB9_17;

	setp.gt.u32	%p10, %r14, 127;
	mov.f64 	%fd71, %fd70;
	@%p10 bra 	BB9_16;

	ld.shared.f64 	%fd10, [%rd3+1024];
	setp.geu.f64	%p11, %fd70, %fd10;
	mov.f64 	%fd53, %fd70;
	mov.f64 	%fd71, %fd53;
	@%p11 bra 	BB9_16;

	st.shared.f64 	[%rd3], %fd10;
	ld.shared.u32 	%r23, [%rd4+512];
	st.shared.u32 	[%rd4], %r23;
	mov.f64 	%fd71, %fd10;

BB9_16:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB9_17:
	// Step for block size >= 128: threads 0..63 fold in slot tid + 64.
	mov.f64 	%fd68, %fd70;
	setp.lt.s32	%p12, %r12, 128;
	@%p12 bra 	BB9_22;

	setp.gt.u32	%p13, %r14, 63;
	mov.f64 	%fd69, %fd68;
	@%p13 bra 	BB9_21;

	ld.shared.f64 	%fd13, [%rd3+512];
	setp.geu.f64	%p14, %fd68, %fd13;
	mov.f64 	%fd49, %fd68;
	mov.f64 	%fd69, %fd49;
	@%p14 bra 	BB9_21;

	st.shared.f64 	[%rd3], %fd13;
	ld.shared.u32 	%r25, [%rd4+256];
	st.shared.u32 	[%rd4], %r25;
	mov.f64 	%fd69, %fd13;

BB9_21:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB9_22:
	// Remaining steps (offsets 32, 16, 8, 4, 2, 1) are done by threads
	// 0..31 only, using volatile shared accesses and no barrier between
	// steps. Other threads go straight to the final barrier.
	mov.f64 	%fd67, %fd68;
	setp.gt.u32	%p15, %r14, 31;
	@%p15 bra 	BB9_41;

	setp.lt.s32	%p16, %r12, 64;
	@%p16 bra 	BB9_26;

	ld.volatile.shared.f64 	%fd28, [%rd3+256];
	setp.geu.f64	%p17, %fd67, %fd28;
	@%p17 bra 	BB9_26;

	ld.volatile.shared.f64 	%fd67, [%rd3+256];
	st.volatile.shared.f64 	[%rd3], %fd67;
	ld.volatile.shared.u32 	%r27, [%rd4+128];
	st.volatile.shared.u32 	[%rd4], %r27;

BB9_26:
	mov.f64 	%fd66, %fd67;
	setp.lt.s32	%p18, %r12, 32;
	@%p18 bra 	BB9_29;

	ld.volatile.shared.f64 	%fd29, [%rd3+128];
	setp.geu.f64	%p19, %fd66, %fd29;
	@%p19 bra 	BB9_29;

	ld.volatile.shared.f64 	%fd66, [%rd3+128];
	st.volatile.shared.f64 	[%rd3], %fd66;
	ld.volatile.shared.u32 	%r28, [%rd4+64];
	st.volatile.shared.u32 	[%rd4], %r28;

BB9_29:
	mov.f64 	%fd65, %fd66;
	setp.lt.s32	%p20, %r12, 16;
	@%p20 bra 	BB9_32;

	ld.volatile.shared.f64 	%fd30, [%rd3+64];
	setp.geu.f64	%p21, %fd65, %fd30;
	@%p21 bra 	BB9_32;

	ld.volatile.shared.f64 	%fd65, [%rd3+64];
	st.volatile.shared.f64 	[%rd3], %fd65;
	ld.volatile.shared.u32 	%r29, [%rd4+32];
	st.volatile.shared.u32 	[%rd4], %r29;

BB9_32:
	mov.f64 	%fd64, %fd65;
	setp.lt.s32	%p22, %r12, 8;
	@%p22 bra 	BB9_35;

	ld.volatile.shared.f64 	%fd31, [%rd3+32];
	setp.geu.f64	%p23, %fd64, %fd31;
	@%p23 bra 	BB9_35;

	ld.volatile.shared.f64 	%fd64, [%rd3+32];
	st.volatile.shared.f64 	[%rd3], %fd64;
	ld.volatile.shared.u32 	%r30, [%rd4+16];
	st.volatile.shared.u32 	[%rd4], %r30;

BB9_35:
	mov.f64 	%fd63, %fd64;
	setp.lt.s32	%p24, %r12, 4;
	@%p24 bra 	BB9_38;

	ld.volatile.shared.f64 	%fd32, [%rd3+16];
	setp.geu.f64	%p25, %fd63, %fd32;
	@%p25 bra 	BB9_38;

	ld.volatile.shared.f64 	%fd63, [%rd3+16];
	st.volatile.shared.f64 	[%rd3], %fd63;
	ld.volatile.shared.u32 	%r31, [%rd4+8];
	st.volatile.shared.u32 	[%rd4], %r31;

BB9_38:
	setp.lt.s32	%p26, %r12, 2;
	@%p26 bra 	BB9_41;

	ld.volatile.shared.f64 	%fd33, [%rd3+8];
	setp.geu.f64	%p27, %fd63, %fd33;
	@%p27 bra 	BB9_41;

	ld.volatile.shared.f64 	%fd34, [%rd3+8];
	st.volatile.shared.f64 	[%rd3], %fd34;
	ld.volatile.shared.u32 	%r32, [%rd4+4];
	st.volatile.shared.u32 	[%rd4], %r32;

BB9_41:
	// Every thread of the CTA arrives here, so the barrier is uniform.
	bar.sync 	0;
	setp.ne.s32	%p28, %r14, 0;
	@%p28 bra 	BB9_44;

	// Thread 0 writes the CTA result: value to param_1[ctaid.x],
	// index to param_2[ctaid.x].
	ld.shared.f64 	%fd35, [reduce_max_final$__cuda_local_var_44307_33_non_const_sdata];
	cvta.to.global.u64 	%rd24, %rd6;
	mul.wide.u32 	%rd25, %r16, 8;
	add.s64 	%rd26, %rd24, %rd25;
	st.global.f64 	[%rd26], %fd35;
	ld.shared.u32 	%r35, [reduce_max_final$__cuda_local_var_44308_30_non_const_idxData];
	cvta.to.global.u64 	%rd27, %rd7;
	mul.wide.u32 	%rd28, %r16, 4;
	add.s64 	%rd29, %rd27, %rd28;
	st.global.u32 	[%rd29], %r35;

BB9_44:
	ret;
}

	// .globl	reduce_max_main
.visible .entry reduce_max_main(
	.param .u64 reduce_max_main_param_0,
	.param .u64 reduce_max_main_param_1,
	.param .u64 reduce_max_main_param_2,
	.param .u32 reduce_max_main_param_3,
	.param .u32 reduce_max_main_param_4
)
{
	// Per-CTA maximum of f64 values together with the element position at
	// which the maximum was found.
	//   param_0 : f64 input values
	//   param_1 : f64 output, one value per CTA, written at [%ctaid.x]
	//   param_2 : u32 output, winning element index, written at [%ctaid.x]
	//   param_3 : element count (unsigned compares)
	//   param_4 : block size; sets the strides and which reduction steps run
	// The running maximum starts at 0.0 and is replaced only by strictly
	// greater (ordered) values.
	//
	// Changes from the generated code:
	//   * %r36 and %r37 (the running index) are initialised to 0. %r37 was
	//     read on the first loop iteration, and %r36 was stored to shared
	//     memory when the loop did not execute, before either had been
	//     written.
	//   * The final bar.sync is now executed by every thread. It used to
	//     sit inside the tid <= 31 region, so only part of the CTA
	//     reached it.
	// NOTE(review): the ">= 512" step reads [%rd1+2048] and [%rd2+1024],
	// which lie beyond the 2048- and 1024-byte shared arrays declared
	// below. Confirm that param_4 is never 512 or more.
	.reg .pred 	%p<29>;
	.reg .b32 	%r<39>;
	.reg .f64 	%fd<75>;
	.reg .b64 	%rd<22>;
	// demoted variable
	.shared .align 8 .b8 reduce_max_main$__cuda_local_var_44442_33_non_const_sdata[2048];
	// demoted variable
	.shared .align 4 .b8 reduce_max_main$__cuda_local_var_44443_30_non_const_idxData[1024];

	ld.param.u64 	%rd3, [reduce_max_main_param_0];
	ld.param.u64 	%rd4, [reduce_max_main_param_1];
	ld.param.u64 	%rd5, [reduce_max_main_param_2];
	ld.param.u32 	%r10, [reduce_max_main_param_3];
	ld.param.u32 	%r11, [reduce_max_main_param_4];
	mov.u32 	%r13, %tid.x;
	mov.u32 	%r14, %ctaid.x;
	mad.lo.s32 	%r33, %r14, %r11, %r13;		// first element for this thread
	mov.f64 	%fd72, 0d0000000000000000;
	mov.f64 	%fd73, %fd72;
	mov.u32 	%r36, 0;			// index if the loop never runs
	mov.u32 	%r37, 0;			// index carried into the first iteration
	setp.ge.u32	%p1, %r33, %r10;
	@%p1 bra 	BB10_4;

BB10_1:
	// Loop body: examine element %r33, then element %r33 + block size.
	mov.f64 	%fd1, %fd73;
	mov.u32 	%r2, %r37;
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r33, 8;
	add.s64 	%rd8, %rd6, %rd7;
	ld.global.f64 	%fd27, [%rd8];
	setp.lt.f64	%p2, %fd1, %fd27;
	selp.f64	%fd74, %fd27, %fd1, %p2;		// new max
	selp.b32	%r38, %r33, %r2, %p2;		// and its element position
	add.s32 	%r5, %r33, %r11;
	setp.ge.u32	%p3, %r5, %r10;
	@%p3 bra 	BB10_3;

	mul.wide.u32 	%rd10, %r5, 8;
	add.s64 	%rd11, %rd6, %rd10;
	ld.global.f64 	%fd28, [%rd11];
	setp.lt.f64	%p4, %fd74, %fd28;
	selp.f64	%fd74, %fd28, %fd74, %p4;
	selp.b32	%r38, %r5, %r38, %p4;

BB10_3:
	// Advance by (number of CTAs) * block size.
	mov.f64 	%fd73, %fd74;
	mov.u32 	%r37, %r38;
	mov.u32 	%r15, %nctaid.x;
	mad.lo.s32 	%r33, %r15, %r11, %r33;
	setp.lt.u32	%p5, %r33, %r10;
	mov.u32 	%r36, %r37;
	mov.f64 	%fd72, %fd73;
	@%p5 bra 	BB10_1;

BB10_4:
	// Publish this thread's result: sdata[tid] = max, idxData[tid] = index.
	mov.f64 	%fd70, %fd72;
	mul.wide.u32 	%rd12, %r13, 8;
	mov.u64 	%rd13, reduce_max_main$__cuda_local_var_44442_33_non_const_sdata;
	add.s64 	%rd1, %rd13, %rd12;
	st.shared.f64 	[%rd1], %fd70;
	mul.wide.u32 	%rd14, %r13, 4;
	mov.u64 	%rd15, reduce_max_main$__cuda_local_var_44443_30_non_const_idxData;
	add.s64 	%rd2, %rd15, %rd14;
	st.shared.u32 	[%rd2], %r36;
	bar.sync 	0;

	// Step for block size >= 512: threads 0..255 fold in slot tid + 256.
	setp.lt.s32	%p6, %r11, 512;
	@%p6 bra 	BB10_9;

	setp.gt.u32	%p7, %r13, 255;
	mov.f64 	%fd71, %fd70;
	@%p7 bra 	BB10_8;

	ld.shared.f64 	%fd6, [%rd1+2048];
	setp.geu.f64	%p8, %fd70, %fd6;
	mov.f64 	%fd41, %fd70;
	mov.f64 	%fd71, %fd41;
	@%p8 bra 	BB10_8;

	st.shared.f64 	[%rd1], %fd6;
	ld.shared.u32 	%r18, [%rd2+1024];
	st.shared.u32 	[%rd2], %r18;
	mov.f64 	%fd71, %fd6;

BB10_8:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB10_9:
	// Step for block size >= 256: threads 0..127 fold in slot tid + 128.
	mov.f64 	%fd68, %fd70;
	setp.lt.s32	%p9, %r11, 256;
	@%p9 bra 	BB10_14;

	setp.gt.u32	%p10, %r13, 127;
	mov.f64 	%fd69, %fd68;
	@%p10 bra 	BB10_13;

	ld.shared.f64 	%fd9, [%rd1+1024];
	setp.geu.f64	%p11, %fd68, %fd9;
	mov.f64 	%fd45, %fd68;
	mov.f64 	%fd69, %fd45;
	@%p11 bra 	BB10_13;

	st.shared.f64 	[%rd1], %fd9;
	ld.shared.u32 	%r20, [%rd2+512];
	st.shared.u32 	[%rd2], %r20;
	mov.f64 	%fd69, %fd9;

BB10_13:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB10_14:
	// Step for block size >= 128: threads 0..63 fold in slot tid + 64.
	mov.f64 	%fd66, %fd68;
	setp.lt.s32	%p12, %r11, 128;
	@%p12 bra 	BB10_19;

	setp.gt.u32	%p13, %r13, 63;
	mov.f64 	%fd67, %fd66;
	@%p13 bra 	BB10_18;

	ld.shared.f64 	%fd12, [%rd1+512];
	setp.geu.f64	%p14, %fd66, %fd12;
	mov.f64 	%fd49, %fd66;
	mov.f64 	%fd67, %fd49;
	@%p14 bra 	BB10_18;

	st.shared.f64 	[%rd1], %fd12;
	ld.shared.u32 	%r22, [%rd2+256];
	st.shared.u32 	[%rd2], %r22;
	mov.f64 	%fd67, %fd12;

BB10_18:
	mov.f64 	%fd66, %fd67;
	bar.sync 	0;

BB10_19:
	// Remaining steps (offsets 32, 16, 8, 4, 2, 1) are done by threads
	// 0..31 only, using volatile shared accesses and no barrier between
	// steps. Other threads go straight to the final barrier.
	mov.f64 	%fd65, %fd66;
	setp.gt.u32	%p15, %r13, 31;
	@%p15 bra 	BB10_38;

	setp.lt.s32	%p16, %r11, 64;
	@%p16 bra 	BB10_23;

	ld.volatile.shared.f64 	%fd29, [%rd1+256];
	setp.geu.f64	%p17, %fd65, %fd29;
	@%p17 bra 	BB10_23;

	ld.volatile.shared.f64 	%fd65, [%rd1+256];
	st.volatile.shared.f64 	[%rd1], %fd65;
	ld.volatile.shared.u32 	%r24, [%rd2+128];
	st.volatile.shared.u32 	[%rd2], %r24;

BB10_23:
	mov.f64 	%fd64, %fd65;
	setp.lt.s32	%p18, %r11, 32;
	@%p18 bra 	BB10_26;

	ld.volatile.shared.f64 	%fd30, [%rd1+128];
	setp.geu.f64	%p19, %fd64, %fd30;
	@%p19 bra 	BB10_26;

	ld.volatile.shared.f64 	%fd64, [%rd1+128];
	st.volatile.shared.f64 	[%rd1], %fd64;
	ld.volatile.shared.u32 	%r25, [%rd2+64];
	st.volatile.shared.u32 	[%rd2], %r25;

BB10_26:
	mov.f64 	%fd63, %fd64;
	setp.lt.s32	%p20, %r11, 16;
	@%p20 bra 	BB10_29;

	ld.volatile.shared.f64 	%fd31, [%rd1+64];
	setp.geu.f64	%p21, %fd63, %fd31;
	@%p21 bra 	BB10_29;

	ld.volatile.shared.f64 	%fd63, [%rd1+64];
	st.volatile.shared.f64 	[%rd1], %fd63;
	ld.volatile.shared.u32 	%r26, [%rd2+32];
	st.volatile.shared.u32 	[%rd2], %r26;

BB10_29:
	mov.f64 	%fd62, %fd63;
	setp.lt.s32	%p22, %r11, 8;
	@%p22 bra 	BB10_32;

	ld.volatile.shared.f64 	%fd32, [%rd1+32];
	setp.geu.f64	%p23, %fd62, %fd32;
	@%p23 bra 	BB10_32;

	ld.volatile.shared.f64 	%fd62, [%rd1+32];
	st.volatile.shared.f64 	[%rd1], %fd62;
	ld.volatile.shared.u32 	%r27, [%rd2+16];
	st.volatile.shared.u32 	[%rd2], %r27;

BB10_32:
	mov.f64 	%fd61, %fd62;
	setp.lt.s32	%p24, %r11, 4;
	@%p24 bra 	BB10_35;

	ld.volatile.shared.f64 	%fd33, [%rd1+16];
	setp.geu.f64	%p25, %fd61, %fd33;
	@%p25 bra 	BB10_35;

	ld.volatile.shared.f64 	%fd61, [%rd1+16];
	st.volatile.shared.f64 	[%rd1], %fd61;
	ld.volatile.shared.u32 	%r28, [%rd2+8];
	st.volatile.shared.u32 	[%rd2], %r28;

BB10_35:
	setp.lt.s32	%p26, %r11, 2;
	@%p26 bra 	BB10_38;

	ld.volatile.shared.f64 	%fd34, [%rd1+8];
	setp.geu.f64	%p27, %fd61, %fd34;
	@%p27 bra 	BB10_38;

	ld.volatile.shared.f64 	%fd35, [%rd1+8];
	st.volatile.shared.f64 	[%rd1], %fd35;
	ld.volatile.shared.u32 	%r29, [%rd2+4];
	st.volatile.shared.u32 	[%rd2], %r29;

BB10_38:
	// Every thread of the CTA arrives here, so the barrier is uniform.
	bar.sync 	0;
	setp.ne.s32	%p28, %r13, 0;
	@%p28 bra 	BB10_41;

	// Thread 0 writes the CTA result: value to param_1[ctaid.x],
	// index to param_2[ctaid.x].
	ld.shared.f64 	%fd36, [reduce_max_main$__cuda_local_var_44442_33_non_const_sdata];
	cvta.to.global.u64 	%rd16, %rd4;
	mul.wide.u32 	%rd17, %r14, 8;
	add.s64 	%rd18, %rd16, %rd17;
	st.global.f64 	[%rd18], %fd36;
	ld.shared.u32 	%r32, [reduce_max_main$__cuda_local_var_44443_30_non_const_idxData];
	cvta.to.global.u64 	%rd19, %rd5;
	mul.wide.u32 	%rd20, %r14, 4;
	add.s64 	%rd21, %rd19, %rd20;
	st.global.u32 	[%rd21], %r32;

BB10_41:
	ret;
}

	// .globl	reduce_max_filter_final
// -----------------------------------------------------------------------
// reduce_max_filter_final
//
// Per-CTA "maximum value + associated index" reduction over an f64 array.
// A candidate is rejected when its index, split into row = idx / param_4
// and col = idx % param_4, matches the (row, col) of any entry of an
// exclusion list that is first copied into shared memory.
//
// Parameters (as used by the code below):
//   param_0 (u64)  address of the f64 value array (read only here).
//   param_1 (u64)  address of an f64 array; thread 0 stores this CTA's
//                  maximum at element %ctaid.x.
//   param_2 (u64)  address of a u32 array; element i is read as the index
//                  that belongs to value i, and thread 0 stores the CTA's
//                  winning index at element %ctaid.x (same buffer is both
//                  read and written).
//   param_3 (u32)  element count; positions >= param_3 are never loaded.
//   param_4 (u32)  divisor used to split an index into row / column.
//   param_5 (u32)  distance between the two elements a thread loads per
//                  loop iteration, and the value tested against 512..2 to
//                  enable the reduction stages -- presumably the CTA size
//                  (blockDim.x); TODO confirm against the launch code.
//   param_6 (u64)  address of the u32 exclusion list; when the grid has a
//                  single CTA, thread 0 also stores the winning index at
//                  element param_7.
//   param_7 (u32)  number of entries in the exclusion list.
//
// The running maximum starts at 0.0 and a candidate only replaces it when
// "current < candidate" holds (setp.geu skips otherwise), so values <= 0.0
// and NaNs are never selected.
//
// NOTE(review): the index registers %r160 / %r163 are read on the first
// loop iteration before anything writes them, and %r157 is stored to shared
// memory unwritten when the thread's first position is already >= param_3.
// The index published for a thread that never accepts a candidate is
// therefore not defined by this code -- confirm whether the generating
// CUDA source initialises its index variable.
// NOTE(review): candidates use signed div/rem (div.s32 / rem.s32) while the
// exclusion list uses unsigned div/rem (div.u32 / rem.u32); results differ
// for indices >= 2^31 -- confirm that cannot occur.
// -----------------------------------------------------------------------
.visible .entry reduce_max_filter_final(
	.param .u64 reduce_max_filter_final_param_0,
	.param .u64 reduce_max_filter_final_param_1,
	.param .u64 reduce_max_filter_final_param_2,
	.param .u32 reduce_max_filter_final_param_3,
	.param .u32 reduce_max_filter_final_param_4,
	.param .u32 reduce_max_filter_final_param_5,
	.param .u64 reduce_max_filter_final_param_6,
	.param .u32 reduce_max_filter_final_param_7
)
{
	.reg .pred 	%p<78>;
	.reg .b32 	%r<173>;
	.reg .f64 	%fd<102>;
	.reg .b64 	%rd<134>;
	// Shared copy of the exclusion list, one u32 per entry:
	// smaxesRow = idx / param_4, smaxesCol = idx % param_4, smaxesVal = idx.
	// Each array is 40 bytes, i.e. 10 u32 slots.
	// NOTE(review): nothing below checks param_7 against these 10 slots.
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow[40];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol[40];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_final$__cuda_local_var_44585_30_non_const_smaxesVal[40];
	// Per-thread partial maximum: 2048 bytes = 256 f64 slots.
	// demoted variable
	.shared .align 8 .b8 reduce_max_filter_final$__cuda_local_var_44586_33_non_const_sdata[2048];
	// Per-thread index of that maximum: 1024 bytes = 256 u32 slots.
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_final$__cuda_local_var_44587_30_non_const_idxData[1024];

	// Register roles after the parameter loads:
	//   %rd18 = values, %rd19 = out values, %rd20 = indices (in/out),
	//   %r67 = count, %r68 = row/col divisor, %r69 = param_5,
	//   %rd21 = exclusion list, %r70 = list length, %r71 = %tid.x.
	ld.param.u64 	%rd18, [reduce_max_filter_final_param_0];
	ld.param.u64 	%rd19, [reduce_max_filter_final_param_1];
	ld.param.u64 	%rd20, [reduce_max_filter_final_param_2];
	ld.param.u32 	%r67, [reduce_max_filter_final_param_3];
	ld.param.u32 	%r68, [reduce_max_filter_final_param_4];
	ld.param.u32 	%r69, [reduce_max_filter_final_param_5];
	ld.param.u64 	%rd21, [reduce_max_filter_final_param_6];
	ld.param.u32 	%r70, [reduce_max_filter_final_param_7];
	mov.u32 	%r71, %tid.x;
	// Threads with tid >= param_7 (unsigned compare) skip the list copy.
	setp.ge.u32	%p1, %r71, %r70;
	@%p1 bra 	BB11_2;

	// Thread tid copies list entry tid into shared memory and precomputes
	// its row and column with unsigned division by param_4.
	cvta.to.global.u64 	%rd22, %rd21;
	mul.wide.u32 	%rd23, %r71, 4;
	add.s64 	%rd24, %rd22, %rd23;
	ld.global.u32 	%r73, [%rd24];
	mov.u64 	%rd25, reduce_max_filter_final$__cuda_local_var_44585_30_non_const_smaxesVal;
	add.s64 	%rd26, %rd25, %rd23;
	st.shared.u32 	[%rd26], %r73;
	div.u32 	%r74, %r73, %r68;
	mov.u64 	%rd27, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd28, %rd27, %rd23;
	st.shared.u32 	[%rd28], %r74;
	rem.u32 	%r75, %r73, %r68;
	mov.u64 	%rd29, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd30, %rd29, %rd23;
	st.shared.u32 	[%rd30], %r75;

BB11_2:
	// Barrier so the shared exclusion list is complete before it is read.
	bar.sync 	0;
	// %r77 = 2 * param_5; %r142 = i = ctaid.x * (2 * param_5) + tid.
	shl.b32 	%r77, %r69, 1;
	mov.u32 	%r78, %ctaid.x;
	mad.lo.s32 	%r142, %r78, %r77, %r71;
	// %fd95 = value published if the loop below is skipped entirely (0.0).
	mov.f64 	%fd95, 0d0000000000000000;
	setp.ge.u32	%p2, %r142, %r67;
	@%p2 bra 	BB11_22;

	// Two copies of the accumulation loop follow: BB11_10 applies the
	// exclusion list (param_7 > 0, signed compare), BB11_4 does not.
	setp.gt.s32	%p3, %r70, 0;
	mov.f64 	%fd101, 0d0000000000000000;
	mov.f64 	%fd98, %fd101;
	@%p3 bra 	BB11_10;
	bra.uni 	BB11_4;

BB11_10:
	// ---- Filtered loop head: %fd6 = running max, %r10 = its index. ----
	// NOTE(review): %r160 has no writer before the first pass through here.
	mov.f64 	%fd75, %fd98;
	mov.f64 	%fd6, %fd75;
	mov.u32 	%r148, %r160;
	mov.u32 	%r10, %r148;
	cvta.to.global.u64 	%rd43, %rd18;
	cvt.u64.u32	%rd3, %r142;
	mul.wide.u32 	%rd44, %r142, 8;
	add.s64 	%rd45, %rd43, %rd44;
	// %fd7 = values[i]; keep the current max unless max < values[i].
	ld.global.f64 	%fd7, [%rd45];
	setp.geu.f64	%p8, %fd6, %fd7;
	mov.u32 	%r158, %r10;
	mov.f64 	%fd96, %fd6;
	@%p8 bra 	BB11_15;

	// Candidate is larger: fetch its index and split it (signed div/rem)
	// into row %r13 and column %r14, then scan the list with j = %r143.
	cvta.to.global.u64 	%rd46, %rd20;
	shl.b64 	%rd47, %rd3, 2;
	add.s64 	%rd48, %rd46, %rd47;
	ld.global.u32 	%r12, [%rd48];
	div.s32 	%r13, %r12, %r68;
	rem.s32 	%r14, %r12, %r68;
	mov.u32 	%r143, 0;

BB11_12:
	// Row mismatch -> try the next list entry.
	cvt.s64.s32	%rd4, %r143;
	mul.wide.s32 	%rd49, %r143, 4;
	mov.u64 	%rd50, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd51, %rd50, %rd49;
	ld.shared.u32 	%r88, [%rd51];
	setp.ne.s32	%p9, %r88, %r13;
	@%p9 bra 	BB11_14;

	// Row matches; if the column matches too the candidate is excluded and
	// the previous max / index (%fd6 / %r10) are carried forward.
	shl.b64 	%rd52, %rd4, 2;
	mov.u64 	%rd53, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd54, %rd53, %rd52;
	ld.shared.u32 	%r89, [%rd54];
	setp.eq.s32	%p10, %r89, %r14;
	mov.u32 	%r149, %r10;
	mov.u32 	%r158, %r149;
	mov.f64 	%fd76, %fd6;
	mov.f64 	%fd96, %fd76;
	@%p10 bra 	BB11_15;

BB11_14:
	// j++; the candidate is staged in %r158 / %fd96 so that falling out of
	// the loop (no list entry matched) accepts it.
	add.s32 	%r143, %r143, 1;
	setp.lt.s32	%p11, %r143, %r70;
	mov.u32 	%r158, %r12;
	mov.f64 	%fd96, %fd7;
	@%p11 bra 	BB11_12;

BB11_15:
	// Second element of this iteration: position i + param_5, handled the
	// same way if it is below param_3.
	mov.f64 	%fd8, %fd96;
	mov.u32 	%r17, %r158;
	add.s32 	%r18, %r142, %r69;
	setp.ge.u32	%p12, %r18, %r67;
	mov.u32 	%r159, %r17;
	mov.f64 	%fd97, %fd8;
	@%p12 bra 	BB11_21;

	cvt.u64.u32	%rd5, %r18;
	mul.wide.u32 	%rd56, %r18, 8;
	add.s64 	%rd57, %rd43, %rd56;
	ld.global.f64 	%fd9, [%rd57];
	setp.geu.f64	%p13, %fd8, %fd9;
	mov.u32 	%r146, %r17;
	mov.u32 	%r159, %r146;
	mov.f64 	%fd73, %fd8;
	mov.f64 	%fd97, %fd73;
	@%p13 bra 	BB11_21;

	// Larger candidate: index -> (row %r20, col %r21), scan list with %r144.
	cvta.to.global.u64 	%rd58, %rd20;
	shl.b64 	%rd59, %rd5, 2;
	add.s64 	%rd60, %rd58, %rd59;
	ld.global.u32 	%r19, [%rd60];
	div.s32 	%r20, %r19, %r68;
	rem.s32 	%r21, %r19, %r68;
	mov.u32 	%r144, 0;

BB11_18:
	cvt.s64.s32	%rd6, %r144;
	mul.wide.s32 	%rd61, %r144, 4;
	mov.u64 	%rd62, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd63, %rd62, %rd61;
	ld.shared.u32 	%r91, [%rd63];
	setp.ne.s32	%p14, %r91, %r20;
	@%p14 bra 	BB11_20;

	// Row and column both match -> excluded, keep %fd8 / %r17.
	shl.b64 	%rd64, %rd6, 2;
	mov.u64 	%rd65, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd66, %rd65, %rd64;
	ld.shared.u32 	%r92, [%rd66];
	setp.eq.s32	%p15, %r92, %r21;
	mov.u32 	%r147, %r17;
	mov.u32 	%r159, %r147;
	mov.f64 	%fd74, %fd8;
	mov.f64 	%fd97, %fd74;
	@%p15 bra 	BB11_21;

BB11_20:
	add.s32 	%r144, %r144, 1;
	setp.lt.s32	%p16, %r144, %r70;
	mov.u32 	%r159, %r19;
	mov.f64 	%fd97, %fd9;
	@%p16 bra 	BB11_18;

BB11_21:
	// Commit this iteration's result and advance i by
	// nctaid.x * (2 * param_5); loop while i < param_3.
	mov.f64 	%fd98, %fd97;
	mov.u32 	%r160, %r159;
	mov.u32 	%r94, %nctaid.x;
	mad.lo.s32 	%r142, %r94, %r77, %r142;
	setp.lt.u32	%p17, %r142, %r67;
	mov.u32 	%r157, %r160;
	mov.f64 	%fd95, %fd98;
	@%p17 bra 	BB11_10;
	bra.uni 	BB11_22;

BB11_4:
	// ---- Unfiltered loop head (param_7 <= 0): plain max + index. ----
	// NOTE(review): %r163 has no writer before the first pass through here.
	mov.f64 	%fd81, %fd101;
	mov.f64 	%fd1, %fd81;
	mov.u32 	%r154, %r163;
	mov.u32 	%r161, %r154;
	cvta.to.global.u64 	%rd31, %rd18;
	cvt.u64.u32	%rd1, %r142;
	mul.wide.u32 	%rd32, %r142, 8;
	add.s64 	%rd33, %rd31, %rd32;
	ld.global.f64 	%fd2, [%rd33];
	setp.geu.f64	%p4, %fd1, %fd2;
	mov.f64 	%fd99, %fd1;
	@%p4 bra 	BB11_6;

	// values[i] is larger: take it together with indices[i].
	cvta.to.global.u64 	%rd34, %rd20;
	shl.b64 	%rd35, %rd1, 2;
	add.s64 	%rd36, %rd34, %rd35;
	ld.global.u32 	%r161, [%rd36];
	mov.f64 	%fd99, %fd2;

BB11_6:
	// Second element at i + param_5, if below param_3.
	mov.f64 	%fd3, %fd99;
	mov.u32 	%r162, %r161;
	add.s32 	%r6, %r142, %r69;
	setp.ge.u32	%p5, %r6, %r67;
	mov.f64 	%fd100, %fd3;
	@%p5 bra 	BB11_9;

	cvt.u64.u32	%rd2, %r6;
	mul.wide.u32 	%rd38, %r6, 8;
	add.s64 	%rd39, %rd31, %rd38;
	ld.global.f64 	%fd4, [%rd39];
	setp.geu.f64	%p6, %fd3, %fd4;
	mov.f64 	%fd80, %fd3;
	mov.f64 	%fd100, %fd80;
	@%p6 bra 	BB11_9;

	cvta.to.global.u64 	%rd40, %rd20;
	shl.b64 	%rd41, %rd2, 2;
	add.s64 	%rd42, %rd40, %rd41;
	ld.global.u32 	%r162, [%rd42];
	mov.f64 	%fd100, %fd4;

BB11_9:
	// Advance i by nctaid.x * (2 * param_5); loop while i < param_3.
	mov.f64 	%fd101, %fd100;
	mov.u32 	%r163, %r162;
	mov.u32 	%r86, %nctaid.x;
	mad.lo.s32 	%r142, %r86, %r77, %r142;
	setp.lt.u32	%p7, %r142, %r67;
	mov.u32 	%r157, %r163;
	mov.f64 	%fd95, %fd101;
	@%p7 bra 	BB11_4;

BB11_22:
	// Publish the thread's partial result: sdata[tid] = max (%rd7 points at
	// it), idxData[tid] = index (%rd8 points at it), then barrier.
	// NOTE(review): %r157 is unwritten when the branch at the top
	// (i >= param_3) was taken.
	mov.f64 	%fd93, %fd95;
	mul.wide.u32 	%rd67, %r71, 8;
	mov.u64 	%rd68, reduce_max_filter_final$__cuda_local_var_44586_33_non_const_sdata;
	add.s64 	%rd7, %rd68, %rd67;
	st.shared.f64 	[%rd7], %fd93;
	mul.wide.u32 	%rd69, %r71, 4;
	mov.u64 	%rd70, reduce_max_filter_final$__cuda_local_var_44587_30_non_const_idxData;
	add.s64 	%rd8, %rd70, %rd69;
	st.shared.u32 	[%rd8], %r157;
	bar.sync 	0;
	// ---- Stage param_5 >= 512: threads 0..255 fold in slot tid + 256. ----
	// NOTE(review): the loads below address sdata + 2048 and idxData + 1024
	// bytes, which is past the declared 2048 / 1024-byte arrays; this stage
	// looks usable only if param_5 is never >= 512 -- confirm with callers.
	setp.lt.s32	%p18, %r69, 512;
	@%p18 bra 	BB11_31;

	setp.gt.u32	%p19, %r71, 255;
	mov.f64 	%fd94, %fd93;
	@%p19 bra 	BB11_30;

	ld.shared.f64 	%fd12, [%rd7+2048];
	setp.geu.f64	%p20, %fd93, %fd12;
	mov.f64 	%fd70, %fd93;
	mov.f64 	%fd94, %fd70;
	@%p20 bra 	BB11_30;

	// Partner value is larger: split its index into row %r28 / col %r29 and
	// re-check it against the exclusion list. With an empty list
	// (param_7 < 1) it is accepted directly.
	ld.shared.u32 	%r27, [%rd8+1024];
	div.s32 	%r28, %r27, %r68;
	rem.s32 	%r29, %r27, %r68;
	mov.u32 	%r164, 0;
	setp.lt.s32	%p21, %r70, 1;
	@%p21 bra 	BB11_29;

BB11_26:
	cvt.s64.s32	%rd9, %r164;
	mul.wide.s32 	%rd71, %r164, 4;
	mov.u64 	%rd72, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd73, %rd72, %rd71;
	ld.shared.u32 	%r98, [%rd73];
	setp.ne.s32	%p22, %r98, %r28;
	@%p22 bra 	BB11_28;

	// Row and column both match -> partner excluded, keep own value.
	shl.b64 	%rd74, %rd9, 2;
	mov.u64 	%rd75, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd76, %rd75, %rd74;
	ld.shared.u32 	%r99, [%rd76];
	setp.eq.s32	%p23, %r99, %r29;
	mov.f64 	%fd71, %fd93;
	mov.f64 	%fd94, %fd71;
	@%p23 bra 	BB11_30;

BB11_28:
	add.s32 	%r164, %r164, 1;
	setp.lt.s32	%p24, %r164, %r70;
	@%p24 bra 	BB11_26;

BB11_29:
	// No list entry matched: adopt the partner's value and index.
	st.shared.f64 	[%rd7], %fd12;
	st.shared.u32 	[%rd8], %r27;
	mov.f64 	%fd94, %fd12;

BB11_30:
	mov.f64 	%fd93, %fd94;
	bar.sync 	0;

BB11_31:
	// ---- Stage param_5 >= 256: threads 0..127 fold in slot tid + 128 ----
	// (byte offsets +1024 in sdata, +512 in idxData). Same accept / exclude
	// logic as the 512 stage.
	mov.f64 	%fd91, %fd93;
	setp.lt.s32	%p25, %r69, 256;
	@%p25 bra 	BB11_40;

	setp.gt.u32	%p26, %r71, 127;
	mov.f64 	%fd92, %fd91;
	@%p26 bra 	BB11_39;

	ld.shared.f64 	%fd15, [%rd7+1024];
	setp.geu.f64	%p27, %fd91, %fd15;
	mov.f64 	%fd65, %fd91;
	mov.f64 	%fd92, %fd65;
	@%p27 bra 	BB11_39;

	ld.shared.u32 	%r32, [%rd8+512];
	div.s32 	%r33, %r32, %r68;
	rem.s32 	%r34, %r32, %r68;
	mov.u32 	%r165, 0;
	setp.lt.s32	%p28, %r70, 1;
	@%p28 bra 	BB11_38;

BB11_35:
	cvt.s64.s32	%rd10, %r165;
	mul.wide.s32 	%rd77, %r165, 4;
	mov.u64 	%rd78, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd79, %rd78, %rd77;
	ld.shared.u32 	%r102, [%rd79];
	setp.ne.s32	%p29, %r102, %r33;
	@%p29 bra 	BB11_37;

	shl.b64 	%rd80, %rd10, 2;
	mov.u64 	%rd81, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd82, %rd81, %rd80;
	ld.shared.u32 	%r103, [%rd82];
	setp.eq.s32	%p30, %r103, %r34;
	mov.f64 	%fd66, %fd91;
	mov.f64 	%fd92, %fd66;
	@%p30 bra 	BB11_39;

BB11_37:
	add.s32 	%r165, %r165, 1;
	setp.lt.s32	%p31, %r165, %r70;
	@%p31 bra 	BB11_35;

BB11_38:
	st.shared.f64 	[%rd7], %fd15;
	st.shared.u32 	[%rd8], %r32;
	mov.f64 	%fd92, %fd15;

BB11_39:
	mov.f64 	%fd91, %fd92;
	bar.sync 	0;

BB11_40:
	// ---- Stage param_5 >= 128: threads 0..63 fold in slot tid + 64 ----
	// (byte offsets +512 in sdata, +256 in idxData).
	mov.f64 	%fd89, %fd91;
	setp.lt.s32	%p32, %r69, 128;
	@%p32 bra 	BB11_49;

	setp.gt.u32	%p33, %r71, 63;
	mov.f64 	%fd90, %fd89;
	@%p33 bra 	BB11_48;

	ld.shared.f64 	%fd18, [%rd7+512];
	setp.geu.f64	%p34, %fd89, %fd18;
	mov.f64 	%fd60, %fd89;
	mov.f64 	%fd90, %fd60;
	@%p34 bra 	BB11_48;

	ld.shared.u32 	%r37, [%rd8+256];
	div.s32 	%r38, %r37, %r68;
	rem.s32 	%r39, %r37, %r68;
	mov.u32 	%r166, 0;
	setp.lt.s32	%p35, %r70, 1;
	@%p35 bra 	BB11_47;

BB11_44:
	cvt.s64.s32	%rd11, %r166;
	mul.wide.s32 	%rd83, %r166, 4;
	mov.u64 	%rd84, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd85, %rd84, %rd83;
	ld.shared.u32 	%r106, [%rd85];
	setp.ne.s32	%p36, %r106, %r38;
	@%p36 bra 	BB11_46;

	shl.b64 	%rd86, %rd11, 2;
	mov.u64 	%rd87, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd88, %rd87, %rd86;
	ld.shared.u32 	%r107, [%rd88];
	setp.eq.s32	%p37, %r107, %r39;
	mov.f64 	%fd61, %fd89;
	mov.f64 	%fd90, %fd61;
	@%p37 bra 	BB11_48;

BB11_46:
	add.s32 	%r166, %r166, 1;
	setp.lt.s32	%p38, %r166, %r70;
	@%p38 bra 	BB11_44;

BB11_47:
	st.shared.f64 	[%rd7], %fd18;
	st.shared.u32 	[%rd8], %r37;
	mov.f64 	%fd90, %fd18;

BB11_48:
	mov.f64 	%fd89, %fd90;
	bar.sync 	0;

BB11_49:
	// ---- Remaining stages run only for threads 0..31. ----
	// From here to BB11_92 there is no barrier between stages and every
	// shared access is ld/st.volatile; presumably this relies on those 32
	// threads being one warp executing in lockstep -- TODO confirm that
	// assumption holds on the targeted architectures.
	mov.f64 	%fd20, %fd89;
	setp.gt.u32	%p39, %r71, 31;
	@%p39 bra 	BB11_93;

	// Stage param_5 >= 64: fold in slot tid + 32 (sdata +256, idxData +128).
	setp.lt.s32	%p40, %r69, 64;
	mov.f64 	%fd88, %fd20;
	@%p40 bra 	BB11_57;

	ld.volatile.shared.f64 	%fd34, [%rd7+256];
	setp.geu.f64	%p41, %fd20, %fd34;
	mov.f64 	%fd55, %fd20;
	mov.f64 	%fd88, %fd55;
	@%p41 bra 	BB11_57;

	ld.volatile.shared.u32 	%r110, [%rd8+128];
	div.s32 	%r42, %r110, %r68;
	rem.s32 	%r43, %r110, %r68;
	mov.u32 	%r167, 0;
	setp.lt.s32	%p42, %r70, 1;
	@%p42 bra 	BB11_56;

BB11_53:
	cvt.s64.s32	%rd12, %r167;
	mul.wide.s32 	%rd89, %r167, 4;
	mov.u64 	%rd90, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd91, %rd90, %rd89;
	ld.volatile.shared.u32 	%r111, [%rd91];
	setp.ne.s32	%p43, %r111, %r42;
	@%p43 bra 	BB11_55;

	shl.b64 	%rd92, %rd12, 2;
	mov.u64 	%rd93, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd94, %rd93, %rd92;
	ld.volatile.shared.u32 	%r112, [%rd94];
	setp.eq.s32	%p44, %r112, %r43;
	mov.f64 	%fd56, %fd20;
	mov.f64 	%fd88, %fd56;
	@%p44 bra 	BB11_57;

BB11_55:
	add.s32 	%r167, %r167, 1;
	setp.lt.s32	%p45, %r167, %r70;
	@%p45 bra 	BB11_53;

BB11_56:
	// Accept: the partner's value and index are re-read from shared memory
	// (volatile) rather than reused from the registers loaded above.
	ld.volatile.shared.f64 	%fd88, [%rd7+256];
	st.volatile.shared.f64 	[%rd7], %fd88;
	ld.volatile.shared.u32 	%r113, [%rd8+128];
	st.volatile.shared.u32 	[%rd8], %r113;

BB11_57:
	// Stage param_5 >= 32: fold in slot tid + 16 (sdata +128, idxData +64).
	mov.f64 	%fd22, %fd88;
	setp.lt.s32	%p46, %r69, 32;
	mov.f64 	%fd87, %fd22;
	@%p46 bra 	BB11_64;

	ld.volatile.shared.f64 	%fd35, [%rd7+128];
	setp.geu.f64	%p47, %fd22, %fd35;
	mov.f64 	%fd52, %fd22;
	mov.f64 	%fd87, %fd52;
	@%p47 bra 	BB11_64;

	ld.volatile.shared.u32 	%r115, [%rd8+64];
	div.s32 	%r46, %r115, %r68;
	rem.s32 	%r47, %r115, %r68;
	mov.u32 	%r168, 0;
	setp.lt.s32	%p48, %r70, 1;
	@%p48 bra 	BB11_63;

BB11_60:
	cvt.s64.s32	%rd13, %r168;
	mul.wide.s32 	%rd95, %r168, 4;
	mov.u64 	%rd96, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd97, %rd96, %rd95;
	ld.volatile.shared.u32 	%r116, [%rd97];
	setp.ne.s32	%p49, %r116, %r46;
	@%p49 bra 	BB11_62;

	shl.b64 	%rd98, %rd13, 2;
	mov.u64 	%rd99, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd100, %rd99, %rd98;
	ld.volatile.shared.u32 	%r117, [%rd100];
	setp.eq.s32	%p50, %r117, %r47;
	mov.f64 	%fd53, %fd22;
	mov.f64 	%fd87, %fd53;
	@%p50 bra 	BB11_64;

BB11_62:
	add.s32 	%r168, %r168, 1;
	setp.lt.s32	%p51, %r168, %r70;
	@%p51 bra 	BB11_60;

BB11_63:
	ld.volatile.shared.f64 	%fd87, [%rd7+128];
	st.volatile.shared.f64 	[%rd7], %fd87;
	ld.volatile.shared.u32 	%r118, [%rd8+64];
	st.volatile.shared.u32 	[%rd8], %r118;

BB11_64:
	// Stage param_5 >= 16: fold in slot tid + 8 (sdata +64, idxData +32).
	mov.f64 	%fd24, %fd87;
	setp.lt.s32	%p52, %r69, 16;
	mov.f64 	%fd86, %fd24;
	@%p52 bra 	BB11_71;

	ld.volatile.shared.f64 	%fd36, [%rd7+64];
	setp.geu.f64	%p53, %fd24, %fd36;
	mov.f64 	%fd49, %fd24;
	mov.f64 	%fd86, %fd49;
	@%p53 bra 	BB11_71;

	ld.volatile.shared.u32 	%r120, [%rd8+32];
	div.s32 	%r50, %r120, %r68;
	rem.s32 	%r51, %r120, %r68;
	mov.u32 	%r169, 0;
	setp.lt.s32	%p54, %r70, 1;
	@%p54 bra 	BB11_70;

BB11_67:
	cvt.s64.s32	%rd14, %r169;
	mul.wide.s32 	%rd101, %r169, 4;
	mov.u64 	%rd102, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd103, %rd102, %rd101;
	ld.volatile.shared.u32 	%r121, [%rd103];
	setp.ne.s32	%p55, %r121, %r50;
	@%p55 bra 	BB11_69;

	shl.b64 	%rd104, %rd14, 2;
	mov.u64 	%rd105, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd106, %rd105, %rd104;
	ld.volatile.shared.u32 	%r122, [%rd106];
	setp.eq.s32	%p56, %r122, %r51;
	mov.f64 	%fd50, %fd24;
	mov.f64 	%fd86, %fd50;
	@%p56 bra 	BB11_71;

BB11_69:
	add.s32 	%r169, %r169, 1;
	setp.lt.s32	%p57, %r169, %r70;
	@%p57 bra 	BB11_67;

BB11_70:
	ld.volatile.shared.f64 	%fd86, [%rd7+64];
	st.volatile.shared.f64 	[%rd7], %fd86;
	ld.volatile.shared.u32 	%r123, [%rd8+32];
	st.volatile.shared.u32 	[%rd8], %r123;

BB11_71:
	// Stage param_5 >= 8: fold in slot tid + 4 (sdata +32, idxData +16).
	mov.f64 	%fd26, %fd86;
	setp.lt.s32	%p58, %r69, 8;
	mov.f64 	%fd85, %fd26;
	@%p58 bra 	BB11_78;

	ld.volatile.shared.f64 	%fd37, [%rd7+32];
	setp.geu.f64	%p59, %fd26, %fd37;
	mov.f64 	%fd46, %fd26;
	mov.f64 	%fd85, %fd46;
	@%p59 bra 	BB11_78;

	ld.volatile.shared.u32 	%r125, [%rd8+16];
	div.s32 	%r54, %r125, %r68;
	rem.s32 	%r55, %r125, %r68;
	mov.u32 	%r170, 0;
	setp.lt.s32	%p60, %r70, 1;
	@%p60 bra 	BB11_77;

BB11_74:
	cvt.s64.s32	%rd15, %r170;
	mul.wide.s32 	%rd107, %r170, 4;
	mov.u64 	%rd108, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd109, %rd108, %rd107;
	ld.volatile.shared.u32 	%r126, [%rd109];
	setp.ne.s32	%p61, %r126, %r54;
	@%p61 bra 	BB11_76;

	shl.b64 	%rd110, %rd15, 2;
	mov.u64 	%rd111, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd112, %rd111, %rd110;
	ld.volatile.shared.u32 	%r127, [%rd112];
	setp.eq.s32	%p62, %r127, %r55;
	mov.f64 	%fd47, %fd26;
	mov.f64 	%fd85, %fd47;
	@%p62 bra 	BB11_78;

BB11_76:
	add.s32 	%r170, %r170, 1;
	setp.lt.s32	%p63, %r170, %r70;
	@%p63 bra 	BB11_74;

BB11_77:
	ld.volatile.shared.f64 	%fd85, [%rd7+32];
	st.volatile.shared.f64 	[%rd7], %fd85;
	ld.volatile.shared.u32 	%r128, [%rd8+16];
	st.volatile.shared.u32 	[%rd8], %r128;

BB11_78:
	// Stage param_5 >= 4: fold in slot tid + 2 (sdata +16, idxData +8).
	mov.f64 	%fd28, %fd85;
	setp.lt.s32	%p64, %r69, 4;
	mov.f64 	%fd84, %fd28;
	@%p64 bra 	BB11_85;

	ld.volatile.shared.f64 	%fd38, [%rd7+16];
	setp.geu.f64	%p65, %fd28, %fd38;
	mov.f64 	%fd43, %fd28;
	mov.f64 	%fd84, %fd43;
	@%p65 bra 	BB11_85;

	ld.volatile.shared.u32 	%r130, [%rd8+8];
	div.s32 	%r58, %r130, %r68;
	rem.s32 	%r59, %r130, %r68;
	mov.u32 	%r171, 0;
	setp.lt.s32	%p66, %r70, 1;
	@%p66 bra 	BB11_84;

BB11_81:
	cvt.s64.s32	%rd16, %r171;
	mul.wide.s32 	%rd113, %r171, 4;
	mov.u64 	%rd114, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd115, %rd114, %rd113;
	ld.volatile.shared.u32 	%r131, [%rd115];
	setp.ne.s32	%p67, %r131, %r58;
	@%p67 bra 	BB11_83;

	shl.b64 	%rd116, %rd16, 2;
	mov.u64 	%rd117, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd118, %rd117, %rd116;
	ld.volatile.shared.u32 	%r132, [%rd118];
	setp.eq.s32	%p68, %r132, %r59;
	mov.f64 	%fd44, %fd28;
	mov.f64 	%fd84, %fd44;
	@%p68 bra 	BB11_85;

BB11_83:
	add.s32 	%r171, %r171, 1;
	setp.lt.s32	%p69, %r171, %r70;
	@%p69 bra 	BB11_81;

BB11_84:
	ld.volatile.shared.f64 	%fd84, [%rd7+16];
	st.volatile.shared.f64 	[%rd7], %fd84;
	ld.volatile.shared.u32 	%r133, [%rd8+8];
	st.volatile.shared.u32 	[%rd8], %r133;

BB11_85:
	// Stage param_5 >= 2: fold in slot tid + 1 (sdata +8, idxData +4).
	setp.lt.s32	%p70, %r69, 2;
	@%p70 bra 	BB11_92;

	ld.volatile.shared.f64 	%fd39, [%rd7+8];
	setp.geu.f64	%p71, %fd84, %fd39;
	@%p71 bra 	BB11_92;

	ld.volatile.shared.u32 	%r135, [%rd8+4];
	div.s32 	%r62, %r135, %r68;
	rem.s32 	%r63, %r135, %r68;
	mov.u32 	%r172, 0;
	setp.lt.s32	%p72, %r70, 1;
	@%p72 bra 	BB11_91;

BB11_88:
	cvt.s64.s32	%rd17, %r172;
	mul.wide.s32 	%rd119, %r172, 4;
	mov.u64 	%rd120, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow;
	add.s64 	%rd121, %rd120, %rd119;
	ld.volatile.shared.u32 	%r136, [%rd121];
	setp.ne.s32	%p73, %r136, %r62;
	@%p73 bra 	BB11_90;

	shl.b64 	%rd122, %rd17, 2;
	mov.u64 	%rd123, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol;
	add.s64 	%rd124, %rd123, %rd122;
	ld.volatile.shared.u32 	%r137, [%rd124];
	setp.eq.s32	%p74, %r137, %r63;
	@%p74 bra 	BB11_92;

BB11_90:
	add.s32 	%r172, %r172, 1;
	setp.lt.s32	%p75, %r172, %r70;
	@%p75 bra 	BB11_88;

BB11_91:
	ld.volatile.shared.f64 	%fd40, [%rd7+8];
	st.volatile.shared.f64 	[%rd7], %fd40;
	ld.volatile.shared.u32 	%r138, [%rd8+4];
	st.volatile.shared.u32 	[%rd8], %r138;

BB11_92:
	// NOTE(review): threads with tid > 31 branched from BB11_49 straight to
	// BB11_93, so this barrier is executed only by threads 0..31 -- confirm
	// that a barrier not reached by every thread of the CTA is acceptable on
	// the targeted architectures.
	bar.sync 	0;

BB11_93:
	// ---- Write-back: only thread 0 continues. ----
	setp.ne.s32	%p76, %r71, 0;
	@%p76 bra 	BB11_96;

	// param_1[ctaid.x] = sdata[0]; param_2[ctaid.x] = idxData[0].
	ld.shared.f64 	%fd41, [reduce_max_filter_final$__cuda_local_var_44586_33_non_const_sdata];
	cvta.to.global.u64 	%rd125, %rd19;
	mul.wide.u32 	%rd126, %r78, 8;
	add.s64 	%rd127, %rd125, %rd126;
	st.global.f64 	[%rd127], %fd41;
	ld.shared.u32 	%r66, [reduce_max_filter_final$__cuda_local_var_44587_30_non_const_idxData];
	cvta.to.global.u64 	%rd128, %rd20;
	mul.wide.u32 	%rd129, %r78, 4;
	add.s64 	%rd130, %rd128, %rd129;
	st.global.u32 	[%rd130], %r66;
	// Only when the grid consists of exactly one CTA: store the winning
	// index into the exclusion list at element param_7 (signed widening).
	mov.u32 	%r141, %nctaid.x;
	setp.ne.s32	%p77, %r141, 1;
	@%p77 bra 	BB11_96;

	cvta.to.global.u64 	%rd131, %rd21;
	mul.wide.s32 	%rd132, %r70, 4;
	add.s64 	%rd133, %rd131, %rd132;
	st.global.u32 	[%rd133], %r66;

BB11_96:
	ret;
}

	// .globl	reduce_max_filter_main
.visible .entry reduce_max_filter_main(
	.param .u64 reduce_max_filter_main_param_0,
	.param .u64 reduce_max_filter_main_param_1,
	.param .u64 reduce_max_filter_main_param_2,
	.param .u32 reduce_max_filter_main_param_3,
	.param .u32 reduce_max_filter_main_param_4,
	.param .u32 reduce_max_filter_main_param_5,
	.param .u64 reduce_max_filter_main_param_6,
	.param .u32 reduce_max_filter_main_param_7
)
{
	.reg .pred 	%p<87>;
	.reg .b32 	%r<147>;
	.reg .f64 	%fd<88>;
	.reg .b64 	%rd<112>;
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow[40];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol[40];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_main$__cuda_local_var_44797_30_non_const_smaxesVal[40];
	// demoted variable
	.shared .align 8 .b8 reduce_max_filter_main$__cuda_local_var_44798_33_non_const_sdata[2048];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_main$__cuda_local_var_44799_30_non_const_idxData[1024];

	ld.param.u64 	%rd14, [reduce_max_filter_main_param_0];
	ld.param.u64 	%rd15, [reduce_max_filter_main_param_1];
	ld.param.u64 	%rd16, [reduce_max_filter_main_param_2];
	ld.param.u32 	%r60, [reduce_max_filter_main_param_3];
	ld.param.u32 	%r61, [reduce_max_filter_main_param_4];
	ld.param.u32 	%r62, [reduce_max_filter_main_param_5];
	ld.param.u64 	%rd17, [reduce_max_filter_main_param_6];
	ld.param.u32 	%r63, [reduce_max_filter_main_param_7];
	mov.u32 	%r1, %tid.x;
	setp.ge.u32	%p3, %r1, %r63;
	@%p3 bra 	BB12_2;

	cvta.to.global.u64 	%rd18, %rd17;
	mul.wide.u32 	%rd19, %r1, 4;
	add.s64 	%rd20, %rd18, %rd19;
	ld.global.u32 	%r65, [%rd20];
	mov.u64 	%rd21, reduce_max_filter_main$__cuda_local_var_44797_30_non_const_smaxesVal;
	add.s64 	%rd22, %rd21, %rd19;
	st.shared.u32 	[%rd22], %r65;
	div.u32 	%r66, %r65, %r60;
	mov.u64 	%rd23, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd24, %rd23, %rd19;
	st.shared.u32 	[%rd24], %r66;
	rem.u32 	%r67, %r65, %r60;
	mov.u64 	%rd25, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd26, %rd25, %rd19;
	st.shared.u32 	[%rd26], %r67;

BB12_2:
	mov.u32 	%r68, %ctaid.x;
	mad.lo.s32 	%r134, %r68, %r62, %r1;
	bar.sync 	0;
	mul.lo.s32 	%r70, %r61, %r60;
	mov.f64 	%fd84, 0dFFF0000000000000;
	setp.ge.u32	%p4, %r134, %r70;
	@%p4 bra 	BB12_18;

	mov.f64 	%fd87, 0dFFF0000000000000;

BB12_4:
	mov.f64 	%fd70, %fd87;
	mov.f64 	%fd85, %fd70;
	cvta.to.global.u64 	%rd27, %rd14;
	mul.wide.u32 	%rd28, %r134, 8;
	add.s64 	%rd29, %rd27, %rd28;
	ld.global.f64 	%fd2, [%rd29];
	setp.geu.f64	%p5, %fd85, %fd2;
	@%p5 bra 	BB12_10;

	div.s32 	%r6, %r134, %r60;
	rem.s32 	%r7, %r134, %r60;
	mov.pred 	%p6, -1;
	mov.u32 	%r135, 0;
	setp.lt.s32	%p7, %r63, 1;
	mov.pred 	%p84, %p6;
	@%p7 bra 	BB12_9;

BB12_6:
	cvt.s64.s32	%rd1, %r135;
	mul.wide.s32 	%rd30, %r135, 4;
	mov.u64 	%rd31, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd32, %rd31, %rd30;
	ld.shared.u32 	%r75, [%rd32];
	setp.ne.s32	%p8, %r75, %r6;
	@%p8 bra 	BB12_8;

	shl.b64 	%rd33, %rd1, 2;
	mov.u64 	%rd34, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd35, %rd34, %rd33;
	ld.shared.u32 	%r76, [%rd35];
	setp.eq.s32	%p10, %r76, %r7;
	mov.pred 	%p9, 0;
	mov.pred 	%p84, %p9;
	@%p10 bra 	BB12_9;

BB12_8:
	add.s32 	%r135, %r135, 1;
	setp.lt.s32	%p12, %r135, %r63;
	mov.pred 	%p83, %p6;
	mov.pred 	%p84, %p83;
	@%p12 bra 	BB12_6;

BB12_9:
	selp.f64	%fd85, %fd2, %fd85, %p84;
	selp.b32	%r137, %r134, %r137, %p84;

BB12_10:
	mov.f64 	%fd4, %fd85;
	add.s32 	%r78, %r134, %r62;
	setp.ge.u32	%p13, %r78, %r70;
	mov.f64 	%fd86, %fd4;
	@%p13 bra 	BB12_17;

	mul.wide.u32 	%rd37, %r78, 8;
	add.s64 	%rd38, %rd27, %rd37;
	ld.global.f64 	%fd5, [%rd38];
	setp.geu.f64	%p14, %fd4, %fd5;
	mov.f64 	%fd69, %fd4;
	mov.f64 	%fd86, %fd69;
	@%p14 bra 	BB12_17;

	div.s32 	%r12, %r78, %r60;
	rem.s32 	%r13, %r78, %r60;
	mov.pred 	%p15, -1;
	mov.u32 	%r136, 0;
	setp.lt.s32	%p16, %r63, 1;
	mov.pred 	%p86, %p15;
	@%p16 bra 	BB12_16;

BB12_13:
	cvt.s64.s32	%rd2, %r136;
	mul.wide.s32 	%rd39, %r136, 4;
	mov.u64 	%rd40, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd41, %rd40, %rd39;
	ld.shared.u32 	%r82, [%rd41];
	setp.ne.s32	%p17, %r82, %r12;
	@%p17 bra 	BB12_15;

	shl.b64 	%rd42, %rd2, 2;
	mov.u64 	%rd43, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd44, %rd43, %rd42;
	ld.shared.u32 	%r83, [%rd44];
	setp.eq.s32	%p19, %r83, %r13;
	mov.pred 	%p18, 0;
	mov.pred 	%p86, %p18;
	@%p19 bra 	BB12_16;

BB12_15:
	add.s32 	%r136, %r136, 1;
	setp.lt.s32	%p21, %r136, %r63;
	mov.pred 	%p85, %p15;
	mov.pred 	%p86, %p85;
	@%p21 bra 	BB12_13;

BB12_16:
	selp.f64	%fd86, %fd5, %fd4, %p86;
	selp.b32	%r137, %r78, %r137, %p86;

BB12_17:
	mov.f64 	%fd87, %fd86;
	mov.u32 	%r85, %nctaid.x;
	mad.lo.s32 	%r134, %r85, %r62, %r134;
	setp.lt.u32	%p22, %r134, %r70;
	mov.f64 	%fd84, %fd87;
	@%p22 bra 	BB12_4;

BB12_18:
	mov.f64 	%fd82, %fd84;
	mul.wide.u32 	%rd45, %r1, 8;
	mov.u64 	%rd46, reduce_max_filter_main$__cuda_local_var_44798_33_non_const_sdata;
	add.s64 	%rd3, %rd46, %rd45;
	st.shared.f64 	[%rd3], %fd82;
	mul.wide.u32 	%rd47, %r1, 4;
	mov.u64 	%rd48, reduce_max_filter_main$__cuda_local_var_44799_30_non_const_idxData;
	add.s64 	%rd4, %rd48, %rd47;
	st.shared.u32 	[%rd4], %r137;
	bar.sync 	0;
	setp.lt.s32	%p23, %r62, 512;
	@%p23 bra 	BB12_27;

	setp.gt.u32	%p24, %r1, 255;
	mov.f64 	%fd83, %fd82;
	@%p24 bra 	BB12_26;

	ld.shared.f64 	%fd9, [%rd3+2048];
	setp.geu.f64	%p25, %fd82, %fd9;
	mov.f64 	%fd66, %fd82;
	mov.f64 	%fd83, %fd66;
	@%p25 bra 	BB12_26;

	ld.shared.u32 	%r20, [%rd4+1024];
	div.s32 	%r21, %r20, %r60;
	rem.s32 	%r22, %r20, %r60;
	mov.u32 	%r138, 0;
	setp.lt.s32	%p26, %r63, 1;
	@%p26 bra 	BB12_25;

BB12_22:
	cvt.s64.s32	%rd5, %r138;
	mul.wide.s32 	%rd49, %r138, 4;
	mov.u64 	%rd50, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd51, %rd50, %rd49;
	ld.shared.u32 	%r90, [%rd51];
	setp.ne.s32	%p27, %r90, %r21;
	@%p27 bra 	BB12_24;

	shl.b64 	%rd52, %rd5, 2;
	mov.u64 	%rd53, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd54, %rd53, %rd52;
	ld.shared.u32 	%r91, [%rd54];
	setp.eq.s32	%p28, %r91, %r22;
	mov.f64 	%fd67, %fd82;
	mov.f64 	%fd83, %fd67;
	@%p28 bra 	BB12_26;

BB12_24:
	add.s32 	%r138, %r138, 1;
	setp.lt.s32	%p29, %r138, %r63;
	@%p29 bra 	BB12_22;

BB12_25:
	st.shared.f64 	[%rd3], %fd9;
	st.shared.u32 	[%rd4], %r20;
	mov.f64 	%fd83, %fd9;

BB12_26:
	mov.f64 	%fd82, %fd83;
	bar.sync 	0;

BB12_27:
	mov.f64 	%fd80, %fd82;
	setp.lt.s32	%p30, %r62, 256;
	@%p30 bra 	BB12_36;

	setp.gt.u32	%p31, %r1, 127;
	mov.f64 	%fd81, %fd80;
	@%p31 bra 	BB12_35;

	ld.shared.f64 	%fd12, [%rd3+1024];
	setp.geu.f64	%p32, %fd80, %fd12;
	mov.f64 	%fd61, %fd80;
	mov.f64 	%fd81, %fd61;
	@%p32 bra 	BB12_35;

	ld.shared.u32 	%r25, [%rd4+512];
	div.s32 	%r26, %r25, %r60;
	rem.s32 	%r27, %r25, %r60;
	mov.u32 	%r139, 0;
	setp.lt.s32	%p33, %r63, 1;
	@%p33 bra 	BB12_34;

BB12_31:
	cvt.s64.s32	%rd6, %r139;
	mul.wide.s32 	%rd55, %r139, 4;
	mov.u64 	%rd56, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd57, %rd56, %rd55;
	ld.shared.u32 	%r94, [%rd57];
	setp.ne.s32	%p34, %r94, %r26;
	@%p34 bra 	BB12_33;

	shl.b64 	%rd58, %rd6, 2;
	mov.u64 	%rd59, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd60, %rd59, %rd58;
	ld.shared.u32 	%r95, [%rd60];
	setp.eq.s32	%p35, %r95, %r27;
	mov.f64 	%fd62, %fd80;
	mov.f64 	%fd81, %fd62;
	@%p35 bra 	BB12_35;

BB12_33:
	add.s32 	%r139, %r139, 1;
	setp.lt.s32	%p36, %r139, %r63;
	@%p36 bra 	BB12_31;

BB12_34:
	st.shared.f64 	[%rd3], %fd12;
	st.shared.u32 	[%rd4], %r25;
	mov.f64 	%fd81, %fd12;

BB12_35:
	mov.f64 	%fd80, %fd81;
	bar.sync 	0;

BB12_36:
	mov.f64 	%fd78, %fd80;
	setp.lt.s32	%p37, %r62, 128;
	@%p37 bra 	BB12_45;

	setp.gt.u32	%p38, %r1, 63;
	mov.f64 	%fd79, %fd78;
	@%p38 bra 	BB12_44;

	ld.shared.f64 	%fd15, [%rd3+512];
	setp.geu.f64	%p39, %fd78, %fd15;
	mov.f64 	%fd56, %fd78;
	mov.f64 	%fd79, %fd56;
	@%p39 bra 	BB12_44;

	ld.shared.u32 	%r30, [%rd4+256];
	div.s32 	%r31, %r30, %r60;
	rem.s32 	%r32, %r30, %r60;
	mov.u32 	%r140, 0;
	setp.lt.s32	%p40, %r63, 1;
	@%p40 bra 	BB12_43;

BB12_40:
	cvt.s64.s32	%rd7, %r140;
	mul.wide.s32 	%rd61, %r140, 4;
	mov.u64 	%rd62, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd63, %rd62, %rd61;
	ld.shared.u32 	%r98, [%rd63];
	setp.ne.s32	%p41, %r98, %r31;
	@%p41 bra 	BB12_42;

	shl.b64 	%rd64, %rd7, 2;
	mov.u64 	%rd65, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd66, %rd65, %rd64;
	ld.shared.u32 	%r99, [%rd66];
	setp.eq.s32	%p42, %r99, %r32;
	mov.f64 	%fd57, %fd78;
	mov.f64 	%fd79, %fd57;
	@%p42 bra 	BB12_44;

BB12_42:
	add.s32 	%r140, %r140, 1;
	setp.lt.s32	%p43, %r140, %r63;
	@%p43 bra 	BB12_40;

BB12_43:
	st.shared.f64 	[%rd3], %fd15;
	st.shared.u32 	[%rd4], %r30;
	mov.f64 	%fd79, %fd15;

BB12_44:
	mov.f64 	%fd78, %fd79;
	bar.sync 	0;

BB12_45:
	mov.f64 	%fd17, %fd78;
	setp.gt.u32	%p44, %r1, 31;
	@%p44 bra 	BB12_89;

	setp.lt.s32	%p45, %r62, 64;
	mov.f64 	%fd77, %fd17;
	@%p45 bra 	BB12_53;

	ld.volatile.shared.f64 	%fd30, [%rd3+256];
	setp.geu.f64	%p46, %fd17, %fd30;
	mov.f64 	%fd51, %fd17;
	mov.f64 	%fd77, %fd51;
	@%p46 bra 	BB12_53;

	ld.volatile.shared.u32 	%r102, [%rd4+128];
	div.s32 	%r35, %r102, %r60;
	rem.s32 	%r36, %r102, %r60;
	mov.u32 	%r141, 0;
	setp.lt.s32	%p47, %r63, 1;
	@%p47 bra 	BB12_52;

BB12_49:
	cvt.s64.s32	%rd8, %r141;
	mul.wide.s32 	%rd67, %r141, 4;
	mov.u64 	%rd68, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd69, %rd68, %rd67;
	ld.volatile.shared.u32 	%r103, [%rd69];
	setp.ne.s32	%p48, %r103, %r35;
	@%p48 bra 	BB12_51;

	shl.b64 	%rd70, %rd8, 2;
	mov.u64 	%rd71, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd72, %rd71, %rd70;
	ld.volatile.shared.u32 	%r104, [%rd72];
	setp.eq.s32	%p49, %r104, %r36;
	mov.f64 	%fd52, %fd17;
	mov.f64 	%fd77, %fd52;
	@%p49 bra 	BB12_53;

BB12_51:
	add.s32 	%r141, %r141, 1;
	setp.lt.s32	%p50, %r141, %r63;
	@%p50 bra 	BB12_49;

BB12_52:
	ld.volatile.shared.f64 	%fd77, [%rd3+256];
	st.volatile.shared.f64 	[%rd3], %fd77;
	ld.volatile.shared.u32 	%r105, [%rd4+128];
	st.volatile.shared.u32 	[%rd4], %r105;

BB12_53:
	mov.f64 	%fd19, %fd77;
	setp.lt.s32	%p51, %r62, 32;
	mov.f64 	%fd76, %fd19;
	@%p51 bra 	BB12_60;

	ld.volatile.shared.f64 	%fd31, [%rd3+128];
	setp.geu.f64	%p52, %fd19, %fd31;
	mov.f64 	%fd48, %fd19;
	mov.f64 	%fd76, %fd48;
	@%p52 bra 	BB12_60;

	ld.volatile.shared.u32 	%r107, [%rd4+64];
	div.s32 	%r39, %r107, %r60;
	rem.s32 	%r40, %r107, %r60;
	mov.u32 	%r142, 0;
	setp.lt.s32	%p53, %r63, 1;
	@%p53 bra 	BB12_59;

BB12_56:
	cvt.s64.s32	%rd9, %r142;
	mul.wide.s32 	%rd73, %r142, 4;
	mov.u64 	%rd74, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd75, %rd74, %rd73;
	ld.volatile.shared.u32 	%r108, [%rd75];
	setp.ne.s32	%p54, %r108, %r39;
	@%p54 bra 	BB12_58;

	shl.b64 	%rd76, %rd9, 2;
	mov.u64 	%rd77, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd78, %rd77, %rd76;
	ld.volatile.shared.u32 	%r109, [%rd78];
	setp.eq.s32	%p55, %r109, %r40;
	mov.f64 	%fd49, %fd19;
	mov.f64 	%fd76, %fd49;
	@%p55 bra 	BB12_60;

BB12_58:
	add.s32 	%r142, %r142, 1;
	setp.lt.s32	%p56, %r142, %r63;
	@%p56 bra 	BB12_56;

BB12_59:
	ld.volatile.shared.f64 	%fd76, [%rd3+128];
	st.volatile.shared.f64 	[%rd3], %fd76;
	ld.volatile.shared.u32 	%r110, [%rd4+64];
	st.volatile.shared.u32 	[%rd4], %r110;

BB12_60:
	mov.f64 	%fd21, %fd76;
	setp.lt.s32	%p57, %r62, 16;
	mov.f64 	%fd75, %fd21;
	@%p57 bra 	BB12_67;

	ld.volatile.shared.f64 	%fd32, [%rd3+64];
	setp.geu.f64	%p58, %fd21, %fd32;
	mov.f64 	%fd45, %fd21;
	mov.f64 	%fd75, %fd45;
	@%p58 bra 	BB12_67;

	ld.volatile.shared.u32 	%r112, [%rd4+32];
	div.s32 	%r43, %r112, %r60;
	rem.s32 	%r44, %r112, %r60;
	mov.u32 	%r143, 0;
	setp.lt.s32	%p59, %r63, 1;
	@%p59 bra 	BB12_66;

BB12_63:
	cvt.s64.s32	%rd10, %r143;
	mul.wide.s32 	%rd79, %r143, 4;
	mov.u64 	%rd80, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd81, %rd80, %rd79;
	ld.volatile.shared.u32 	%r113, [%rd81];
	setp.ne.s32	%p60, %r113, %r43;
	@%p60 bra 	BB12_65;

	shl.b64 	%rd82, %rd10, 2;
	mov.u64 	%rd83, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd84, %rd83, %rd82;
	ld.volatile.shared.u32 	%r114, [%rd84];
	setp.eq.s32	%p61, %r114, %r44;
	mov.f64 	%fd46, %fd21;
	mov.f64 	%fd75, %fd46;
	@%p61 bra 	BB12_67;

BB12_65:
	add.s32 	%r143, %r143, 1;
	setp.lt.s32	%p62, %r143, %r63;
	@%p62 bra 	BB12_63;

BB12_66:
	ld.volatile.shared.f64 	%fd75, [%rd3+64];
	st.volatile.shared.f64 	[%rd3], %fd75;
	ld.volatile.shared.u32 	%r115, [%rd4+32];
	st.volatile.shared.u32 	[%rd4], %r115;

BB12_67:
	mov.f64 	%fd23, %fd75;
	setp.lt.s32	%p63, %r62, 8;
	mov.f64 	%fd74, %fd23;
	@%p63 bra 	BB12_74;

	ld.volatile.shared.f64 	%fd33, [%rd3+32];
	setp.geu.f64	%p64, %fd23, %fd33;
	mov.f64 	%fd42, %fd23;
	mov.f64 	%fd74, %fd42;
	@%p64 bra 	BB12_74;

	ld.volatile.shared.u32 	%r117, [%rd4+16];
	div.s32 	%r47, %r117, %r60;
	rem.s32 	%r48, %r117, %r60;
	mov.u32 	%r144, 0;
	setp.lt.s32	%p65, %r63, 1;
	@%p65 bra 	BB12_73;

BB12_70:
	cvt.s64.s32	%rd11, %r144;
	mul.wide.s32 	%rd85, %r144, 4;
	mov.u64 	%rd86, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd87, %rd86, %rd85;
	ld.volatile.shared.u32 	%r118, [%rd87];
	setp.ne.s32	%p66, %r118, %r47;
	@%p66 bra 	BB12_72;

	shl.b64 	%rd88, %rd11, 2;
	mov.u64 	%rd89, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd90, %rd89, %rd88;
	ld.volatile.shared.u32 	%r119, [%rd90];
	setp.eq.s32	%p67, %r119, %r48;
	mov.f64 	%fd43, %fd23;
	mov.f64 	%fd74, %fd43;
	@%p67 bra 	BB12_74;

BB12_72:
	add.s32 	%r144, %r144, 1;
	setp.lt.s32	%p68, %r144, %r63;
	@%p68 bra 	BB12_70;

BB12_73:
	ld.volatile.shared.f64 	%fd74, [%rd3+32];
	st.volatile.shared.f64 	[%rd3], %fd74;
	ld.volatile.shared.u32 	%r120, [%rd4+16];
	st.volatile.shared.u32 	[%rd4], %r120;

BB12_74:
	mov.f64 	%fd25, %fd74;
	setp.lt.s32	%p69, %r62, 4;
	mov.f64 	%fd73, %fd25;
	@%p69 bra 	BB12_81;

	ld.volatile.shared.f64 	%fd34, [%rd3+16];
	setp.geu.f64	%p70, %fd25, %fd34;
	mov.f64 	%fd39, %fd25;
	mov.f64 	%fd73, %fd39;
	@%p70 bra 	BB12_81;

	ld.volatile.shared.u32 	%r122, [%rd4+8];
	div.s32 	%r51, %r122, %r60;
	rem.s32 	%r52, %r122, %r60;
	mov.u32 	%r145, 0;
	setp.lt.s32	%p71, %r63, 1;
	@%p71 bra 	BB12_80;

BB12_77:
	cvt.s64.s32	%rd12, %r145;
	mul.wide.s32 	%rd91, %r145, 4;
	mov.u64 	%rd92, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd93, %rd92, %rd91;
	ld.volatile.shared.u32 	%r123, [%rd93];
	setp.ne.s32	%p72, %r123, %r51;
	@%p72 bra 	BB12_79;

	shl.b64 	%rd94, %rd12, 2;
	mov.u64 	%rd95, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd96, %rd95, %rd94;
	ld.volatile.shared.u32 	%r124, [%rd96];
	setp.eq.s32	%p73, %r124, %r52;
	mov.f64 	%fd40, %fd25;
	mov.f64 	%fd73, %fd40;
	@%p73 bra 	BB12_81;

BB12_79:
	add.s32 	%r145, %r145, 1;
	setp.lt.s32	%p74, %r145, %r63;
	@%p74 bra 	BB12_77;

BB12_80:
	ld.volatile.shared.f64 	%fd73, [%rd3+16];
	st.volatile.shared.f64 	[%rd3], %fd73;
	ld.volatile.shared.u32 	%r125, [%rd4+8];
	st.volatile.shared.u32 	[%rd4], %r125;

BB12_81:
	setp.lt.s32	%p75, %r62, 2;
	@%p75 bra 	BB12_88;

	ld.volatile.shared.f64 	%fd35, [%rd3+8];
	setp.geu.f64	%p76, %fd73, %fd35;
	@%p76 bra 	BB12_88;

	ld.volatile.shared.u32 	%r127, [%rd4+4];
	div.s32 	%r55, %r127, %r60;
	rem.s32 	%r56, %r127, %r60;
	mov.u32 	%r146, 0;
	setp.lt.s32	%p77, %r63, 1;
	@%p77 bra 	BB12_87;

BB12_84:
	cvt.s64.s32	%rd13, %r146;
	mul.wide.s32 	%rd97, %r146, 4;
	mov.u64 	%rd98, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
	add.s64 	%rd99, %rd98, %rd97;
	ld.volatile.shared.u32 	%r128, [%rd99];
	setp.ne.s32	%p78, %r128, %r55;
	@%p78 bra 	BB12_86;

	shl.b64 	%rd100, %rd13, 2;
	mov.u64 	%rd101, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
	add.s64 	%rd102, %rd101, %rd100;
	ld.volatile.shared.u32 	%r129, [%rd102];
	setp.eq.s32	%p79, %r129, %r56;
	@%p79 bra 	BB12_88;

BB12_86:
	add.s32 	%r146, %r146, 1;
	setp.lt.s32	%p80, %r146, %r63;
	@%p80 bra 	BB12_84;

BB12_87:
	ld.volatile.shared.f64 	%fd36, [%rd3+8];
	st.volatile.shared.f64 	[%rd3], %fd36;
	ld.volatile.shared.u32 	%r130, [%rd4+4];
	st.volatile.shared.u32 	[%rd4], %r130;

BB12_88:
	bar.sync 	0;

BB12_89:
	setp.ne.s32	%p81, %r1, 0;
	@%p81 bra 	BB12_92;

	ld.shared.f64 	%fd37, [reduce_max_filter_main$__cuda_local_var_44798_33_non_const_sdata];
	cvta.to.global.u64 	%rd103, %rd15;
	mul.wide.u32 	%rd104, %r68, 8;
	add.s64 	%rd105, %rd103, %rd104;
	st.global.f64 	[%rd105], %fd37;
	ld.shared.u32 	%r59, [reduce_max_filter_main$__cuda_local_var_44799_30_non_const_idxData];
	cvta.to.global.u64 	%rd106, %rd16;
	mul.wide.u32 	%rd107, %r68, 4;
	add.s64 	%rd108, %rd106, %rd107;
	st.global.u32 	[%rd108], %r59;
	mov.u32 	%r133, %nctaid.x;
	setp.ne.s32	%p82, %r133, 1;
	@%p82 bra 	BB12_92;

	cvta.to.global.u64 	%rd109, %rd17;
	mul.wide.s32 	%rd110, %r63, 4;
	add.s64 	%rd111, %rd109, %rd110;
	st.global.u32 	[%rd111], %r59;

BB12_92:
	ret;
}

	// .globl	elt_prod_conjf
// elt_prod_conjf
//
// Per-element normalized conjugate product of two arrays of single-precision
// complex values (8 bytes per element, real part first):
//
//     out[i] = (a[i] * conj(b[i])) / |a[i] * conj(b[i])|
//
//   param_0 : out (generic address, converted to global before the store)
//   param_1 : a
//   param_2 : b
//   param_3 : element count, compared as signed 32-bit
//
// Thread mapping: i = %ctaid.x * 256 + %tid.x; threads with i >= count exit
// without touching memory.
//
// The magnitude is computed in the scaled form hi * sqrt(1 + (lo/hi)^2) with
// hi = max(|re|,|im|) and lo = min(|re|,|im|); when hi is zero or either
// operand exceeds the largest finite float, hi + lo is used instead.
// If the resulting magnitude is zero or NaN, both the magnitude and the real
// part are replaced by 0f34000000 (2^-23) before the division.
//
// Change from the generated code: the bar.sync that used to sit between the
// shared-memory stores and loads has been removed. It was placed after the
// bounds-check exit, so in a partially filled block only some threads reached
// it, and every thread reads back only the shared slots it wrote itself, so no
// inter-thread ordering is needed.
.visible .entry elt_prod_conjf(
	.param .u64 elt_prod_conjf_param_0,
	.param .u64 elt_prod_conjf_param_1,
	.param .u64 elt_prod_conjf_param_2,
	.param .u32 elt_prod_conjf_param_3
)
{
	.reg .pred 	%p<10>;
	.reg .f32 	%f<41>;
	.reg .b32 	%r<7>;
	.reg .b64 	%rd<22>;
	// demoted variable
	.shared .align 8 .b8 elt_prod_conjf$__cuda_local_var_45052_39_non_const_sfc[2048];
	// demoted variable
	.shared .align 8 .b8 elt_prod_conjf$__cuda_local_var_45053_39_non_const_sc1[2048];
	// demoted variable
	.shared .align 8 .b8 elt_prod_conjf$__cuda_local_var_45054_39_non_const_sc2[2048];

	ld.param.u64 	%rd6, [elt_prod_conjf_param_0];
	ld.param.u64 	%rd7, [elt_prod_conjf_param_1];
	ld.param.u64 	%rd8, [elt_prod_conjf_param_2];
	ld.param.u32 	%r3, [elt_prod_conjf_param_3];
	// %r2 = %ctaid.x * 256 + %tid.x : the element handled by this thread.
	mov.u32 	%r4, %ctaid.x;
	shl.b32 	%r5, %r4, 8;
	mov.u32 	%r1, %tid.x;
	add.s32 	%r2, %r5, %r1;
	setp.ge.s32	%p1, %r2, %r3;
	@%p1 bra 	BB13_5;

	// Stage a[i] into sc1[tid] and b[i] into sc2[tid] (8-byte slots).
	cvta.to.global.u64 	%rd9, %rd7;
	cvt.u64.u32	%rd1, %r1;
	mul.wide.u32 	%rd10, %r1, 8;
	mov.u64 	%rd11, elt_prod_conjf$__cuda_local_var_45053_39_non_const_sc1;
	add.s64 	%rd2, %rd11, %rd10;
	cvt.s64.s32	%rd3, %r2;
	mul.wide.s32 	%rd12, %r2, 8;
	add.s64 	%rd13, %rd9, %rd12;
	ld.global.v2.f32 	{%f6, %f7}, [%rd13];
	st.shared.v2.f32 	[%rd2], {%f6, %f7};
	mov.u64 	%rd14, elt_prod_conjf$__cuda_local_var_45054_39_non_const_sc2;
	add.s64 	%rd4, %rd14, %rd10;
	cvta.to.global.u64 	%rd15, %rd8;
	add.s64 	%rd16, %rd15, %rd12;
	ld.global.v2.f32 	{%f10, %f11}, [%rd16];
	st.shared.v2.f32 	[%rd4], {%f10, %f11};
	// No barrier: the loads below read back this thread's own slots only.
	ld.shared.v2.f32 	{%f14, %f15}, [%rd4];
	ld.shared.v2.f32 	{%f18, %f19}, [%rd2];
	// re (%f39) = a.x*b.x + a.y*b.y ; im (%f2) = a.y*b.x - a.x*b.y
	mul.f32 	%f22, %f19, %f15;
	fma.rn.f32 	%f39, %f18, %f14, %f22;
	mul.f32 	%f23, %f18, %f15;
	mul.f32 	%f24, %f19, %f14;
	sub.f32 	%f2, %f24, %f23;
	// Keep a copy of the product in sfc[tid].
	shl.b64 	%rd17, %rd1, 3;
	mov.u64 	%rd18, elt_prod_conjf$__cuda_local_var_45052_39_non_const_sfc;
	add.s64 	%rd5, %rd18, %rd17;
	st.shared.v2.f32 	[%rd5], {%f39, %f2};
	// Magnitude: %f27 = max(|re|,|im|), %f28 = min(|re|,|im|),
	// %f32 = %f27 * sqrt(1 + (%f28/%f27)^2).
	abs.f32 	%f25, %f39;
	abs.f32 	%f26, %f2;
	setp.gt.f32	%p2, %f25, %f26;
	selp.f32	%f27, %f25, %f26, %p2;
	selp.f32	%f28, %f26, %f25, %p2;
	div.rn.f32 	%f29, %f28, %f27;
	fma.rn.f32 	%f30, %f29, %f29, 0f3F800000;
	sqrt.rn.f32 	%f31, %f30;
	mul.f32 	%f32, %f27, %f31;
	// Use %f27 + %f28 instead when the larger part is zero or either part is
	// above the largest finite float (0f7F7FFFFF).
	setp.eq.f32	%p3, %f27, 0f00000000;
	setp.gt.f32	%p4, %f27, 0f7F7FFFFF;
	or.pred  	%p5, %p3, %p4;
	setp.gt.f32	%p6, %f28, 0f7F7FFFFF;
	or.pred  	%p7, %p5, %p6;
	add.f32 	%f33, %f27, %f28;
	selp.f32	%f40, %f33, %f32, %p7;
	// Zero magnitude -> BB13_3. Otherwise the ordered compare against +inf
	// succeeds for every non-NaN value and skips to BB13_4; NaN falls through.
	setp.eq.f32	%p8, %f40, 0f00000000;
	@%p8 bra 	BB13_3;

	abs.f32 	%f34, %f40;
	setp.le.f32	%p9, %f34, 0f7F800000;
	@%p9 bra 	BB13_4;

BB13_3:
	// Degenerate magnitude: 872415232 == 0x34000000 == 2^-23 as a float.
	// It overwrites the real word of sfc[tid] and replaces both the real part
	// and the divisor; the imaginary part %f2 keeps its computed value.
	mov.u32 	%r6, 872415232;
	st.shared.u32 	[%rd5], %r6;
	mov.f32 	%f40, 0f34000000;
	mov.f32 	%f39, %f40;

BB13_4:
	// out[i] = (re / mag, im / mag)
	cvta.to.global.u64 	%rd19, %rd6;
	shl.b64 	%rd20, %rd3, 3;
	add.s64 	%rd21, %rd19, %rd20;
	div.rn.f32 	%f37, %f2, %f40;
	div.rn.f32 	%f38, %f39, %f40;
	st.global.v2.f32 	[%rd21], {%f38, %f37};

BB13_5:
	ret;
}

	// .globl	elt_prod_conj_v2f
// elt_prod_conj_v2f
//
// Per-element normalized conjugate product of two arrays of single-precision
// complex values (8 bytes per element, real part first):
//
//     out[i] = (a[i] * conj(b[i])) / sqrt(re^2 + im^2)
//
//   param_0 : out (generic address, converted to global before the store)
//   param_1 : a
//   param_2 : b
//   param_3 : element count, compared as signed 32-bit
//
// Thread mapping: i = %ctaid.x * 256 + %tid.x; threads with i >= count exit
// without touching memory. The product is staged through sfc[tid].
// If the magnitude is zero or NaN, both the magnitude and the real part are
// replaced by 0f34000000 (2^-23) before the division.
//
// Change from the generated code: the bar.sync between the shared store and
// the shared load has been removed. It followed the bounds-check exit, so a
// partially filled block reached it with only some of its threads, and each
// thread reloads only the slot it stored itself.
.visible .entry elt_prod_conj_v2f(
	.param .u64 elt_prod_conj_v2f_param_0,
	.param .u64 elt_prod_conj_v2f_param_1,
	.param .u64 elt_prod_conj_v2f_param_2,
	.param .u32 elt_prod_conj_v2f_param_3
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<30>;
	.reg .b32 	%r<7>;
	.reg .b64 	%rd<16>;
	// demoted variable
	.shared .align 8 .b8 elt_prod_conj_v2f$__cuda_local_var_45085_39_non_const_sfc[2048];

	ld.param.u64 	%rd3, [elt_prod_conj_v2f_param_0];
	ld.param.u64 	%rd4, [elt_prod_conj_v2f_param_1];
	ld.param.u64 	%rd5, [elt_prod_conj_v2f_param_2];
	ld.param.u32 	%r3, [elt_prod_conj_v2f_param_3];
	// %r2 = %ctaid.x * 256 + %tid.x : the element handled by this thread.
	mov.u32 	%r4, %ctaid.x;
	shl.b32 	%r5, %r4, 8;
	mov.u32 	%r1, %tid.x;
	add.s32 	%r2, %r5, %r1;
	setp.ge.s32	%p1, %r2, %r3;
	@%p1 bra 	BB14_4;

	// {%f10,%f11} = a[i], {%f6,%f7} = b[i]; both use byte offset i * 8.
	cvta.to.global.u64 	%rd6, %rd4;
	cvt.s64.s32	%rd1, %r2;
	mul.wide.s32 	%rd7, %r2, 8;
	add.s64 	%rd8, %rd6, %rd7;
	cvta.to.global.u64 	%rd9, %rd5;
	add.s64 	%rd10, %rd9, %rd7;
	ld.global.v2.f32 	{%f6, %f7}, [%rd10];
	ld.global.v2.f32 	{%f10, %f11}, [%rd8];
	// re (%f18) = a.x*b.x + a.y*b.y ; im (%f17) = a.y*b.x - a.x*b.y
	mul.f32 	%f14, %f11, %f7;
	mul.f32 	%f15, %f10, %f7;
	mul.f32 	%f16, %f11, %f6;
	mul.wide.u32 	%rd11, %r1, 8;
	mov.u64 	%rd12, elt_prod_conj_v2f$__cuda_local_var_45085_39_non_const_sfc;
	add.s64 	%rd2, %rd12, %rd11;
	sub.f32 	%f17, %f16, %f15;
	fma.rn.f32 	%f18, %f10, %f6, %f14;
	st.shared.v2.f32 	[%rd2], {%f18, %f17};
	// No barrier: the load below reads back this thread's own slot only.
	ld.shared.v2.f32 	{%f19, %f20}, [%rd2];
	mov.f32 	%f28, %f19;
	// %f29 = sqrt(re^2 + im^2)
	mul.f32 	%f21, %f20, %f20;
	fma.rn.f32 	%f22, %f19, %f19, %f21;
	sqrt.rn.f32 	%f29, %f22;
	// %p2: unordered compare against +inf, true only for NaN.
	// %p3: magnitude is exactly zero.
	abs.f32 	%f23, %f29;
	setp.gtu.f32	%p2, %f23, 0f7F800000;
	setp.eq.f32	%p3, %f29, 0f00000000;
	or.pred  	%p4, %p2, %p3;
	@!%p4 bra 	BB14_3;
	bra.uni 	BB14_2;

BB14_2:
	// Degenerate magnitude: 872415232 == 0x34000000 == 2^-23 as a float.
	// It overwrites the real word of sfc[tid] and replaces both the real part
	// and the divisor; the imaginary part %f20 keeps its computed value.
	mov.u32 	%r6, 872415232;
	st.shared.u32 	[%rd2], %r6;
	mov.f32 	%f29, 0f34000000;
	mov.f32 	%f28, %f29;

BB14_3:
	// out[i] = (re / mag, im / mag)
	cvta.to.global.u64 	%rd13, %rd3;
	shl.b64 	%rd14, %rd1, 3;
	add.s64 	%rd15, %rd13, %rd14;
	div.rn.f32 	%f26, %f20, %f29;
	div.rn.f32 	%f27, %f28, %f29;
	st.global.v2.f32 	[%rd15], {%f27, %f26};

BB14_4:
	ret;
}

	// .globl	elt_prod_conj_v3f
// elt_prod_conj_v3f
//
// Register-only variant of the normalized conjugate product:
//
//     out[i] = (a[i] * conj(b[i])) / |a[i] * conj(b[i])|
//
//   param_0 : out   param_1 : a   param_2 : b   (8 bytes per element)
//   param_3 : element count, compared as signed 32-bit
//
// Thread mapping: i = %ctaid.x * 256 + %tid.x. The magnitude is first taken as
// sqrt(re^2 + im^2); if that is zero or NaN it is recomputed in the scaled
// form hi * sqrt(1 + (lo/hi)^2). A magnitude that is still exactly zero is
// replaced by 0f34000000 (2^-23) before dividing.
.visible .entry elt_prod_conj_v3f(
	.param .u64 elt_prod_conj_v3f_param_0,
	.param .u64 elt_prod_conj_v3f_param_1,
	.param .u64 elt_prod_conj_v3f_param_2,
	.param .u32 elt_prod_conj_v3f_param_3
)
{
	.reg .pred 	%out_of_range, %mag_nan, %mag_zero, %need_fallback;
	.reg .pred 	%re_larger, %hi_zero, %hi_inf, %lo_inf, %hi_special, %use_sum;
	.reg .pred 	%div_zero;
	.reg .f32 	%ax, %ay, %bx, %by;
	.reg .f32 	%re, %im, %yy, %xy, %yx, %im_sq, %norm_sq, %mag, %mag_abs;
	.reg .f32 	%re_abs, %im_abs, %hi, %lo, %ratio, %ratio_sq1, %root;
	.reg .f32 	%scaled, %sum, %den, %out_re, %out_im;
	.reg .b32 	%count, %block, %base, %thread, %idx;
	.reg .b64 	%out_ptr, %a_ptr, %b_ptr, %out_glob, %a_glob, %b_glob;
	.reg .b64 	%byte_off, %out_addr, %a_addr, %b_addr;

	ld.param.u64 	%out_ptr, [elt_prod_conj_v3f_param_0];
	ld.param.u64 	%a_ptr, [elt_prod_conj_v3f_param_1];
	ld.param.u64 	%b_ptr, [elt_prod_conj_v3f_param_2];
	ld.param.u32 	%count, [elt_prod_conj_v3f_param_3];

	// idx = %ctaid.x * 256 + %tid.x; nothing to do past the end of the data.
	mov.u32 	%block, %ctaid.x;
	shl.b32 	%base, %block, 8;
	mov.u32 	%thread, %tid.x;
	add.s32 	%idx, %base, %thread;
	setp.ge.s32	%out_of_range, %idx, %count;
	@%out_of_range bra 	V3F_DONE;

	// All three arrays are addressed with the same byte offset, idx * 8.
	mul.wide.s32 	%byte_off, %idx, 8;
	cvta.to.global.u64 	%a_glob, %a_ptr;
	add.s64 	%a_addr, %a_glob, %byte_off;
	cvta.to.global.u64 	%b_glob, %b_ptr;
	add.s64 	%b_addr, %b_glob, %byte_off;
	ld.global.v2.f32 	{%bx, %by}, [%b_addr];
	ld.global.v2.f32 	{%ax, %ay}, [%a_addr];

	// re = ax*bx + ay*by ; im = ay*bx - ax*by
	mul.f32 	%yy, %ay, %by;
	fma.rn.f32 	%re, %ax, %bx, %yy;
	mul.f32 	%xy, %ax, %by;
	mul.f32 	%yx, %ay, %bx;
	sub.f32 	%im, %yx, %xy;

	// First attempt: mag = sqrt(re^2 + im^2).
	mul.f32 	%im_sq, %im, %im;
	fma.rn.f32 	%norm_sq, %re, %re, %im_sq;
	sqrt.rn.f32 	%mag, %norm_sq;

	// The unordered compare against +inf is true only for NaN.
	abs.f32 	%mag_abs, %mag;
	setp.gtu.f32	%mag_nan, %mag_abs, 0f7F800000;
	setp.eq.f32	%mag_zero, %mag, 0f00000000;
	or.pred  	%need_fallback, %mag_nan, %mag_zero;
	@!%need_fallback bra 	V3F_NORMALIZE;

	// Second attempt: hi = max(|re|,|im|), lo = min(|re|,|im|),
	// mag = hi * sqrt(1 + (lo/hi)^2); hi + lo is used when hi is zero or
	// either part is above the largest finite float (0f7F7FFFFF).
	abs.f32 	%re_abs, %re;
	abs.f32 	%im_abs, %im;
	setp.gt.f32	%re_larger, %re_abs, %im_abs;
	selp.f32	%hi, %re_abs, %im_abs, %re_larger;
	selp.f32	%lo, %im_abs, %re_abs, %re_larger;
	div.rn.f32 	%ratio, %lo, %hi;
	fma.rn.f32 	%ratio_sq1, %ratio, %ratio, 0f3F800000;
	sqrt.rn.f32 	%root, %ratio_sq1;
	mul.f32 	%scaled, %hi, %root;
	setp.eq.f32	%hi_zero, %hi, 0f00000000;
	setp.gt.f32	%hi_inf, %hi, 0f7F7FFFFF;
	or.pred  	%hi_special, %hi_zero, %hi_inf;
	setp.gt.f32	%lo_inf, %lo, 0f7F7FFFFF;
	or.pred  	%use_sum, %hi_special, %lo_inf;
	add.f32 	%sum, %hi, %lo;
	selp.f32	%mag, %sum, %scaled, %use_sum;

V3F_NORMALIZE:
	// Guard the division: an exactly-zero magnitude becomes 2^-23.
	setp.eq.f32	%div_zero, %mag, 0f00000000;
	selp.f32	%den, 0f34000000, %mag, %div_zero;
	cvta.to.global.u64 	%out_glob, %out_ptr;
	add.s64 	%out_addr, %out_glob, %byte_off;
	div.rn.f32 	%out_im, %im, %den;
	div.rn.f32 	%out_re, %re, %den;
	st.global.v2.f32 	[%out_addr], {%out_re, %out_im};

V3F_DONE:
	ret;
}

	// .globl	reduce_max_finalf
// reduce_max_finalf
//
// Block-level maximum reduction over an f32 array that carries an index
// alongside each value.
//
//   param_0 : input values (f32)
//   param_1 : output, one maximum per block, written at [%ctaid.x]
//   param_2 : index array; read at the position of every new running maximum
//             and written at [%ctaid.x] with the index of the block's winner
//   param_3 : element count (compared unsigned)
//   param_4 : block size; sets the load stride and gates the reduction steps
//
// Phase 1: each thread scans elements i and i + param_4, starting at
//          i = %ctaid.x * 2 * param_4 + %tid.x and advancing by
//          %nctaid.x * 2 * param_4. The running maximum starts at 0.0 and is
//          replaced only by strictly larger values.
// Phase 2: shared-memory tree reduction with barriers (offsets 256/128/64).
// Phase 3: threads 0..31 finish with volatile shared accesses and no barriers.
// Phase 4: thread 0 writes the block result.
//
// Changes from the generated code:
//  * %r37 (the running index) is initialised to 0. Previously it was stored to
//    shared memory without ever being written when no element exceeded 0.0.
//  * The bar.sync at the end of phase 3 was removed. Only threads 0..31 could
//    reach it, and thread 0 afterwards reads slot 0, which it alone writes in
//    phase 3.
//
// NOTE(review): sdata/idxData are 1024 bytes (256 four-byte slots) each, while
// the ">= 512" step reads 1024 bytes past a thread's own slot. Presumably the
// host never launches with param_4 >= 512 -- confirm.
.visible .entry reduce_max_finalf(
	.param .u64 reduce_max_finalf_param_0,
	.param .u64 reduce_max_finalf_param_1,
	.param .u64 reduce_max_finalf_param_2,
	.param .u32 reduce_max_finalf_param_3,
	.param .u32 reduce_max_finalf_param_4
)
{
	.reg .pred 	%p<29>;
	.reg .f32 	%f<78>;
	.reg .b32 	%r<38>;
	.reg .b64 	%rd<28>;
	// demoted variable
	.shared .align 4 .b8 reduce_max_finalf$__cuda_local_var_45150_32_non_const_sdata[1024];
	// demoted variable
	.shared .align 4 .b8 reduce_max_finalf$__cuda_local_var_45151_30_non_const_idxData[1024];

	ld.param.u64 	%rd5, [reduce_max_finalf_param_0];
	ld.param.u64 	%rd6, [reduce_max_finalf_param_1];
	ld.param.u64 	%rd7, [reduce_max_finalf_param_2];
	ld.param.u32 	%r11, [reduce_max_finalf_param_3];
	ld.param.u32 	%r12, [reduce_max_finalf_param_4];
	// %r36 = first element for this thread; %r15 = 2 * block size.
	mov.u32 	%r14, %tid.x;
	shl.b32 	%r15, %r12, 1;
	mov.u32 	%r16, %ctaid.x;
	mad.lo.s32 	%r36, %r16, %r15, %r14;
	// Running maximum starts at 0.0; the running index starts at 0 so that a
	// defined value is published when no element is ever selected.
	mov.f32 	%f74, 0f00000000;
	mov.f32 	%f77, %f74;
	mov.u32 	%r37, 0;
	setp.ge.u32	%p1, %r36, %r11;
	@%p1 bra 	BB16_7;

BB16_1:
	// Phase 1, first element of the pair: values[%r36].
	mov.f32 	%f60, %f77;
	mov.f32 	%f1, %f60;
	cvta.to.global.u64 	%rd8, %rd5;
	cvt.u64.u32	%rd1, %r36;
	mul.wide.u32 	%rd9, %r36, 4;
	add.s64 	%rd10, %rd8, %rd9;
	ld.global.f32 	%f2, [%rd10];
	setp.geu.f32	%p2, %f1, %f2;
	mov.f32 	%f75, %f1;
	@%p2 bra 	BB16_3;

	// New maximum: pick up the matching entry of the index array.
	cvta.to.global.u64 	%rd11, %rd7;
	shl.b64 	%rd12, %rd1, 2;
	add.s64 	%rd13, %rd11, %rd12;
	ld.global.u32 	%r37, [%rd13];
	mov.f32 	%f75, %f2;

BB16_3:
	// Second element of the pair: values[%r36 + block size], if in range.
	mov.f32 	%f3, %f75;
	add.s32 	%r6, %r36, %r12;
	setp.ge.u32	%p3, %r6, %r11;
	mov.f32 	%f76, %f3;
	@%p3 bra 	BB16_6;

	cvt.u64.u32	%rd2, %r6;
	mul.wide.u32 	%rd15, %r6, 4;
	add.s64 	%rd16, %rd8, %rd15;
	ld.global.f32 	%f4, [%rd16];
	setp.geu.f32	%p4, %f3, %f4;
	mov.f32 	%f59, %f3;
	mov.f32 	%f76, %f59;
	@%p4 bra 	BB16_6;

	cvta.to.global.u64 	%rd17, %rd7;
	shl.b64 	%rd18, %rd2, 2;
	add.s64 	%rd19, %rd17, %rd18;
	ld.global.u32 	%r37, [%rd19];
	mov.f32 	%f76, %f4;

BB16_6:
	// Advance by %nctaid.x * 2 * block size and loop while in range.
	mov.f32 	%f77, %f76;
	mov.u32 	%r18, %nctaid.x;
	mad.lo.s32 	%r36, %r18, %r15, %r36;
	setp.lt.u32	%p5, %r36, %r11;
	mov.f32 	%f74, %f77;
	@%p5 bra 	BB16_1;

BB16_7:
	// Publish this thread's candidate: sdata[tid] = max, idxData[tid] = index.
	mov.f32 	%f72, %f74;
	mul.wide.u32 	%rd20, %r14, 4;
	mov.u64 	%rd21, reduce_max_finalf$__cuda_local_var_45150_32_non_const_sdata;
	add.s64 	%rd3, %rd21, %rd20;
	st.shared.f32 	[%rd3], %f72;
	mov.u64 	%rd22, reduce_max_finalf$__cuda_local_var_45151_30_non_const_idxData;
	add.s64 	%rd4, %rd22, %rd20;
	st.shared.u32 	[%rd4], %r37;
	bar.sync 	0;
	// Phase 2, step for block size >= 512: threads 0..255 fold in slot tid+256.
	setp.lt.s32	%p6, %r12, 512;
	@%p6 bra 	BB16_12;

	setp.gt.u32	%p7, %r14, 255;
	mov.f32 	%f73, %f72;
	@%p7 bra 	BB16_11;

	ld.shared.f32 	%f7, [%rd3+1024];
	setp.geu.f32	%p8, %f72, %f7;
	mov.f32 	%f57, %f72;
	mov.f32 	%f73, %f57;
	@%p8 bra 	BB16_11;

	st.shared.f32 	[%rd3], %f7;
	ld.shared.u32 	%r21, [%rd4+1024];
	st.shared.u32 	[%rd4], %r21;
	mov.f32 	%f73, %f7;

BB16_11:
	mov.f32 	%f72, %f73;
	bar.sync 	0;

BB16_12:
	// Step for block size >= 256: threads 0..127 fold in slot tid+128.
	mov.f32 	%f70, %f72;
	setp.lt.s32	%p9, %r12, 256;
	@%p9 bra 	BB16_17;

	setp.gt.u32	%p10, %r14, 127;
	mov.f32 	%f71, %f70;
	@%p10 bra 	BB16_16;

	ld.shared.f32 	%f10, [%rd3+512];
	setp.geu.f32	%p11, %f70, %f10;
	mov.f32 	%f53, %f70;
	mov.f32 	%f71, %f53;
	@%p11 bra 	BB16_16;

	st.shared.f32 	[%rd3], %f10;
	ld.shared.u32 	%r23, [%rd4+512];
	st.shared.u32 	[%rd4], %r23;
	mov.f32 	%f71, %f10;

BB16_16:
	mov.f32 	%f70, %f71;
	bar.sync 	0;

BB16_17:
	// Step for block size >= 128: threads 0..63 fold in slot tid+64.
	mov.f32 	%f68, %f70;
	setp.lt.s32	%p12, %r12, 128;
	@%p12 bra 	BB16_22;

	setp.gt.u32	%p13, %r14, 63;
	mov.f32 	%f69, %f68;
	@%p13 bra 	BB16_21;

	ld.shared.f32 	%f13, [%rd3+256];
	setp.geu.f32	%p14, %f68, %f13;
	mov.f32 	%f49, %f68;
	mov.f32 	%f69, %f49;
	@%p14 bra 	BB16_21;

	st.shared.f32 	[%rd3], %f13;
	ld.shared.u32 	%r25, [%rd4+256];
	st.shared.u32 	[%rd4], %r25;
	mov.f32 	%f69, %f13;

BB16_21:
	mov.f32 	%f68, %f69;
	bar.sync 	0;

BB16_22:
	// Phase 3: only threads 0..31 continue; offsets 32,16,8,4,2,1 slots are
	// folded in using volatile shared accesses and no barriers.
	mov.f32 	%f67, %f68;
	setp.gt.u32	%p15, %r14, 31;
	@%p15 bra 	BB16_42;

	setp.lt.s32	%p16, %r12, 64;
	@%p16 bra 	BB16_26;

	ld.volatile.shared.f32 	%f28, [%rd3+128];
	setp.geu.f32	%p17, %f67, %f28;
	@%p17 bra 	BB16_26;

	ld.volatile.shared.f32 	%f67, [%rd3+128];
	st.volatile.shared.f32 	[%rd3], %f67;
	ld.volatile.shared.u32 	%r27, [%rd4+128];
	st.volatile.shared.u32 	[%rd4], %r27;

BB16_26:
	mov.f32 	%f66, %f67;
	setp.lt.s32	%p18, %r12, 32;
	@%p18 bra 	BB16_29;

	ld.volatile.shared.f32 	%f29, [%rd3+64];
	setp.geu.f32	%p19, %f66, %f29;
	@%p19 bra 	BB16_29;

	ld.volatile.shared.f32 	%f66, [%rd3+64];
	st.volatile.shared.f32 	[%rd3], %f66;
	ld.volatile.shared.u32 	%r28, [%rd4+64];
	st.volatile.shared.u32 	[%rd4], %r28;

BB16_29:
	mov.f32 	%f65, %f66;
	setp.lt.s32	%p20, %r12, 16;
	@%p20 bra 	BB16_32;

	ld.volatile.shared.f32 	%f30, [%rd3+32];
	setp.geu.f32	%p21, %f65, %f30;
	@%p21 bra 	BB16_32;

	ld.volatile.shared.f32 	%f65, [%rd3+32];
	st.volatile.shared.f32 	[%rd3], %f65;
	ld.volatile.shared.u32 	%r29, [%rd4+32];
	st.volatile.shared.u32 	[%rd4], %r29;

BB16_32:
	mov.f32 	%f64, %f65;
	setp.lt.s32	%p22, %r12, 8;
	@%p22 bra 	BB16_35;

	ld.volatile.shared.f32 	%f31, [%rd3+16];
	setp.geu.f32	%p23, %f64, %f31;
	@%p23 bra 	BB16_35;

	ld.volatile.shared.f32 	%f64, [%rd3+16];
	st.volatile.shared.f32 	[%rd3], %f64;
	ld.volatile.shared.u32 	%r30, [%rd4+16];
	st.volatile.shared.u32 	[%rd4], %r30;

BB16_35:
	mov.f32 	%f63, %f64;
	setp.lt.s32	%p24, %r12, 4;
	@%p24 bra 	BB16_38;

	ld.volatile.shared.f32 	%f32, [%rd3+8];
	setp.geu.f32	%p25, %f63, %f32;
	@%p25 bra 	BB16_38;

	ld.volatile.shared.f32 	%f63, [%rd3+8];
	st.volatile.shared.f32 	[%rd3], %f63;
	ld.volatile.shared.u32 	%r31, [%rd4+8];
	st.volatile.shared.u32 	[%rd4], %r31;

BB16_38:
	setp.lt.s32	%p26, %r12, 2;
	@%p26 bra 	BB16_42;

	ld.volatile.shared.f32 	%f33, [%rd3+4];
	setp.geu.f32	%p27, %f63, %f33;
	@%p27 bra 	BB16_42;

	ld.volatile.shared.f32 	%f34, [%rd3+4];
	st.volatile.shared.f32 	[%rd3], %f34;
	ld.volatile.shared.u32 	%r32, [%rd4+4];
	st.volatile.shared.u32 	[%rd4], %r32;

BB16_42:
	// Phase 4: thread 0 writes slot 0 of both shared arrays to the outputs.
	// No barrier is needed: during phase 3 slot 0 is written by thread 0 only.
	setp.ne.s32	%p28, %r14, 0;
	@%p28 bra 	BB16_44;

	ld.shared.f32 	%f35, [reduce_max_finalf$__cuda_local_var_45150_32_non_const_sdata];
	cvta.to.global.u64 	%rd23, %rd6;
	mul.wide.u32 	%rd24, %r16, 4;
	add.s64 	%rd25, %rd23, %rd24;
	st.global.f32 	[%rd25], %f35;
	ld.shared.u32 	%r35, [reduce_max_finalf$__cuda_local_var_45151_30_non_const_idxData];
	cvta.to.global.u64 	%rd26, %rd7;
	add.s64 	%rd27, %rd26, %rd24;
	st.global.u32 	[%rd27], %r35;

BB16_44:
	ret;
}

	// .globl	reduce_max_mainf
// reduce_max_mainf
//
// Block-level maximum reduction over an f32 array; the index recorded for
// each candidate is the element's position in the input.
//
//   param_0 : input values (f32)
//   param_1 : output, one maximum per block, written at [%ctaid.x]
//   param_2 : output, position of that maximum, written at [%ctaid.x]
//   param_3 : element count (compared unsigned)
//   param_4 : block size; sets the load stride and gates the reduction steps
//
// Phase 1: each thread scans elements i and i + param_4, starting at
//          i = %ctaid.x * param_4 + %tid.x and advancing by
//          %nctaid.x * param_4. The running maximum starts at 0.0 and is
//          replaced only by strictly larger values.
// Phase 2: shared-memory tree reduction with barriers (offsets 256/128/64).
// Phase 3: threads 0..31 finish with volatile shared accesses and no barriers.
// Phase 4: thread 0 writes the block result.
//
// Changes from the generated code:
//  * %r36 and %r37 (the running index) are initialised to 0. Previously %r37
//    was read on the first loop iteration, and %r36 was stored when the loop
//    was skipped, without either having been written.
//  * The bar.sync at the end of phase 3 was removed. Only threads 0..31 could
//    reach it, and thread 0 afterwards reads slot 0, which it alone writes in
//    phase 3.
//
// NOTE(review): sdata/idxData are 1024 bytes (256 four-byte slots) each, while
// the ">= 512" step reads 1024 bytes past a thread's own slot. Presumably the
// host never launches with param_4 >= 512 -- confirm.
.visible .entry reduce_max_mainf(
	.param .u64 reduce_max_mainf_param_0,
	.param .u64 reduce_max_mainf_param_1,
	.param .u64 reduce_max_mainf_param_2,
	.param .u32 reduce_max_mainf_param_3,
	.param .u32 reduce_max_mainf_param_4
)
{
	.reg .pred 	%p<29>;
	.reg .f32 	%f<75>;
	.reg .b32 	%r<39>;
	.reg .b64 	%rd<20>;
	// demoted variable
	.shared .align 4 .b8 reduce_max_mainf$__cuda_local_var_45285_32_non_const_sdata[1024];
	// demoted variable
	.shared .align 4 .b8 reduce_max_mainf$__cuda_local_var_45286_30_non_const_idxData[1024];

	ld.param.u64 	%rd3, [reduce_max_mainf_param_0];
	ld.param.u64 	%rd4, [reduce_max_mainf_param_1];
	ld.param.u64 	%rd5, [reduce_max_mainf_param_2];
	ld.param.u32 	%r10, [reduce_max_mainf_param_3];
	ld.param.u32 	%r11, [reduce_max_mainf_param_4];
	// %r33 = first element for this thread.
	mov.u32 	%r13, %tid.x;
	mov.u32 	%r14, %ctaid.x;
	mad.lo.s32 	%r33, %r14, %r11, %r13;
	// Running maximum starts at 0.0; the running index (%r37 inside the loop,
	// %r36 after it) starts at 0 so every path reads a defined value.
	mov.f32 	%f72, 0f00000000;
	mov.f32 	%f73, %f72;
	mov.u32 	%r36, 0;
	mov.u32 	%r37, %r36;
	setp.ge.u32	%p1, %r33, %r10;
	@%p1 bra 	BB17_4;

BB17_1:
	// Phase 1, first element of the pair: keep values[%r33] and its position
	// when it is strictly greater than the running maximum.
	mov.f32 	%f1, %f73;
	mov.u32 	%r2, %r37;
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r33, 4;
	add.s64 	%rd8, %rd6, %rd7;
	ld.global.f32 	%f27, [%rd8];
	setp.lt.f32	%p2, %f1, %f27;
	selp.f32	%f74, %f27, %f1, %p2;
	selp.b32	%r38, %r33, %r2, %p2;
	// Second element of the pair: values[%r33 + block size], if in range.
	add.s32 	%r5, %r33, %r11;
	setp.ge.u32	%p3, %r5, %r10;
	@%p3 bra 	BB17_3;

	mul.wide.u32 	%rd10, %r5, 4;
	add.s64 	%rd11, %rd6, %rd10;
	ld.global.f32 	%f28, [%rd11];
	setp.lt.f32	%p4, %f74, %f28;
	selp.f32	%f74, %f28, %f74, %p4;
	selp.b32	%r38, %r5, %r38, %p4;

BB17_3:
	// Advance by %nctaid.x * block size and loop while in range.
	mov.f32 	%f73, %f74;
	mov.u32 	%r37, %r38;
	mov.u32 	%r15, %nctaid.x;
	mad.lo.s32 	%r33, %r15, %r11, %r33;
	setp.lt.u32	%p5, %r33, %r10;
	mov.u32 	%r36, %r37;
	mov.f32 	%f72, %f73;
	@%p5 bra 	BB17_1;

BB17_4:
	// Publish this thread's candidate: sdata[tid] = max, idxData[tid] = index.
	mov.f32 	%f70, %f72;
	mul.wide.u32 	%rd12, %r13, 4;
	mov.u64 	%rd13, reduce_max_mainf$__cuda_local_var_45285_32_non_const_sdata;
	add.s64 	%rd1, %rd13, %rd12;
	st.shared.f32 	[%rd1], %f70;
	mov.u64 	%rd14, reduce_max_mainf$__cuda_local_var_45286_30_non_const_idxData;
	add.s64 	%rd2, %rd14, %rd12;
	st.shared.u32 	[%rd2], %r36;
	bar.sync 	0;
	// Phase 2, step for block size >= 512: threads 0..255 fold in slot tid+256.
	setp.lt.s32	%p6, %r11, 512;
	@%p6 bra 	BB17_9;

	setp.gt.u32	%p7, %r13, 255;
	mov.f32 	%f71, %f70;
	@%p7 bra 	BB17_8;

	ld.shared.f32 	%f6, [%rd1+1024];
	setp.geu.f32	%p8, %f70, %f6;
	mov.f32 	%f41, %f70;
	mov.f32 	%f71, %f41;
	@%p8 bra 	BB17_8;

	st.shared.f32 	[%rd1], %f6;
	ld.shared.u32 	%r18, [%rd2+1024];
	st.shared.u32 	[%rd2], %r18;
	mov.f32 	%f71, %f6;

BB17_8:
	mov.f32 	%f70, %f71;
	bar.sync 	0;

BB17_9:
	// Step for block size >= 256: threads 0..127 fold in slot tid+128.
	mov.f32 	%f68, %f70;
	setp.lt.s32	%p9, %r11, 256;
	@%p9 bra 	BB17_14;

	setp.gt.u32	%p10, %r13, 127;
	mov.f32 	%f69, %f68;
	@%p10 bra 	BB17_13;

	ld.shared.f32 	%f9, [%rd1+512];
	setp.geu.f32	%p11, %f68, %f9;
	mov.f32 	%f45, %f68;
	mov.f32 	%f69, %f45;
	@%p11 bra 	BB17_13;

	st.shared.f32 	[%rd1], %f9;
	ld.shared.u32 	%r20, [%rd2+512];
	st.shared.u32 	[%rd2], %r20;
	mov.f32 	%f69, %f9;

BB17_13:
	mov.f32 	%f68, %f69;
	bar.sync 	0;

BB17_14:
	// Step for block size >= 128: threads 0..63 fold in slot tid+64.
	mov.f32 	%f66, %f68;
	setp.lt.s32	%p12, %r11, 128;
	@%p12 bra 	BB17_19;

	setp.gt.u32	%p13, %r13, 63;
	mov.f32 	%f67, %f66;
	@%p13 bra 	BB17_18;

	ld.shared.f32 	%f12, [%rd1+256];
	setp.geu.f32	%p14, %f66, %f12;
	mov.f32 	%f49, %f66;
	mov.f32 	%f67, %f49;
	@%p14 bra 	BB17_18;

	st.shared.f32 	[%rd1], %f12;
	ld.shared.u32 	%r22, [%rd2+256];
	st.shared.u32 	[%rd2], %r22;
	mov.f32 	%f67, %f12;

BB17_18:
	mov.f32 	%f66, %f67;
	bar.sync 	0;

BB17_19:
	// Phase 3: only threads 0..31 continue; offsets 32,16,8,4,2,1 slots are
	// folded in using volatile shared accesses and no barriers.
	mov.f32 	%f65, %f66;
	setp.gt.u32	%p15, %r13, 31;
	@%p15 bra 	BB17_39;

	setp.lt.s32	%p16, %r11, 64;
	@%p16 bra 	BB17_23;

	ld.volatile.shared.f32 	%f29, [%rd1+128];
	setp.geu.f32	%p17, %f65, %f29;
	@%p17 bra 	BB17_23;

	ld.volatile.shared.f32 	%f65, [%rd1+128];
	st.volatile.shared.f32 	[%rd1], %f65;
	ld.volatile.shared.u32 	%r24, [%rd2+128];
	st.volatile.shared.u32 	[%rd2], %r24;

BB17_23:
	mov.f32 	%f64, %f65;
	setp.lt.s32	%p18, %r11, 32;
	@%p18 bra 	BB17_26;

	ld.volatile.shared.f32 	%f30, [%rd1+64];
	setp.geu.f32	%p19, %f64, %f30;
	@%p19 bra 	BB17_26;

	ld.volatile.shared.f32 	%f64, [%rd1+64];
	st.volatile.shared.f32 	[%rd1], %f64;
	ld.volatile.shared.u32 	%r25, [%rd2+64];
	st.volatile.shared.u32 	[%rd2], %r25;

BB17_26:
	mov.f32 	%f63, %f64;
	setp.lt.s32	%p20, %r11, 16;
	@%p20 bra 	BB17_29;

	ld.volatile.shared.f32 	%f31, [%rd1+32];
	setp.geu.f32	%p21, %f63, %f31;
	@%p21 bra 	BB17_29;

	ld.volatile.shared.f32 	%f63, [%rd1+32];
	st.volatile.shared.f32 	[%rd1], %f63;
	ld.volatile.shared.u32 	%r26, [%rd2+32];
	st.volatile.shared.u32 	[%rd2], %r26;

BB17_29:
	mov.f32 	%f62, %f63;
	setp.lt.s32	%p22, %r11, 8;
	@%p22 bra 	BB17_32;

	ld.volatile.shared.f32 	%f32, [%rd1+16];
	setp.geu.f32	%p23, %f62, %f32;
	@%p23 bra 	BB17_32;

	ld.volatile.shared.f32 	%f62, [%rd1+16];
	st.volatile.shared.f32 	[%rd1], %f62;
	ld.volatile.shared.u32 	%r27, [%rd2+16];
	st.volatile.shared.u32 	[%rd2], %r27;

BB17_32:
	mov.f32 	%f61, %f62;
	setp.lt.s32	%p24, %r11, 4;
	@%p24 bra 	BB17_35;

	ld.volatile.shared.f32 	%f33, [%rd1+8];
	setp.geu.f32	%p25, %f61, %f33;
	@%p25 bra 	BB17_35;

	ld.volatile.shared.f32 	%f61, [%rd1+8];
	st.volatile.shared.f32 	[%rd1], %f61;
	ld.volatile.shared.u32 	%r28, [%rd2+8];
	st.volatile.shared.u32 	[%rd2], %r28;

BB17_35:
	setp.lt.s32	%p26, %r11, 2;
	@%p26 bra 	BB17_39;

	ld.volatile.shared.f32 	%f34, [%rd1+4];
	setp.geu.f32	%p27, %f61, %f34;
	@%p27 bra 	BB17_39;

	ld.volatile.shared.f32 	%f35, [%rd1+4];
	st.volatile.shared.f32 	[%rd1], %f35;
	ld.volatile.shared.u32 	%r29, [%rd2+4];
	st.volatile.shared.u32 	[%rd2], %r29;

BB17_39:
	// Phase 4: thread 0 writes slot 0 of both shared arrays to the outputs.
	// No barrier is needed: during phase 3 slot 0 is written by thread 0 only.
	setp.ne.s32	%p28, %r13, 0;
	@%p28 bra 	BB17_41;

	ld.shared.f32 	%f36, [reduce_max_mainf$__cuda_local_var_45285_32_non_const_sdata];
	cvta.to.global.u64 	%rd15, %rd4;
	mul.wide.u32 	%rd16, %r14, 4;
	add.s64 	%rd17, %rd15, %rd16;
	st.global.f32 	[%rd17], %f36;
	ld.shared.u32 	%r32, [reduce_max_mainf$__cuda_local_var_45286_30_non_const_idxData];
	cvta.to.global.u64 	%rd18, %rd5;
	add.s64 	%rd19, %rd18, %rd16;
	st.global.u32 	[%rd19], %r32;

BB17_41:
	ret;
}

	// .globl	reduce_max_filter_finalf
.visible .entry reduce_max_filter_finalf(
	.param .u64 reduce_max_filter_finalf_param_0,
	.param .u64 reduce_max_filter_finalf_param_1,
	.param .u64 reduce_max_filter_finalf_param_2,
	.param .u32 reduce_max_filter_finalf_param_3,
	.param .u32 reduce_max_filter_finalf_param_4,
	.param .u32 reduce_max_filter_finalf_param_5,
	.param .u64 reduce_max_filter_finalf_param_6,
	.param .u32 reduce_max_filter_finalf_param_7
)
{
	.reg .pred 	%p<75>;
	.reg .f32 	%f<90>;
	.reg .b32 	%r<157>;
	.reg .b64 	%rd<118>;
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow[40];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol[40];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_finalf$__cuda_local_var_45427_30_non_const_smaxesVal[40];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_finalf$__cuda_local_var_45428_32_non_const_sdata[1024];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_finalf$__cuda_local_var_45429_30_non_const_idxData[1024];

	ld.param.u64 	%rd16, [reduce_max_filter_finalf_param_0];
	ld.param.u64 	%rd17, [reduce_max_filter_finalf_param_1];
	ld.param.u64 	%rd18, [reduce_max_filter_finalf_param_2];
	ld.param.u32 	%r59, [reduce_max_filter_finalf_param_3];
	ld.param.u32 	%r60, [reduce_max_filter_finalf_param_4];
	ld.param.u32 	%r61, [reduce_max_filter_finalf_param_5];
	ld.param.u64 	%rd19, [reduce_max_filter_finalf_param_6];
	ld.param.u32 	%r62, [reduce_max_filter_finalf_param_7];
	mov.u32 	%r63, %tid.x;
	setp.ge.u32	%p1, %r63, %r62;
	@%p1 bra 	BB18_2;

	cvta.to.global.u64 	%rd20, %rd19;
	mul.wide.u32 	%rd21, %r63, 4;
	add.s64 	%rd22, %rd20, %rd21;
	ld.global.u32 	%r65, [%rd22];
	mov.u64 	%rd23, reduce_max_filter_finalf$__cuda_local_var_45427_30_non_const_smaxesVal;
	add.s64 	%rd24, %rd23, %rd21;
	st.shared.u32 	[%rd24], %r65;
	div.u32 	%r66, %r65, %r60;
	mov.u64 	%rd25, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd26, %rd25, %rd21;
	st.shared.u32 	[%rd26], %r66;
	rem.u32 	%r67, %r65, %r60;
	mov.u64 	%rd27, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd28, %rd27, %rd21;
	st.shared.u32 	[%rd28], %r67;

BB18_2:
	bar.sync 	0;
	shl.b32 	%r69, %r61, 1;
	mov.u32 	%r70, %ctaid.x;
	mad.lo.s32 	%r132, %r70, %r69, %r63;
	mov.f32 	%f86, 0f00000000;
	setp.ge.u32	%p2, %r132, %r59;
	@%p2 bra 	BB18_16;

	mov.f32 	%f89, 0f00000000;

BB18_4:
	mov.f32 	%f71, %f89;
	mov.f32 	%f1, %f71;
	mov.u32 	%r140, %r147;
	mov.u32 	%r2, %r140;
	cvta.to.global.u64 	%rd29, %rd16;
	cvt.u64.u32	%rd1, %r132;
	mul.wide.u32 	%rd30, %r132, 4;
	add.s64 	%rd31, %rd29, %rd30;
	ld.global.f32 	%f2, [%rd31];
	setp.geu.f32	%p3, %f1, %f2;
	mov.u32 	%r145, %r2;
	mov.f32 	%f87, %f1;
	@%p3 bra 	BB18_9;

	cvta.to.global.u64 	%rd32, %rd18;
	shl.b64 	%rd33, %rd1, 2;
	add.s64 	%rd34, %rd32, %rd33;
	ld.global.u32 	%r4, [%rd34];
	div.s32 	%r5, %r4, %r60;
	rem.s32 	%r6, %r4, %r60;
	mov.u32 	%r133, 0;
	setp.lt.s32	%p4, %r62, 1;
	mov.u32 	%r145, %r4;
	mov.f32 	%f87, %f2;
	@%p4 bra 	BB18_9;

BB18_6:
	cvt.s64.s32	%rd2, %r133;
	mul.wide.s32 	%rd35, %r133, 4;
	mov.u64 	%rd36, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd37, %rd36, %rd35;
	ld.shared.u32 	%r78, [%rd37];
	setp.ne.s32	%p5, %r78, %r5;
	@%p5 bra 	BB18_8;

	shl.b64 	%rd38, %rd2, 2;
	mov.u64 	%rd39, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd40, %rd39, %rd38;
	ld.shared.u32 	%r79, [%rd40];
	setp.eq.s32	%p6, %r79, %r6;
	mov.u32 	%r141, %r2;
	mov.u32 	%r145, %r141;
	mov.f32 	%f72, %f1;
	mov.f32 	%f87, %f72;
	@%p6 bra 	BB18_9;

BB18_8:
	add.s32 	%r133, %r133, 1;
	setp.lt.s32	%p7, %r133, %r62;
	mov.u32 	%r135, %r4;
	mov.u32 	%r145, %r135;
	mov.f32 	%f36, %f2;
	mov.f32 	%f87, %f36;
	@%p7 bra 	BB18_6;

BB18_9:
	mov.f32 	%f3, %f87;
	mov.u32 	%r9, %r145;
	add.s32 	%r10, %r132, %r61;
	setp.ge.u32	%p8, %r10, %r59;
	mov.u32 	%r146, %r9;
	mov.f32 	%f88, %f3;
	@%p8 bra 	BB18_15;

	cvt.u64.u32	%rd3, %r10;
	mul.wide.u32 	%rd42, %r10, 4;
	add.s64 	%rd43, %rd29, %rd42;
	ld.global.f32 	%f4, [%rd43];
	setp.geu.f32	%p9, %f3, %f4;
	mov.u32 	%r138, %r9;
	mov.u32 	%r146, %r138;
	mov.f32 	%f69, %f3;
	mov.f32 	%f88, %f69;
	@%p9 bra 	BB18_15;

	cvta.to.global.u64 	%rd44, %rd18;
	shl.b64 	%rd45, %rd3, 2;
	add.s64 	%rd46, %rd44, %rd45;
	ld.global.u32 	%r11, [%rd46];
	div.s32 	%r12, %r11, %r60;
	rem.s32 	%r13, %r11, %r60;
	mov.u32 	%r134, 0;
	setp.lt.s32	%p10, %r62, 1;
	mov.u32 	%r146, %r11;
	mov.f32 	%f88, %f4;
	@%p10 bra 	BB18_15;

BB18_12:
	cvt.s64.s32	%rd4, %r134;
	mul.wide.s32 	%rd47, %r134, 4;
	mov.u64 	%rd48, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd49, %rd48, %rd47;
	ld.shared.u32 	%r81, [%rd49];
	setp.ne.s32	%p11, %r81, %r12;
	@%p11 bra 	BB18_14;

	shl.b64 	%rd50, %rd4, 2;
	mov.u64 	%rd51, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd52, %rd51, %rd50;
	ld.shared.u32 	%r82, [%rd52];
	setp.eq.s32	%p12, %r82, %r13;
	mov.u32 	%r139, %r9;
	mov.u32 	%r146, %r139;
	mov.f32 	%f70, %f3;
	mov.f32 	%f88, %f70;
	@%p12 bra 	BB18_15;

BB18_14:
	add.s32 	%r134, %r134, 1;
	setp.lt.s32	%p13, %r134, %r62;
	mov.u32 	%r136, %r11;
	mov.u32 	%r146, %r136;
	mov.f32 	%f37, %f4;
	mov.f32 	%f88, %f37;
	@%p13 bra 	BB18_12;

BB18_15:
	mov.f32 	%f89, %f88;
	mov.u32 	%r147, %r146;
	mov.u32 	%r84, %nctaid.x;
	mad.lo.s32 	%r132, %r84, %r69, %r132;
	setp.lt.u32	%p14, %r132, %r59;
	mov.u32 	%r144, %r147;
	mov.f32 	%f86, %f89;
	@%p14 bra 	BB18_4;

BB18_16:
	mov.f32 	%f84, %f86;
	mul.wide.u32 	%rd53, %r63, 4;
	mov.u64 	%rd54, reduce_max_filter_finalf$__cuda_local_var_45428_32_non_const_sdata;
	add.s64 	%rd5, %rd54, %rd53;
	st.shared.f32 	[%rd5], %f84;
	mov.u64 	%rd55, reduce_max_filter_finalf$__cuda_local_var_45429_30_non_const_idxData;
	add.s64 	%rd6, %rd55, %rd53;
	st.shared.u32 	[%rd6], %r144;
	bar.sync 	0;
	setp.lt.s32	%p15, %r61, 512;
	@%p15 bra 	BB18_25;

	setp.gt.u32	%p16, %r63, 255;
	mov.f32 	%f85, %f84;
	@%p16 bra 	BB18_24;

	ld.shared.f32 	%f7, [%rd5+1024];
	setp.geu.f32	%p17, %f84, %f7;
	mov.f32 	%f66, %f84;
	mov.f32 	%f85, %f66;
	@%p17 bra 	BB18_24;

	ld.shared.u32 	%r19, [%rd6+1024];
	div.s32 	%r20, %r19, %r60;
	rem.s32 	%r21, %r19, %r60;
	mov.u32 	%r148, 0;
	setp.lt.s32	%p18, %r62, 1;
	@%p18 bra 	BB18_23;

BB18_20:
	cvt.s64.s32	%rd7, %r148;
	mul.wide.s32 	%rd56, %r148, 4;
	mov.u64 	%rd57, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd58, %rd57, %rd56;
	ld.shared.u32 	%r88, [%rd58];
	setp.ne.s32	%p19, %r88, %r20;
	@%p19 bra 	BB18_22;

	shl.b64 	%rd59, %rd7, 2;
	mov.u64 	%rd60, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd61, %rd60, %rd59;
	ld.shared.u32 	%r89, [%rd61];
	setp.eq.s32	%p20, %r89, %r21;
	mov.f32 	%f67, %f84;
	mov.f32 	%f85, %f67;
	@%p20 bra 	BB18_24;

BB18_22:
	add.s32 	%r148, %r148, 1;
	setp.lt.s32	%p21, %r148, %r62;
	@%p21 bra 	BB18_20;

BB18_23:
	st.shared.f32 	[%rd5], %f7;
	st.shared.u32 	[%rd6], %r19;
	mov.f32 	%f85, %f7;

BB18_24:
	mov.f32 	%f84, %f85;
	bar.sync 	0;

BB18_25:
	mov.f32 	%f82, %f84;
	setp.lt.s32	%p22, %r61, 256;
	@%p22 bra 	BB18_34;

	setp.gt.u32	%p23, %r63, 127;
	mov.f32 	%f83, %f82;
	@%p23 bra 	BB18_33;

	ld.shared.f32 	%f10, [%rd5+512];
	setp.geu.f32	%p24, %f82, %f10;
	mov.f32 	%f61, %f82;
	mov.f32 	%f83, %f61;
	@%p24 bra 	BB18_33;

	ld.shared.u32 	%r24, [%rd6+512];
	div.s32 	%r25, %r24, %r60;
	rem.s32 	%r26, %r24, %r60;
	mov.u32 	%r149, 0;
	setp.lt.s32	%p25, %r62, 1;
	@%p25 bra 	BB18_32;

BB18_29:
	cvt.s64.s32	%rd8, %r149;
	mul.wide.s32 	%rd62, %r149, 4;
	mov.u64 	%rd63, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd64, %rd63, %rd62;
	ld.shared.u32 	%r92, [%rd64];
	setp.ne.s32	%p26, %r92, %r25;
	@%p26 bra 	BB18_31;

	shl.b64 	%rd65, %rd8, 2;
	mov.u64 	%rd66, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd67, %rd66, %rd65;
	ld.shared.u32 	%r93, [%rd67];
	setp.eq.s32	%p27, %r93, %r26;
	mov.f32 	%f62, %f82;
	mov.f32 	%f83, %f62;
	@%p27 bra 	BB18_33;

BB18_31:
	add.s32 	%r149, %r149, 1;
	setp.lt.s32	%p28, %r149, %r62;
	@%p28 bra 	BB18_29;

BB18_32:
	st.shared.f32 	[%rd5], %f10;
	st.shared.u32 	[%rd6], %r24;
	mov.f32 	%f83, %f10;

BB18_33:
	mov.f32 	%f82, %f83;
	bar.sync 	0;

BB18_34:
	mov.f32 	%f80, %f82;
	setp.lt.s32	%p29, %r61, 128;
	@%p29 bra 	BB18_43;

	setp.gt.u32	%p30, %r63, 63;
	mov.f32 	%f81, %f80;
	@%p30 bra 	BB18_42;

	ld.shared.f32 	%f13, [%rd5+256];
	setp.geu.f32	%p31, %f80, %f13;
	mov.f32 	%f56, %f80;
	mov.f32 	%f81, %f56;
	@%p31 bra 	BB18_42;

	ld.shared.u32 	%r29, [%rd6+256];
	div.s32 	%r30, %r29, %r60;
	rem.s32 	%r31, %r29, %r60;
	mov.u32 	%r150, 0;
	setp.lt.s32	%p32, %r62, 1;
	@%p32 bra 	BB18_41;

BB18_38:
	cvt.s64.s32	%rd9, %r150;
	mul.wide.s32 	%rd68, %r150, 4;
	mov.u64 	%rd69, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd70, %rd69, %rd68;
	ld.shared.u32 	%r96, [%rd70];
	setp.ne.s32	%p33, %r96, %r30;
	@%p33 bra 	BB18_40;

	shl.b64 	%rd71, %rd9, 2;
	mov.u64 	%rd72, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd73, %rd72, %rd71;
	ld.shared.u32 	%r97, [%rd73];
	setp.eq.s32	%p34, %r97, %r31;
	mov.f32 	%f57, %f80;
	mov.f32 	%f81, %f57;
	@%p34 bra 	BB18_42;

BB18_40:
	add.s32 	%r150, %r150, 1;
	setp.lt.s32	%p35, %r150, %r62;
	@%p35 bra 	BB18_38;

BB18_41:
	st.shared.f32 	[%rd5], %f13;
	st.shared.u32 	[%rd6], %r29;
	mov.f32 	%f81, %f13;

BB18_42:
	mov.f32 	%f80, %f81;
	bar.sync 	0;

BB18_43:
	mov.f32 	%f15, %f80;
	setp.gt.u32	%p36, %r63, 31;
	@%p36 bra 	BB18_87;

	setp.lt.s32	%p37, %r61, 64;
	mov.f32 	%f79, %f15;
	@%p37 bra 	BB18_51;

	ld.volatile.shared.f32 	%f28, [%rd5+128];
	setp.geu.f32	%p38, %f15, %f28;
	mov.f32 	%f51, %f15;
	mov.f32 	%f79, %f51;
	@%p38 bra 	BB18_51;

	ld.volatile.shared.u32 	%r100, [%rd6+128];
	div.s32 	%r34, %r100, %r60;
	rem.s32 	%r35, %r100, %r60;
	mov.u32 	%r151, 0;
	setp.lt.s32	%p39, %r62, 1;
	@%p39 bra 	BB18_50;

BB18_47:
	cvt.s64.s32	%rd10, %r151;
	mul.wide.s32 	%rd74, %r151, 4;
	mov.u64 	%rd75, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd76, %rd75, %rd74;
	ld.volatile.shared.u32 	%r101, [%rd76];
	setp.ne.s32	%p40, %r101, %r34;
	@%p40 bra 	BB18_49;

	shl.b64 	%rd77, %rd10, 2;
	mov.u64 	%rd78, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd79, %rd78, %rd77;
	ld.volatile.shared.u32 	%r102, [%rd79];
	setp.eq.s32	%p41, %r102, %r35;
	mov.f32 	%f52, %f15;
	mov.f32 	%f79, %f52;
	@%p41 bra 	BB18_51;

BB18_49:
	add.s32 	%r151, %r151, 1;
	setp.lt.s32	%p42, %r151, %r62;
	@%p42 bra 	BB18_47;

BB18_50:
	ld.volatile.shared.f32 	%f79, [%rd5+128];
	st.volatile.shared.f32 	[%rd5], %f79;
	ld.volatile.shared.u32 	%r103, [%rd6+128];
	st.volatile.shared.u32 	[%rd6], %r103;

BB18_51:
	mov.f32 	%f17, %f79;
	setp.lt.s32	%p43, %r61, 32;
	mov.f32 	%f78, %f17;
	@%p43 bra 	BB18_58;

	ld.volatile.shared.f32 	%f29, [%rd5+64];
	setp.geu.f32	%p44, %f17, %f29;
	mov.f32 	%f48, %f17;
	mov.f32 	%f78, %f48;
	@%p44 bra 	BB18_58;

	ld.volatile.shared.u32 	%r105, [%rd6+64];
	div.s32 	%r38, %r105, %r60;
	rem.s32 	%r39, %r105, %r60;
	mov.u32 	%r152, 0;
	setp.lt.s32	%p45, %r62, 1;
	@%p45 bra 	BB18_57;

BB18_54:
	cvt.s64.s32	%rd11, %r152;
	mul.wide.s32 	%rd80, %r152, 4;
	mov.u64 	%rd81, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd82, %rd81, %rd80;
	ld.volatile.shared.u32 	%r106, [%rd82];
	setp.ne.s32	%p46, %r106, %r38;
	@%p46 bra 	BB18_56;

	shl.b64 	%rd83, %rd11, 2;
	mov.u64 	%rd84, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd85, %rd84, %rd83;
	ld.volatile.shared.u32 	%r107, [%rd85];
	setp.eq.s32	%p47, %r107, %r39;
	mov.f32 	%f49, %f17;
	mov.f32 	%f78, %f49;
	@%p47 bra 	BB18_58;

BB18_56:
	add.s32 	%r152, %r152, 1;
	setp.lt.s32	%p48, %r152, %r62;
	@%p48 bra 	BB18_54;

BB18_57:
	ld.volatile.shared.f32 	%f78, [%rd5+64];
	st.volatile.shared.f32 	[%rd5], %f78;
	ld.volatile.shared.u32 	%r108, [%rd6+64];
	st.volatile.shared.u32 	[%rd6], %r108;

BB18_58:
	mov.f32 	%f19, %f78;
	setp.lt.s32	%p49, %r61, 16;
	mov.f32 	%f77, %f19;
	@%p49 bra 	BB18_65;

	ld.volatile.shared.f32 	%f30, [%rd5+32];
	setp.geu.f32	%p50, %f19, %f30;
	mov.f32 	%f45, %f19;
	mov.f32 	%f77, %f45;
	@%p50 bra 	BB18_65;

	ld.volatile.shared.u32 	%r110, [%rd6+32];
	div.s32 	%r42, %r110, %r60;
	rem.s32 	%r43, %r110, %r60;
	mov.u32 	%r153, 0;
	setp.lt.s32	%p51, %r62, 1;
	@%p51 bra 	BB18_64;

BB18_61:
	cvt.s64.s32	%rd12, %r153;
	mul.wide.s32 	%rd86, %r153, 4;
	mov.u64 	%rd87, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd88, %rd87, %rd86;
	ld.volatile.shared.u32 	%r111, [%rd88];
	setp.ne.s32	%p52, %r111, %r42;
	@%p52 bra 	BB18_63;

	shl.b64 	%rd89, %rd12, 2;
	mov.u64 	%rd90, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd91, %rd90, %rd89;
	ld.volatile.shared.u32 	%r112, [%rd91];
	setp.eq.s32	%p53, %r112, %r43;
	mov.f32 	%f46, %f19;
	mov.f32 	%f77, %f46;
	@%p53 bra 	BB18_65;

BB18_63:
	add.s32 	%r153, %r153, 1;
	setp.lt.s32	%p54, %r153, %r62;
	@%p54 bra 	BB18_61;

BB18_64:
	ld.volatile.shared.f32 	%f77, [%rd5+32];
	st.volatile.shared.f32 	[%rd5], %f77;
	ld.volatile.shared.u32 	%r113, [%rd6+32];
	st.volatile.shared.u32 	[%rd6], %r113;

BB18_65:
	mov.f32 	%f21, %f77;
	setp.lt.s32	%p55, %r61, 8;
	mov.f32 	%f76, %f21;
	@%p55 bra 	BB18_72;

	ld.volatile.shared.f32 	%f31, [%rd5+16];
	setp.geu.f32	%p56, %f21, %f31;
	mov.f32 	%f42, %f21;
	mov.f32 	%f76, %f42;
	@%p56 bra 	BB18_72;

	ld.volatile.shared.u32 	%r115, [%rd6+16];
	div.s32 	%r46, %r115, %r60;
	rem.s32 	%r47, %r115, %r60;
	mov.u32 	%r154, 0;
	setp.lt.s32	%p57, %r62, 1;
	@%p57 bra 	BB18_71;

BB18_68:
	cvt.s64.s32	%rd13, %r154;
	mul.wide.s32 	%rd92, %r154, 4;
	mov.u64 	%rd93, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd94, %rd93, %rd92;
	ld.volatile.shared.u32 	%r116, [%rd94];
	setp.ne.s32	%p58, %r116, %r46;
	@%p58 bra 	BB18_70;

	shl.b64 	%rd95, %rd13, 2;
	mov.u64 	%rd96, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd97, %rd96, %rd95;
	ld.volatile.shared.u32 	%r117, [%rd97];
	setp.eq.s32	%p59, %r117, %r47;
	mov.f32 	%f43, %f21;
	mov.f32 	%f76, %f43;
	@%p59 bra 	BB18_72;

BB18_70:
	add.s32 	%r154, %r154, 1;
	setp.lt.s32	%p60, %r154, %r62;
	@%p60 bra 	BB18_68;

BB18_71:
	ld.volatile.shared.f32 	%f76, [%rd5+16];
	st.volatile.shared.f32 	[%rd5], %f76;
	ld.volatile.shared.u32 	%r118, [%rd6+16];
	st.volatile.shared.u32 	[%rd6], %r118;

BB18_72:
	mov.f32 	%f23, %f76;
	setp.lt.s32	%p61, %r61, 4;
	mov.f32 	%f75, %f23;
	@%p61 bra 	BB18_79;

	ld.volatile.shared.f32 	%f32, [%rd5+8];
	setp.geu.f32	%p62, %f23, %f32;
	mov.f32 	%f39, %f23;
	mov.f32 	%f75, %f39;
	@%p62 bra 	BB18_79;

	ld.volatile.shared.u32 	%r120, [%rd6+8];
	div.s32 	%r50, %r120, %r60;
	rem.s32 	%r51, %r120, %r60;
	mov.u32 	%r155, 0;
	setp.lt.s32	%p63, %r62, 1;
	@%p63 bra 	BB18_78;

BB18_75:
	cvt.s64.s32	%rd14, %r155;
	mul.wide.s32 	%rd98, %r155, 4;
	mov.u64 	%rd99, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd100, %rd99, %rd98;
	ld.volatile.shared.u32 	%r121, [%rd100];
	setp.ne.s32	%p64, %r121, %r50;
	@%p64 bra 	BB18_77;

	shl.b64 	%rd101, %rd14, 2;
	mov.u64 	%rd102, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd103, %rd102, %rd101;
	ld.volatile.shared.u32 	%r122, [%rd103];
	setp.eq.s32	%p65, %r122, %r51;
	mov.f32 	%f40, %f23;
	mov.f32 	%f75, %f40;
	@%p65 bra 	BB18_79;

BB18_77:
	add.s32 	%r155, %r155, 1;
	setp.lt.s32	%p66, %r155, %r62;
	@%p66 bra 	BB18_75;

BB18_78:
	ld.volatile.shared.f32 	%f75, [%rd5+8];
	st.volatile.shared.f32 	[%rd5], %f75;
	ld.volatile.shared.u32 	%r123, [%rd6+8];
	st.volatile.shared.u32 	[%rd6], %r123;

BB18_79:
	setp.lt.s32	%p67, %r61, 2;
	@%p67 bra 	BB18_86;

	ld.volatile.shared.f32 	%f33, [%rd5+4];
	setp.geu.f32	%p68, %f75, %f33;
	@%p68 bra 	BB18_86;

	ld.volatile.shared.u32 	%r125, [%rd6+4];
	div.s32 	%r54, %r125, %r60;
	rem.s32 	%r55, %r125, %r60;
	mov.u32 	%r156, 0;
	setp.lt.s32	%p69, %r62, 1;
	@%p69 bra 	BB18_85;

BB18_82:
	cvt.s64.s32	%rd15, %r156;
	mul.wide.s32 	%rd104, %r156, 4;
	mov.u64 	%rd105, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow;
	add.s64 	%rd106, %rd105, %rd104;
	ld.volatile.shared.u32 	%r126, [%rd106];
	setp.ne.s32	%p70, %r126, %r54;
	@%p70 bra 	BB18_84;

	shl.b64 	%rd107, %rd15, 2;
	mov.u64 	%rd108, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol;
	add.s64 	%rd109, %rd108, %rd107;
	ld.volatile.shared.u32 	%r127, [%rd109];
	setp.eq.s32	%p71, %r127, %r55;
	@%p71 bra 	BB18_86;

BB18_84:
	add.s32 	%r156, %r156, 1;
	setp.lt.s32	%p72, %r156, %r62;
	@%p72 bra 	BB18_82;

BB18_85:
	ld.volatile.shared.f32 	%f34, [%rd5+4];
	st.volatile.shared.f32 	[%rd5], %f34;
	ld.volatile.shared.u32 	%r128, [%rd6+4];
	st.volatile.shared.u32 	[%rd6], %r128;

BB18_86:
	bar.sync 	0;

BB18_87:
	setp.ne.s32	%p73, %r63, 0;
	@%p73 bra 	BB18_90;

	ld.shared.f32 	%f35, [reduce_max_filter_finalf$__cuda_local_var_45428_32_non_const_sdata];
	cvta.to.global.u64 	%rd110, %rd17;
	mul.wide.u32 	%rd111, %r70, 4;
	add.s64 	%rd112, %rd110, %rd111;
	st.global.f32 	[%rd112], %f35;
	ld.shared.u32 	%r58, [reduce_max_filter_finalf$__cuda_local_var_45429_30_non_const_idxData];
	cvta.to.global.u64 	%rd113, %rd18;
	add.s64 	%rd114, %rd113, %rd111;
	st.global.u32 	[%rd114], %r58;
	mov.u32 	%r131, %nctaid.x;
	setp.ne.s32	%p74, %r131, 1;
	@%p74 bra 	BB18_90;

	cvta.to.global.u64 	%rd115, %rd19;
	mul.wide.s32 	%rd116, %r62, 4;
	add.s64 	%rd117, %rd115, %rd116;
	st.global.u32 	[%rd117], %r58;

BB18_90:
	ret;
}

	// .globl	reduce_max_filter_mainf
// reduce_max_filter_mainf
//
// Per-block arg-max over an f32 buffer. A candidate element is rejected when the
// pair (index / param_3, index % param_3) equals the (Row, Col) pair of any entry
// of an exclusion list that is cached in shared memory at kernel entry.
//
// Parameters, as they are used below:
//   param_0 (u64)  pointer to the f32 input values (converted with cvta.to.global).
//   param_1 (u64)  f32 output; thread 0 stores the block's maximum at [%ctaid.x].
//   param_2 (u64)  u32 output; thread 0 stores the matching input index at [%ctaid.x].
//   param_3 (u32)  divisor that splits a linear index into quotient ("Row") and
//                  remainder ("Col"). It is used in div/rem, so it must be non-zero.
//   param_4 (u32)  param_3 * param_4 is the number of input elements scanned.
//   param_5 (u32)  stride of the scan; also compared against 512, 256, ... 2 to
//                  decide which reduction stages run. Presumably the block size
//                  (TODO confirm against the launch code).
//   param_6 (u64)  u32 exclusion list of linear indices. When %nctaid.x == 1,
//                  thread 0 also writes the winning index to element [param_7].
//   param_7 (u32)  number of valid entries in the param_6 list.
//
// NOTE(review): smaxesRow/Col/Val are declared with 40 bytes (ten 4-byte slots)
// and are filled by every thread with %tid.x < param_7 without a further bound
// check - confirm that callers keep param_7 <= 10.
// NOTE(review): sdata/idxData are declared with 1024 bytes (256 4-byte slots),
// yet the "param_5 >= 512" stage addresses byte offset +1024 from a thread's own
// slot, i.e. past the end of the declared arrays.
//
// Hand-patched relative to the compiler output: %r137 (running arg-max index) is
// zero-initialised on entry. It was previously read by selp/st.shared without
// ever having been written when a thread accepted no candidate. The same
// initialisation should be applied to the CUDA source before regenerating.
.visible .entry reduce_max_filter_mainf(
	.param .u64 reduce_max_filter_mainf_param_0,
	.param .u64 reduce_max_filter_mainf_param_1,
	.param .u64 reduce_max_filter_mainf_param_2,
	.param .u32 reduce_max_filter_mainf_param_3,
	.param .u32 reduce_max_filter_mainf_param_4,
	.param .u32 reduce_max_filter_mainf_param_5,
	.param .u64 reduce_max_filter_mainf_param_6,
	.param .u32 reduce_max_filter_mainf_param_7
)
{
	.reg .pred 	%p<87>;
	.reg .f32 	%f<88>;
	.reg .b32 	%r<147>;
	.reg .b64 	%rd<110>;
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow[40];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol[40];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_mainf$__cuda_local_var_45638_30_non_const_smaxesVal[40];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_mainf$__cuda_local_var_45639_32_non_const_sdata[1024];
	// demoted variable
	.shared .align 4 .b8 reduce_max_filter_mainf$__cuda_local_var_45640_30_non_const_idxData[1024];

	ld.param.u64 	%rd14, [reduce_max_filter_mainf_param_0];
	ld.param.u64 	%rd15, [reduce_max_filter_mainf_param_1];
	ld.param.u64 	%rd16, [reduce_max_filter_mainf_param_2];
	ld.param.u32 	%r60, [reduce_max_filter_mainf_param_3];
	ld.param.u32 	%r61, [reduce_max_filter_mainf_param_4];
	ld.param.u32 	%r62, [reduce_max_filter_mainf_param_5];
	ld.param.u64 	%rd17, [reduce_max_filter_mainf_param_6];
	ld.param.u32 	%r63, [reduce_max_filter_mainf_param_7];
	// Fix: give the running arg-max index a defined value before its first read.
	mov.u32 	%r137, 0;
	mov.u32 	%r1, %tid.x;
	// Threads with tid < param_7 each cache one exclusion-list entry.
	setp.ge.u32	%p3, %r1, %r63;
	@%p3 bra 	BB19_2;

	// smaxesVal[tid] = list[tid]; smaxesRow[tid] = list[tid] / param_3;
	// smaxesCol[tid] = list[tid] % param_3 (unsigned division here).
	cvta.to.global.u64 	%rd18, %rd17;
	mul.wide.u32 	%rd19, %r1, 4;
	add.s64 	%rd20, %rd18, %rd19;
	ld.global.u32 	%r65, [%rd20];
	mov.u64 	%rd21, reduce_max_filter_mainf$__cuda_local_var_45638_30_non_const_smaxesVal;
	add.s64 	%rd22, %rd21, %rd19;
	st.shared.u32 	[%rd22], %r65;
	div.u32 	%r66, %r65, %r60;
	mov.u64 	%rd23, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd24, %rd23, %rd19;
	st.shared.u32 	[%rd24], %r66;
	rem.u32 	%r67, %r65, %r60;
	mov.u64 	%rd25, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd26, %rd25, %rd19;
	st.shared.u32 	[%rd26], %r67;

BB19_2:
	// %r134 = first element index for this thread: ctaid.x * param_5 + tid.
	mov.u32 	%r68, %ctaid.x;
	mad.lo.s32 	%r134, %r68, %r62, %r1;
	// Make the cached exclusion list visible to the whole block.
	bar.sync 	0;
	// %r70 = param_4 * param_3 = number of input elements.
	mul.lo.s32 	%r70, %r61, %r60;
	// 0fFF800000 is -infinity: the running maximum before any element is accepted.
	mov.f32 	%f84, 0fFF800000;
	setp.ge.u32	%p4, %r134, %r70;
	@%p4 bra 	BB19_18;

	mov.f32 	%f87, 0fFF800000;

BB19_4:
	// ---- Scan loop: each iteration examines in[%r134] and in[%r134 + param_5]. ----
	mov.f32 	%f70, %f87;
	mov.f32 	%f85, %f70;
	cvta.to.global.u64 	%rd27, %rd14;
	mul.wide.u32 	%rd28, %r134, 4;
	add.s64 	%rd29, %rd27, %rd28;
	ld.global.f32 	%f2, [%rd29];
	// Skip unless the element is strictly greater than the running maximum.
	// geu is also true for unordered operands, so a NaN element is skipped too.
	setp.geu.f32	%p5, %f85, %f2;
	@%p5 bra 	BB19_10;

	// Split the candidate index into (%r6, %r7) = (index / param_3, index % param_3),
	// using signed division, and look the pair up in the exclusion list.
	div.s32 	%r6, %r134, %r60;
	rem.s32 	%r7, %r134, %r60;
	mov.pred 	%p6, -1;
	mov.u32 	%r135, 0;
	setp.lt.s32	%p7, %r63, 1;
	// %p84 = "candidate accepted"; stays true when the list is empty.
	mov.pred 	%p84, %p6;
	@%p7 bra 	BB19_9;

BB19_6:
	// Exclusion-list walk, entry %r135: compare Row first, then Col.
	cvt.s64.s32	%rd1, %r135;
	mul.wide.s32 	%rd30, %r135, 4;
	mov.u64 	%rd31, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd32, %rd31, %rd30;
	ld.shared.u32 	%r75, [%rd32];
	setp.ne.s32	%p8, %r75, %r6;
	@%p8 bra 	BB19_8;

	shl.b64 	%rd33, %rd1, 2;
	mov.u64 	%rd34, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd35, %rd34, %rd33;
	ld.shared.u32 	%r76, [%rd35];
	setp.eq.s32	%p10, %r76, %r7;
	// Row and Col both match: the candidate is excluded (%p84 = false).
	mov.pred 	%p9, 0;
	mov.pred 	%p84, %p9;
	@%p10 bra 	BB19_9;

BB19_8:
	add.s32 	%r135, %r135, 1;
	setp.lt.s32	%p12, %r135, %r63;
	mov.pred 	%p83, %p6;
	mov.pred 	%p84, %p83;
	@%p12 bra 	BB19_6;

BB19_9:
	// Accept the candidate (value and index) only if it was not excluded.
	selp.f32	%f85, %f2, %f85, %p84;
	selp.b32	%r137, %r134, %r137, %p84;

BB19_10:
	// Second candidate of this iteration: index %r78 = %r134 + param_5.
	mov.f32 	%f4, %f85;
	add.s32 	%r78, %r134, %r62;
	setp.ge.u32	%p13, %r78, %r70;
	mov.f32 	%f86, %f4;
	@%p13 bra 	BB19_17;

	mul.wide.u32 	%rd37, %r78, 4;
	add.s64 	%rd38, %rd27, %rd37;
	ld.global.f32 	%f5, [%rd38];
	setp.geu.f32	%p14, %f4, %f5;
	mov.f32 	%f69, %f4;
	mov.f32 	%f86, %f69;
	@%p14 bra 	BB19_17;

	div.s32 	%r12, %r78, %r60;
	rem.s32 	%r13, %r78, %r60;
	mov.pred 	%p15, -1;
	mov.u32 	%r136, 0;
	setp.lt.s32	%p16, %r63, 1;
	mov.pred 	%p86, %p15;
	@%p16 bra 	BB19_16;

BB19_13:
	// Same exclusion-list walk as BB19_6, for the second candidate.
	cvt.s64.s32	%rd2, %r136;
	mul.wide.s32 	%rd39, %r136, 4;
	mov.u64 	%rd40, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd41, %rd40, %rd39;
	ld.shared.u32 	%r82, [%rd41];
	setp.ne.s32	%p17, %r82, %r12;
	@%p17 bra 	BB19_15;

	shl.b64 	%rd42, %rd2, 2;
	mov.u64 	%rd43, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd44, %rd43, %rd42;
	ld.shared.u32 	%r83, [%rd44];
	setp.eq.s32	%p19, %r83, %r13;
	mov.pred 	%p18, 0;
	mov.pred 	%p86, %p18;
	@%p19 bra 	BB19_16;

BB19_15:
	add.s32 	%r136, %r136, 1;
	setp.lt.s32	%p21, %r136, %r63;
	mov.pred 	%p85, %p15;
	mov.pred 	%p86, %p85;
	@%p21 bra 	BB19_13;

BB19_16:
	selp.f32	%f86, %f5, %f4, %p86;
	selp.b32	%r137, %r78, %r137, %p86;

BB19_17:
	// Advance by nctaid.x * param_5 and loop while still inside the input.
	mov.f32 	%f87, %f86;
	mov.u32 	%r85, %nctaid.x;
	mad.lo.s32 	%r134, %r85, %r62, %r134;
	setp.lt.u32	%p22, %r134, %r70;
	mov.f32 	%f84, %f87;
	@%p22 bra 	BB19_4;

BB19_18:
	// Publish this thread's result: sdata[tid] = max, idxData[tid] = index.
	// %rd3 / %rd4 address the thread's own slots for the rest of the kernel.
	mov.f32 	%f82, %f84;
	mul.wide.u32 	%rd45, %r1, 4;
	mov.u64 	%rd46, reduce_max_filter_mainf$__cuda_local_var_45639_32_non_const_sdata;
	add.s64 	%rd3, %rd46, %rd45;
	st.shared.f32 	[%rd3], %f82;
	mov.u64 	%rd47, reduce_max_filter_mainf$__cuda_local_var_45640_30_non_const_idxData;
	add.s64 	%rd4, %rd47, %rd45;
	st.shared.u32 	[%rd4], %r137;
	bar.sync 	0;
	// ---- Stage 512 -> 256 (runs only when param_5 >= 512) ----
	// Every stage below has the same shape: if the partner slot holds a strictly
	// larger value whose index is not in the exclusion list, copy the partner's
	// value and index into this thread's own slot.
	setp.lt.s32	%p23, %r62, 512;
	@%p23 bra 	BB19_27;

	setp.gt.u32	%p24, %r1, 255;
	mov.f32 	%f83, %f82;
	@%p24 bra 	BB19_26;

	// Partner slot is tid + 256 (byte offset 1024); see the header note on array size.
	ld.shared.f32 	%f9, [%rd3+1024];
	setp.geu.f32	%p25, %f82, %f9;
	mov.f32 	%f66, %f82;
	mov.f32 	%f83, %f66;
	@%p25 bra 	BB19_26;

	ld.shared.u32 	%r20, [%rd4+1024];
	div.s32 	%r21, %r20, %r60;
	rem.s32 	%r22, %r20, %r60;
	mov.u32 	%r138, 0;
	setp.lt.s32	%p26, %r63, 1;
	@%p26 bra 	BB19_25;

BB19_22:
	cvt.s64.s32	%rd5, %r138;
	mul.wide.s32 	%rd48, %r138, 4;
	mov.u64 	%rd49, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd50, %rd49, %rd48;
	ld.shared.u32 	%r90, [%rd50];
	setp.ne.s32	%p27, %r90, %r21;
	@%p27 bra 	BB19_24;

	shl.b64 	%rd51, %rd5, 2;
	mov.u64 	%rd52, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd53, %rd52, %rd51;
	ld.shared.u32 	%r91, [%rd53];
	setp.eq.s32	%p28, %r91, %r22;
	mov.f32 	%f67, %f82;
	mov.f32 	%f83, %f67;
	// Partner index is excluded: keep the current value.
	@%p28 bra 	BB19_26;

BB19_24:
	add.s32 	%r138, %r138, 1;
	setp.lt.s32	%p29, %r138, %r63;
	@%p29 bra 	BB19_22;

BB19_25:
	// Partner wins: take over its value and index.
	st.shared.f32 	[%rd3], %f9;
	st.shared.u32 	[%rd4], %r20;
	mov.f32 	%f83, %f9;

BB19_26:
	mov.f32 	%f82, %f83;
	bar.sync 	0;

BB19_27:
	// ---- Stage 256 -> 128 (param_5 >= 256): threads 0..127, partner tid + 128 ----
	mov.f32 	%f80, %f82;
	setp.lt.s32	%p30, %r62, 256;
	@%p30 bra 	BB19_36;

	setp.gt.u32	%p31, %r1, 127;
	mov.f32 	%f81, %f80;
	@%p31 bra 	BB19_35;

	ld.shared.f32 	%f12, [%rd3+512];
	setp.geu.f32	%p32, %f80, %f12;
	mov.f32 	%f61, %f80;
	mov.f32 	%f81, %f61;
	@%p32 bra 	BB19_35;

	ld.shared.u32 	%r25, [%rd4+512];
	div.s32 	%r26, %r25, %r60;
	rem.s32 	%r27, %r25, %r60;
	mov.u32 	%r139, 0;
	setp.lt.s32	%p33, %r63, 1;
	@%p33 bra 	BB19_34;

BB19_31:
	cvt.s64.s32	%rd6, %r139;
	mul.wide.s32 	%rd54, %r139, 4;
	mov.u64 	%rd55, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd56, %rd55, %rd54;
	ld.shared.u32 	%r94, [%rd56];
	setp.ne.s32	%p34, %r94, %r26;
	@%p34 bra 	BB19_33;

	shl.b64 	%rd57, %rd6, 2;
	mov.u64 	%rd58, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd59, %rd58, %rd57;
	ld.shared.u32 	%r95, [%rd59];
	setp.eq.s32	%p35, %r95, %r27;
	mov.f32 	%f62, %f80;
	mov.f32 	%f81, %f62;
	@%p35 bra 	BB19_35;

BB19_33:
	add.s32 	%r139, %r139, 1;
	setp.lt.s32	%p36, %r139, %r63;
	@%p36 bra 	BB19_31;

BB19_34:
	st.shared.f32 	[%rd3], %f12;
	st.shared.u32 	[%rd4], %r25;
	mov.f32 	%f81, %f12;

BB19_35:
	mov.f32 	%f80, %f81;
	bar.sync 	0;

BB19_36:
	// ---- Stage 128 -> 64 (param_5 >= 128): threads 0..63, partner tid + 64 ----
	mov.f32 	%f78, %f80;
	setp.lt.s32	%p37, %r62, 128;
	@%p37 bra 	BB19_45;

	setp.gt.u32	%p38, %r1, 63;
	mov.f32 	%f79, %f78;
	@%p38 bra 	BB19_44;

	ld.shared.f32 	%f15, [%rd3+256];
	setp.geu.f32	%p39, %f78, %f15;
	mov.f32 	%f56, %f78;
	mov.f32 	%f79, %f56;
	@%p39 bra 	BB19_44;

	ld.shared.u32 	%r30, [%rd4+256];
	div.s32 	%r31, %r30, %r60;
	rem.s32 	%r32, %r30, %r60;
	mov.u32 	%r140, 0;
	setp.lt.s32	%p40, %r63, 1;
	@%p40 bra 	BB19_43;

BB19_40:
	cvt.s64.s32	%rd7, %r140;
	mul.wide.s32 	%rd60, %r140, 4;
	mov.u64 	%rd61, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd62, %rd61, %rd60;
	ld.shared.u32 	%r98, [%rd62];
	setp.ne.s32	%p41, %r98, %r31;
	@%p41 bra 	BB19_42;

	shl.b64 	%rd63, %rd7, 2;
	mov.u64 	%rd64, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd65, %rd64, %rd63;
	ld.shared.u32 	%r99, [%rd65];
	setp.eq.s32	%p42, %r99, %r32;
	mov.f32 	%f57, %f78;
	mov.f32 	%f79, %f57;
	@%p42 bra 	BB19_44;

BB19_42:
	add.s32 	%r140, %r140, 1;
	setp.lt.s32	%p43, %r140, %r63;
	@%p43 bra 	BB19_40;

BB19_43:
	st.shared.f32 	[%rd3], %f15;
	st.shared.u32 	[%rd4], %r30;
	mov.f32 	%f79, %f15;

BB19_44:
	mov.f32 	%f78, %f79;
	bar.sync 	0;

BB19_45:
	// ---- Final stages: only threads 0..31 continue; the rest jump to BB19_89. ----
	// From here to BB19_88 there is no barrier between stages; all shared-memory
	// traffic uses ld/st.volatile instead.
	mov.f32 	%f17, %f78;
	setp.gt.u32	%p44, %r1, 31;
	@%p44 bra 	BB19_89;

	// Stage 64 -> 32 (param_5 >= 64): partner tid + 32.
	setp.lt.s32	%p45, %r62, 64;
	mov.f32 	%f77, %f17;
	@%p45 bra 	BB19_53;

	ld.volatile.shared.f32 	%f30, [%rd3+128];
	setp.geu.f32	%p46, %f17, %f30;
	mov.f32 	%f51, %f17;
	mov.f32 	%f77, %f51;
	@%p46 bra 	BB19_53;

	ld.volatile.shared.u32 	%r102, [%rd4+128];
	div.s32 	%r35, %r102, %r60;
	rem.s32 	%r36, %r102, %r60;
	mov.u32 	%r141, 0;
	setp.lt.s32	%p47, %r63, 1;
	@%p47 bra 	BB19_52;

BB19_49:
	cvt.s64.s32	%rd8, %r141;
	mul.wide.s32 	%rd66, %r141, 4;
	mov.u64 	%rd67, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd68, %rd67, %rd66;
	ld.volatile.shared.u32 	%r103, [%rd68];
	setp.ne.s32	%p48, %r103, %r35;
	@%p48 bra 	BB19_51;

	shl.b64 	%rd69, %rd8, 2;
	mov.u64 	%rd70, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd71, %rd70, %rd69;
	ld.volatile.shared.u32 	%r104, [%rd71];
	setp.eq.s32	%p49, %r104, %r36;
	mov.f32 	%f52, %f17;
	mov.f32 	%f77, %f52;
	@%p49 bra 	BB19_53;

BB19_51:
	add.s32 	%r141, %r141, 1;
	setp.lt.s32	%p50, %r141, %r63;
	@%p50 bra 	BB19_49;

BB19_52:
	// Partner wins: value and index are re-read from shared memory and copied.
	ld.volatile.shared.f32 	%f77, [%rd3+128];
	st.volatile.shared.f32 	[%rd3], %f77;
	ld.volatile.shared.u32 	%r105, [%rd4+128];
	st.volatile.shared.u32 	[%rd4], %r105;

BB19_53:
	// Stage 32 -> 16 (param_5 >= 32): partner tid + 16.
	mov.f32 	%f19, %f77;
	setp.lt.s32	%p51, %r62, 32;
	mov.f32 	%f76, %f19;
	@%p51 bra 	BB19_60;

	ld.volatile.shared.f32 	%f31, [%rd3+64];
	setp.geu.f32	%p52, %f19, %f31;
	mov.f32 	%f48, %f19;
	mov.f32 	%f76, %f48;
	@%p52 bra 	BB19_60;

	ld.volatile.shared.u32 	%r107, [%rd4+64];
	div.s32 	%r39, %r107, %r60;
	rem.s32 	%r40, %r107, %r60;
	mov.u32 	%r142, 0;
	setp.lt.s32	%p53, %r63, 1;
	@%p53 bra 	BB19_59;

BB19_56:
	cvt.s64.s32	%rd9, %r142;
	mul.wide.s32 	%rd72, %r142, 4;
	mov.u64 	%rd73, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd74, %rd73, %rd72;
	ld.volatile.shared.u32 	%r108, [%rd74];
	setp.ne.s32	%p54, %r108, %r39;
	@%p54 bra 	BB19_58;

	shl.b64 	%rd75, %rd9, 2;
	mov.u64 	%rd76, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd77, %rd76, %rd75;
	ld.volatile.shared.u32 	%r109, [%rd77];
	setp.eq.s32	%p55, %r109, %r40;
	mov.f32 	%f49, %f19;
	mov.f32 	%f76, %f49;
	@%p55 bra 	BB19_60;

BB19_58:
	add.s32 	%r142, %r142, 1;
	setp.lt.s32	%p56, %r142, %r63;
	@%p56 bra 	BB19_56;

BB19_59:
	ld.volatile.shared.f32 	%f76, [%rd3+64];
	st.volatile.shared.f32 	[%rd3], %f76;
	ld.volatile.shared.u32 	%r110, [%rd4+64];
	st.volatile.shared.u32 	[%rd4], %r110;

BB19_60:
	// Stage 16 -> 8 (param_5 >= 16): partner tid + 8.
	mov.f32 	%f21, %f76;
	setp.lt.s32	%p57, %r62, 16;
	mov.f32 	%f75, %f21;
	@%p57 bra 	BB19_67;

	ld.volatile.shared.f32 	%f32, [%rd3+32];
	setp.geu.f32	%p58, %f21, %f32;
	mov.f32 	%f45, %f21;
	mov.f32 	%f75, %f45;
	@%p58 bra 	BB19_67;

	ld.volatile.shared.u32 	%r112, [%rd4+32];
	div.s32 	%r43, %r112, %r60;
	rem.s32 	%r44, %r112, %r60;
	mov.u32 	%r143, 0;
	setp.lt.s32	%p59, %r63, 1;
	@%p59 bra 	BB19_66;

BB19_63:
	cvt.s64.s32	%rd10, %r143;
	mul.wide.s32 	%rd78, %r143, 4;
	mov.u64 	%rd79, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd80, %rd79, %rd78;
	ld.volatile.shared.u32 	%r113, [%rd80];
	setp.ne.s32	%p60, %r113, %r43;
	@%p60 bra 	BB19_65;

	shl.b64 	%rd81, %rd10, 2;
	mov.u64 	%rd82, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd83, %rd82, %rd81;
	ld.volatile.shared.u32 	%r114, [%rd83];
	setp.eq.s32	%p61, %r114, %r44;
	mov.f32 	%f46, %f21;
	mov.f32 	%f75, %f46;
	@%p61 bra 	BB19_67;

BB19_65:
	add.s32 	%r143, %r143, 1;
	setp.lt.s32	%p62, %r143, %r63;
	@%p62 bra 	BB19_63;

BB19_66:
	ld.volatile.shared.f32 	%f75, [%rd3+32];
	st.volatile.shared.f32 	[%rd3], %f75;
	ld.volatile.shared.u32 	%r115, [%rd4+32];
	st.volatile.shared.u32 	[%rd4], %r115;

BB19_67:
	// Stage 8 -> 4 (param_5 >= 8): partner tid + 4.
	mov.f32 	%f23, %f75;
	setp.lt.s32	%p63, %r62, 8;
	mov.f32 	%f74, %f23;
	@%p63 bra 	BB19_74;

	ld.volatile.shared.f32 	%f33, [%rd3+16];
	setp.geu.f32	%p64, %f23, %f33;
	mov.f32 	%f42, %f23;
	mov.f32 	%f74, %f42;
	@%p64 bra 	BB19_74;

	ld.volatile.shared.u32 	%r117, [%rd4+16];
	div.s32 	%r47, %r117, %r60;
	rem.s32 	%r48, %r117, %r60;
	mov.u32 	%r144, 0;
	setp.lt.s32	%p65, %r63, 1;
	@%p65 bra 	BB19_73;

BB19_70:
	cvt.s64.s32	%rd11, %r144;
	mul.wide.s32 	%rd84, %r144, 4;
	mov.u64 	%rd85, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd86, %rd85, %rd84;
	ld.volatile.shared.u32 	%r118, [%rd86];
	setp.ne.s32	%p66, %r118, %r47;
	@%p66 bra 	BB19_72;

	shl.b64 	%rd87, %rd11, 2;
	mov.u64 	%rd88, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd89, %rd88, %rd87;
	ld.volatile.shared.u32 	%r119, [%rd89];
	setp.eq.s32	%p67, %r119, %r48;
	mov.f32 	%f43, %f23;
	mov.f32 	%f74, %f43;
	@%p67 bra 	BB19_74;

BB19_72:
	add.s32 	%r144, %r144, 1;
	setp.lt.s32	%p68, %r144, %r63;
	@%p68 bra 	BB19_70;

BB19_73:
	ld.volatile.shared.f32 	%f74, [%rd3+16];
	st.volatile.shared.f32 	[%rd3], %f74;
	ld.volatile.shared.u32 	%r120, [%rd4+16];
	st.volatile.shared.u32 	[%rd4], %r120;

BB19_74:
	// Stage 4 -> 2 (param_5 >= 4): partner tid + 2.
	mov.f32 	%f25, %f74;
	setp.lt.s32	%p69, %r62, 4;
	mov.f32 	%f73, %f25;
	@%p69 bra 	BB19_81;

	ld.volatile.shared.f32 	%f34, [%rd3+8];
	setp.geu.f32	%p70, %f25, %f34;
	mov.f32 	%f39, %f25;
	mov.f32 	%f73, %f39;
	@%p70 bra 	BB19_81;

	ld.volatile.shared.u32 	%r122, [%rd4+8];
	div.s32 	%r51, %r122, %r60;
	rem.s32 	%r52, %r122, %r60;
	mov.u32 	%r145, 0;
	setp.lt.s32	%p71, %r63, 1;
	@%p71 bra 	BB19_80;

BB19_77:
	cvt.s64.s32	%rd12, %r145;
	mul.wide.s32 	%rd90, %r145, 4;
	mov.u64 	%rd91, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd92, %rd91, %rd90;
	ld.volatile.shared.u32 	%r123, [%rd92];
	setp.ne.s32	%p72, %r123, %r51;
	@%p72 bra 	BB19_79;

	shl.b64 	%rd93, %rd12, 2;
	mov.u64 	%rd94, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd95, %rd94, %rd93;
	ld.volatile.shared.u32 	%r124, [%rd95];
	setp.eq.s32	%p73, %r124, %r52;
	mov.f32 	%f40, %f25;
	mov.f32 	%f73, %f40;
	@%p73 bra 	BB19_81;

BB19_79:
	add.s32 	%r145, %r145, 1;
	setp.lt.s32	%p74, %r145, %r63;
	@%p74 bra 	BB19_77;

BB19_80:
	ld.volatile.shared.f32 	%f73, [%rd3+8];
	st.volatile.shared.f32 	[%rd3], %f73;
	ld.volatile.shared.u32 	%r125, [%rd4+8];
	st.volatile.shared.u32 	[%rd4], %r125;

BB19_81:
	// Stage 2 -> 1 (param_5 >= 2): partner tid + 1.
	setp.lt.s32	%p75, %r62, 2;
	@%p75 bra 	BB19_88;

	ld.volatile.shared.f32 	%f35, [%rd3+4];
	setp.geu.f32	%p76, %f73, %f35;
	@%p76 bra 	BB19_88;

	ld.volatile.shared.u32 	%r127, [%rd4+4];
	div.s32 	%r55, %r127, %r60;
	rem.s32 	%r56, %r127, %r60;
	mov.u32 	%r146, 0;
	setp.lt.s32	%p77, %r63, 1;
	@%p77 bra 	BB19_87;

BB19_84:
	cvt.s64.s32	%rd13, %r146;
	mul.wide.s32 	%rd96, %r146, 4;
	mov.u64 	%rd97, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow;
	add.s64 	%rd98, %rd97, %rd96;
	ld.volatile.shared.u32 	%r128, [%rd98];
	setp.ne.s32	%p78, %r128, %r55;
	@%p78 bra 	BB19_86;

	shl.b64 	%rd99, %rd13, 2;
	mov.u64 	%rd100, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol;
	add.s64 	%rd101, %rd100, %rd99;
	ld.volatile.shared.u32 	%r129, [%rd101];
	setp.eq.s32	%p79, %r129, %r56;
	@%p79 bra 	BB19_88;

BB19_86:
	add.s32 	%r146, %r146, 1;
	setp.lt.s32	%p80, %r146, %r63;
	@%p80 bra 	BB19_84;

BB19_87:
	ld.volatile.shared.f32 	%f36, [%rd3+4];
	st.volatile.shared.f32 	[%rd3], %f36;
	ld.volatile.shared.u32 	%r130, [%rd4+4];
	st.volatile.shared.u32 	[%rd4], %r130;

BB19_88:
	// NOTE(review): this barrier is reached only by threads 0..31; threads with
	// tid > 31 branched straight to BB19_89 above. Confirm that a barrier under
	// divergent control flow is intended here.
	bar.sync 	0;

BB19_89:
	// ---- Epilogue: thread 0 writes the block result. ----
	setp.ne.s32	%p81, %r1, 0;
	@%p81 bra 	BB19_92;

	// out_value[ctaid.x] = sdata[0]; out_index[ctaid.x] = idxData[0].
	ld.shared.f32 	%f37, [reduce_max_filter_mainf$__cuda_local_var_45639_32_non_const_sdata];
	cvta.to.global.u64 	%rd102, %rd15;
	mul.wide.u32 	%rd103, %r68, 4;
	add.s64 	%rd104, %rd102, %rd103;
	st.global.f32 	[%rd104], %f37;
	ld.shared.u32 	%r59, [reduce_max_filter_mainf$__cuda_local_var_45640_30_non_const_idxData];
	cvta.to.global.u64 	%rd105, %rd16;
	add.s64 	%rd106, %rd105, %rd103;
	st.global.u32 	[%rd106], %r59;
	// With a single-block grid the result is final: also store the winning
	// index into the exclusion list at position param_7.
	mov.u32 	%r133, %nctaid.x;
	setp.ne.s32	%p82, %r133, 1;
	@%p82 bra 	BB19_92;

	cvta.to.global.u64 	%rd107, %rd17;
	mul.wide.s32 	%rd108, %r63, 4;
	add.s64 	%rd109, %rd107, %rd108;
	st.global.u32 	[%rd109], %r59;

BB19_92:
	ret;
}


