patch-2.1.45 linux/arch/sparc64/lib/checksum.S

diff -u --recursive --new-file v2.1.44/linux/arch/sparc64/lib/checksum.S linux/arch/sparc64/lib/checksum.S
@@ -25,223 +25,12 @@
 	 * into the accumulated sum.  The following is much better.
 	 *
 	 * This should run at max bandwidth for ecache hits, a better
-	 * technique is to use VIS and fpu operations somehow, but
-	 * this requires more reasoning on my part...
-	 *
-	 * Assuming ecache hits and branches predicted well, this
-	 * can be expected to run at a rate of 16 cycles per 64-bytes
-	 * of data summed.  (the old code summed 32 bytes in 20
-	 * cycles, with numerous bubbles and unnecessary stalls)
+	 * technique is to use VIS and fpu operations. This is already
+	 * done for csum_partial, needs to be written for the copy stuff
+	 * still.
 	 */
-#define CSUM_ECACHE_LOAD(buf, offset, t0, t1, t2, t3, t4, t5, t6, t7)			\
-	ldx	[buf + offset + 0x00], t0;						\
-	ldx	[buf + offset + 0x08], t1;						\
-	ldx	[buf + offset + 0x10], t2;						\
-	ldx	[buf + offset + 0x18], t3;						\
-	ldx	[buf + offset + 0x20], t4;						\
-	ldx	[buf + offset + 0x28], t5;						\
-	ldx	[buf + offset + 0x30], t6;						\
-	ldx	[buf + offset + 0x38], t7;						\
-	nop; nop;	/* THIS IS CRITICAL!!!!!!!!! */
-
-#define CSUM_ECACHE_BLOCK_LDNEXT(buf, offset, sum, t0, t1, t2, t3, t4, t5, t6, t7)	\
-	addcc		sum, t0, sum;							\
-	bcc,pt		%xcc, 11f;							\
-	 ldx		[buf + offset + 0x00], t0;					\
-	add		sum, 1, sum;							\
-11:	addcc		sum, t1, sum;							\
-	bcc,pt		%xcc, 12f;							\
-	 ldx		[buf + offset + 0x08], t1;					\
-	add		sum, 1, sum;							\
-12:	addcc		sum, t2, sum;							\
-	bcc,pt		%xcc, 13f;							\
-	 ldx		[buf + offset + 0x10], t2;					\
-	add		sum, 1, sum;							\
-13:	addcc		sum, t3, sum;							\
-	bcc,pt		%xcc, 14f;							\
-	 ldx		[buf + offset + 0x18], t3;					\
-	add		sum, 1, sum;							\
-14:	addcc		sum, t4, sum;							\
-	bcc,pt		%xcc, 15f;							\
-	 ldx		[buf + offset + 0x20], t4;					\
-	add		sum, 1, sum;							\
-15:	addcc		sum, t5, sum;							\
-	bcc,pt		%xcc, 16f;							\
-	 ldx		[buf + offset + 0x28], t5;					\
-	add		sum, 1, sum;							\
-16:	addcc		sum, t6, sum;							\
-	bcc,pt		%xcc, 17f;							\
-	 ldx		[buf + offset + 0x30], t6;					\
-	add		sum, 1, sum;							\
-17:	addcc		sum, t7, sum;							\
-	bcc,pt		%xcc, 18f;							\
-	 ldx		[buf + offset + 0x38], t7;					\
-	add		sum, 1, sum;							\
-18:	nop; nop;	/* DO NOT TOUCH! */
-
-#define CSUM_ECACHE_BLOCK(sum, t0, t1, t2, t3, t4, t5, t6, t7)				\
-	addcc		sum, t0, sum;							\
-	bcs,a,pn	%xcc, 21f;							\
-	 add		sum, 1, sum;							\
-21:	addcc		sum, t1, sum;							\
-	bcs,a,pn	%xcc, 22f;							\
-	 add		sum, 1, sum;							\
-22:	addcc		sum, t2, sum;							\
-	bcs,a,pn	%xcc, 23f;							\
-	 add		sum, 1, sum;							\
-23:	addcc		sum, t3, sum;							\
-	bcs,a,pn	%xcc, 24f;							\
-	 add		sum, 1, sum;							\
-24:	addcc		sum, t4, sum;							\
-	bcs,a,pn	%xcc, 25f;							\
-	 add		sum, 1, sum;							\
-25:	addcc		sum, t5, sum;							\
-	bcs,a,pn	%xcc, 26f;							\
-	 add		sum, 1, sum;							\
-26:	addcc		sum, t6, sum;							\
-	bcs,a,pn	%xcc, 27f;							\
-	 add		sum, 1, sum;							\
-27:	addcc		sum, t7, sum;							\
-	bcs,a,pn	%xcc, 28f;							\
-	 add		sum, 1, sum;							\
-28:
-
-#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1)					\
-	ldx		[buf - offset - 0x08], t0;					\
-	ldx		[buf - offset - 0x00], t1;					\
-	addcc		t0, sum, sum;							\
-	bcs,a,pn	%xcc, 31f;							\
-	 add		sum, 1, sum;							\
-31:	addcc		t1, sum, sum;							\
-	bcs,a,pn	%xcc, 32f;							\
-	 add		sum, 1, sum;							\
-32:
 
 	.text
-	/* Keep this garbage from swiping the icache. */
-csum_partial_end_cruft:
-	andcc		%o1, 8, %g0			! IEU1	Group
-	be,pn		%icc, 1f			! CTI
-	 and		%o1, 4, %g5			! IEU0
-	ldx		[%o0 + 0x00], %g2		! Load	Group
-	add		%o0, 0x8, %o0			! IEU0
-	addcc		%g2, %o2, %o2			! IEU1	Group + 2 bubbles
-	bcs,a,pn	%xcc, 1f			! CTI
-	 add		%o2, 1, %o2			! IEU0	4 clocks (mispredict)
-1:	andcc		%o1, 2, %g0			! IEU1	Group
-	brz,pn		%g5, 1f				! CTI	Group (needs IEU1)
-	 clr		%g2				! IEU0
-	ld		[%o0], %g2			! Load
-	add		%o0, 4, %o0			! IEU0	Group
-	sllx		%g2, 32, %g2			! IEU0	Group + 2 bubbles
-1:	and		%o1, 1, %o1			! IEU1
-	be,pn		%icc, 1f			! CTI
-	 clr		%o4				! IEU0	Group
-	lduh		[%o0], %o4			! Load
-	add		%o0, 2, %o0			! IEU1
-	sll		%o4, 16, %o4			! IEU0	Group + 2 bubbles
-1:	brz,pn		%o1, 1f				! CTI
-	 clr		%o5				! IEU1
-	ldub		[%o0], %o5			! Load	Group
-	sll		%o5, 8, %o5			! IEU0	Group + 2 bubbles
-1:	or		%g2, %o4, %o4			! IEU1
-	or		%o5, %o4, %o4			! IEU0	Group
-	addcc		%o4, %o2, %o2			! IEU1	Group (regdep)
-	bcc,pt		%xcc, cfold			! CTI
-	 sethi		%uhi(PAGE_OFFSET), %g4		! IEU0
-1:	b,pt		%xcc, cfold			! CTI	Group
-	 add		%o2, 1, %o2			! IEU0
-
-csum_partial_fixit:
-	bl,pn		%icc, cpte			! CTI	Group
-	 and		%o1, 0xf, %o3			! IEU0
-	andcc		%o0, 0x2, %g0			! IEU1
-	be,pn		%icc, 1f			! CTI	Group
-	 and		%o0, 0x4, %g7			! IEU0
-	lduh		[%o0 + 0x00], %g2		! Load
-	sub		%o1, 2, %o1			! IEU0	Group
-	addcc		%o0, 2, %o0			! IEU1
-	and		%o0, 0x4, %g7			! IEU0	Group
-	sll		%g2, 16, %g2			! IEU0	Group (no load stall)
-	addcc		%g2, %o2, %o2			! IEU1	Group (regdep)
-	bcc,pt		%icc, 0f			! CTI
-	 andn		%o1, 0xff, %o3			! IEU0
-	srl		%o2, 16, %g2			! IEU0	Group
-	b,pt		%xcc, 9f			! CTI
-	 add		%g2, 1, %g2			! IEU1
-0:	srl		%o2, 16, %g2			! IEU0	Group 8-(
-9:	sll		%o2, 16, %o2			! IEU0	Group 8-(
-	sll		%g2, 16, %g3			! IEU0	Group 8-(
-	srl		%o2, 16, %o2			! IEU0	Group 8-(
-	or		%g3, %o2, %o2			! IEU1
-1:	brnz,pt		%g7, 2f				! CTI	Group
-	 sub		%o1, 4, %o1			! IEU0
-	b,pt		%xcc, csum_partial_aligned	! CTI	Group
-	 add		%o1, 4, %o1			! IEU0
-2:	ld		[%o0 + 0x00], %g2		! Load	Group
-	add		%o0, 4, %o0			! IEU0
-	andn		%o1, 0xff, %o3			! IEU1
-	addcc		%g2, %o2, %o2			! IEU1	Group + 2 bubbles
-	bcc,pt		%xcc, csum_partial_aligned	! CTI
-	 nop						! IEU0
-	b,pt		%xcc, csum_partial_aligned	! CTI	Group
-	 add		%o2, 1, %o2			! IEU0
-
-	.align		32
-	.globl		csum_partial
-csum_partial:						/* %o0=buf, %o1=len, %o2=sum */
-	andcc		%o0, 0x7, %g0			! IEU1	Group
-	srl		%o1, 0, %o1			! IEU0
-	srl		%o2, 0, %o2			! IEU0	Group
-	be,pt		%icc, csum_partial_aligned	! CTI
-	 andn		%o1, 0xff, %o3			! IEU1
-	b,pt		%xcc, csum_partial_fixit	! CTI	Group
-	 cmp		%o1, 6				! IEU0
-	nop
-csum_partial_aligned:
-	brz,pt		%o3, 3f				! CTI	Group
-	 and		%o1, 0xf0, %g1			! IEU0
-5:	CSUM_ECACHE_LOAD(        %o0, 0x000,      %o4, %o5, %g2, %g3, %g4, %g5, %g1, %g7)
-	CSUM_ECACHE_BLOCK_LDNEXT(%o0, 0x040, %o2, %o4, %o5, %g2, %g3, %g4, %g5, %g1, %g7)
-	CSUM_ECACHE_BLOCK_LDNEXT(%o0, 0x080, %o2, %o4, %o5, %g2, %g3, %g4, %g5, %g1, %g7)
-	CSUM_ECACHE_BLOCK_LDNEXT(%o0, 0x0c0, %o2, %o4, %o5, %g2, %g3, %g4, %g5, %g1, %g7)
-	CSUM_ECACHE_BLOCK(                   %o2, %o4, %o5, %g2, %g3, %g4, %g5, %g1, %g7)
-	subcc		%o3, 256, %o3			! IEU1	Group
-	bne,pt		%icc, 5b			! CTI
-	 add		%o0, 256, %o0			! IEU0
-	and		%o1, 0xf0, %g1			! IEU0	Group
-3:	brz,pn		%g1, cpte			! CTI
-	 and		%o1, 0xf, %o3			! IEU1	Group
-10:	rd		%pc, %g7			! LSU	Group + 4 clocks
-	sll		%g1, 1, %o4			! IEU0	Group
-	sub		%g7, %o4, %g7			! IEU1
-	jmp		%g7 + %lo(cpte - 10b)		! CTI	Group brk forced
-	 add		%o0, %g1, %o0			! IEU0
-cptbl:	CSUM_LASTCHUNK(%o0, 0xe8, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0xd8, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0xc8, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0xb8, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0xa8, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0x98, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0x88, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0x78, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3)
-	CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3)
-cpte:	brnz,pn		%o3, csum_partial_end_cruft	! CTI	Group
-	 sethi		%uhi(PAGE_OFFSET), %g4		! IEU0
-cfold:	sllx		%o2, 32, %o0			! IEU0	Group
-	addcc		%o2, %o0, %o0			! IEU1	Group (regdep)
-	srlx		%o0, 32, %o0			! IEU0	Group (regdep)
-	bcs,a,pn	%xcc, 1f			! CTI
-	 add		%o0, 1, %o0			! IEU1	4 clocks (mispredict)
-1:	retl						! CTI	Group brk forced
-	 sllx		%g4, 32, %g4			! IEU0	Group
-
 	.globl __csum_partial_copy_start, __csum_partial_copy_end
 __csum_partial_copy_start:
 

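Editorial note on the removed code: the old csum_partial is a 64-bit one's-complement accumulation. Each addcc is followed by an "add sum, 1, sum" that executes only when the addition carried (end-around carry), and the cfold epilogue folds the 64-bit accumulator down to 32 bits. The following is a minimal C sketch of that arithmetic, not kernel code; csum_partial_sketch is an illustrative name, and it assumes an 8-byte-aligned buffer whose length is a multiple of 8 (the real assembly also handles unaligned heads and 1/2/4-byte tails via csum_partial_fixit and csum_partial_end_cruft).

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Illustrative only: a portable C rendering of the arithmetic in
	 * the removed csum_partial.  Assumes buf is 8-byte aligned and
	 * len is a multiple of 8.
	 */
	static uint32_t csum_partial_sketch(const void *buf, size_t len, uint32_t sum)
	{
		const uint64_t *p = buf;
		uint64_t acc = sum;

		for (size_t i = 0; i < len / 8; i++) {
			uint64_t prev = acc;

			acc += p[i];		/* addcc            */
			if (acc < prev)		/* carry out of bit 63 is */
				acc++;		/* wrapped back in: add sum, 1, sum */
		}

		/* Fold 64 -> 32 bits with end-around carry, giving the same
		 * result as the shifted add plus conditional increment in the
		 * removed cfold epilogue. */
		acc = (acc >> 32) + (acc & 0xffffffffu);
		acc = (acc >> 32) + (acc & 0xffffffffu);
		return (uint32_t)acc;
	}

Two folds suffice because the first fold leaves at most a single carry above bit 31; the second fold absorbs it, matching the conditional "add %o0, 1, %o0" after the srlx in cfold.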