patch-2.1.51 linux/arch/sparc64/lib/checksum.S
- Lines: 789
- Date: Sat Aug 16 10:00:22 1997
- Orig file: v2.1.50/linux/arch/sparc64/lib/checksum.S
- Orig date: Thu Jul 17 10:06:04 1997
diff -u --recursive --new-file v2.1.50/linux/arch/sparc64/lib/checksum.S linux/arch/sparc64/lib/checksum.S
@@ -23,456 +23,456 @@
* are two fold. Firstly, they cannot pair with jack shit,
* and also they only add in the 32-bit carry condition bit
* into the accumulated sum. The following is much better.
- *
- * This should run at max bandwidth for ecache hits, a better
- * technique is to use VIS and fpu operations. This is already
- * done for csum_partial, needs to be written for the copy stuff
- * still.
+ * For larger chunks we use VIS code, which is faster ;)
*/
- .text
- .globl __csum_partial_copy_start, __csum_partial_copy_end
-__csum_partial_copy_start:
+#define src o0
+#define dst o1
+#define len o2
+#define sum o3
+ .text
/* I think I have an erection... Once _AGAIN_ the SunSoft
* engineers are caught asleep at the keyboard, tsk tsk...
*/
-#define CSUMCOPY_ECACHE_LOAD(src, off, t0, t1, t2, t3, t4, t5, t6, t7) \
- ldxa [src + off + 0x00] %asi, t0; \
- ldxa [src + off + 0x08] %asi, t1; \
- ldxa [src + off + 0x10] %asi, t2; \
- ldxa [src + off + 0x18] %asi, t3; \
- ldxa [src + off + 0x20] %asi, t4; \
- ldxa [src + off + 0x28] %asi, t5; \
- ldxa [src + off + 0x30] %asi, t6; \
- ldxa [src + off + 0x38] %asi, t7; \
+#define CSUMCOPY_ECACHE_LOAD(off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldxa [%src + off + 0x00] %asi, t0; \
+ ldxa [%src + off + 0x08] %asi, t1; \
+ ldxa [%src + off + 0x10] %asi, t2; \
+ ldxa [%src + off + 0x18] %asi, t3; \
+ ldxa [%src + off + 0x20] %asi, t4; \
+ ldxa [%src + off + 0x28] %asi, t5; \
+ ldxa [%src + off + 0x30] %asi, t6; \
+ ldxa [%src + off + 0x38] %asi, t7; \
nop; nop; /* DO NOT TOUCH THIS!!!!! */
-#define CSUMCOPY_EC_STALIGNED_LDNXT(src, dest, off, sum, t0, t1, t2, t3, t4, t5, t6, t7)\
- stx t0, [dest + off - 0x40]; \
- addcc sum, t0, sum; \
+#define CSUMCOPY_EC_STALIGNED_LDNXT(off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ stx t0, [%dst + off - 0x40]; \
+ addcc %sum, t0, %sum; \
bcc,pt %xcc, 11f; \
- ldxa [src + off + 0x00] %asi, t0; \
- add sum, 1, sum; \
-11: stx t1, [dest + off - 0x38]; \
- addcc sum, t1, sum; \
+ ldxa [%src + off + 0x00] %asi, t0; \
+ add %sum, 1, %sum; \
+11: stx t1, [%dst + off - 0x38]; \
+ addcc %sum, t1, %sum; \
bcc,pt %xcc, 12f; \
- ldxa [src + off + 0x08] %asi, t1; \
- add sum, 1, sum; \
-12: stx t2, [dest + off - 0x30]; \
- addcc sum, t2, sum; \
+ ldxa [%src + off + 0x08] %asi, t1; \
+ add %sum, 1, %sum; \
+12: stx t2, [%dst + off - 0x30]; \
+ addcc %sum, t2, %sum; \
bcc,pt %xcc, 13f; \
- ldxa [src + off + 0x10] %asi, t2; \
- add sum, 1, sum; \
-13: stx t3, [dest + off - 0x28]; \
- addcc sum, t3, sum; \
+ ldxa [%src + off + 0x10] %asi, t2; \
+ add %sum, 1, %sum; \
+13: stx t3, [%dst + off - 0x28]; \
+ addcc %sum, t3, %sum; \
bcc,pt %xcc, 14f; \
- ldxa [src + off + 0x18] %asi, t3; \
- add sum, 1, sum; \
-14: stx t4, [dest + off - 0x20]; \
- addcc sum, t4, sum; \
+ ldxa [%src + off + 0x18] %asi, t3; \
+ add %sum, 1, %sum; \
+14: stx t4, [%dst + off - 0x20]; \
+ addcc %sum, t4, %sum; \
bcc,pt %xcc, 15f; \
- ldxa [src + off + 0x20] %asi, t4; \
- add sum, 1, sum; \
-15: stx t5, [dest + off - 0x18]; \
- addcc sum, t5, sum; \
+ ldxa [%src + off + 0x20] %asi, t4; \
+ add %sum, 1, %sum; \
+15: stx t5, [%dst + off - 0x18]; \
+ addcc %sum, t5, %sum; \
bcc,pt %xcc, 16f; \
- ldxa [src + off + 0x28] %asi, t5; \
- add sum, 1, sum; \
-16: stx t6, [dest + off - 0x10]; \
- addcc sum, t6, sum; \
+ ldxa [%src + off + 0x28] %asi, t5; \
+ add %sum, 1, %sum; \
+16: stx t6, [%dst + off - 0x10]; \
+ addcc %sum, t6, %sum; \
bcc,pt %xcc, 17f; \
- ldxa [src + off + 0x30] %asi, t6; \
- add sum, 1, sum; \
-17: stx t7, [dest + off - 0x08]; \
- addcc sum, t7, sum; \
+ ldxa [%src + off + 0x30] %asi, t6; \
+ add %sum, 1, %sum; \
+17: stx t7, [%dst + off - 0x08]; \
+ addcc %sum, t7, %sum; \
bcc,pt %xcc, 18f; \
- ldxa [src + off + 0x38] %asi, t7; \
- add sum, 1, sum; \
+ ldxa [%src + off + 0x38] %asi, t7; \
+ add %sum, 1, %sum; \
18:
-#define CSUMCOPY_EC_STUNALIGN_LDNXT(src, dest, off, sum, t0, t1, t2, t3, t4, t5, t6, t7)\
- stw t0, [dest + off - 0x3c]; \
- addcc sum, t0, sum; \
+#define CSUMCOPY_EC_STUNALIGN_LDNXT(off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ stw t0, [%dst + off - 0x3c]; \
+ addcc %sum, t0, %sum; \
srlx t0, 32, t0; \
- stw t0, [dest + off - 0x40]; \
+ stw t0, [%dst + off - 0x40]; \
bcc,pt %xcc, 21f; \
- ldxa [src + off + 0x00] %asi, t0; \
- add sum, 1, sum; \
-21: stw t1, [dest + off - 0x34]; \
- addcc sum, t1, sum; \
+ ldxa [%src + off + 0x00] %asi, t0; \
+ add %sum, 1, %sum; \
+21: stw t1, [%dst + off - 0x34]; \
+ addcc %sum, t1, %sum; \
srlx t1, 32, t1; \
- stw t1, [dest + off - 0x38]; \
+ stw t1, [%dst + off - 0x38]; \
bcc,pt %xcc, 22f; \
- ldxa [src + off + 0x08] %asi, t1; \
- add sum, 1, sum; \
-22: stw t2, [dest + off - 0x2c]; \
- addcc sum, t2, sum; \
+ ldxa [%src + off + 0x08] %asi, t1; \
+ add %sum, 1, %sum; \
+22: stw t2, [%dst + off - 0x2c]; \
+ addcc %sum, t2, %sum; \
srlx t2, 32, t2; \
- stw t2, [dest + off - 0x30]; \
+ stw t2, [%dst + off - 0x30]; \
bcc,pt %xcc, 23f; \
- ldxa [src + off + 0x10] %asi, t2; \
- add sum, 1, sum; \
-23: stw t3, [dest + off - 0x24]; \
- addcc sum, t3, sum; \
+ ldxa [%src + off + 0x10] %asi, t2; \
+ add %sum, 1, %sum; \
+23: stw t3, [%dst + off - 0x24]; \
+ addcc %sum, t3, %sum; \
srlx t3, 32, t3; \
- stw t3, [dest + off - 0x28]; \
+ stw t3, [%dst + off - 0x28]; \
bcc,pt %xcc, 24f; \
- ldxa [src + off + 0x18] %asi, t3; \
- add sum, 1, sum; \
-24: stw t4, [dest + off - 0x1c]; \
- addcc sum, t4, sum; \
+ ldxa [%src + off + 0x18] %asi, t3; \
+ add %sum, 1, %sum; \
+24: stw t4, [%dst + off - 0x1c]; \
+ addcc %sum, t4, %sum; \
srlx t4, 32, t4; \
- stw t4, [dest + off - 0x20]; \
+ stw t4, [%dst + off - 0x20]; \
bcc,pt %xcc, 25f; \
- ldxa [src + off + 0x20] %asi, t4; \
- add sum, 1, sum; \
-25: stw t5, [dest + off - 0x14]; \
- addcc sum, t5, sum; \
+ ldxa [%src + off + 0x20] %asi, t4; \
+ add %sum, 1, %sum; \
+25: stw t5, [%dst + off - 0x14]; \
+ addcc %sum, t5, %sum; \
srlx t5, 32, t5; \
- stw t5, [dest + off - 0x18]; \
+ stw t5, [%dst + off - 0x18]; \
bcc,pt %xcc, 26f; \
- ldxa [src + off + 0x28] %asi, t5; \
- add sum, 1, sum; \
-26: stw t6, [dest + off - 0x0c]; \
- addcc sum, t6, sum; \
+ ldxa [%src + off + 0x28] %asi, t5; \
+ add %sum, 1, %sum; \
+26: stw t6, [%dst + off - 0x0c]; \
+ addcc %sum, t6, %sum; \
srlx t6, 32, t6; \
- stw t6, [dest + off - 0x10]; \
+ stw t6, [%dst + off - 0x10]; \
bcc,pt %xcc, 27f; \
- ldxa [src + off + 0x30] %asi, t6; \
- add sum, 1, sum; \
-27: stw t7, [dest + off - 0x04]; \
- addcc sum, t7, sum; \
+ ldxa [%src + off + 0x30] %asi, t6; \
+ add %sum, 1, %sum; \
+27: stw t7, [%dst + off - 0x04]; \
+ addcc %sum, t7, %sum; \
srlx t7, 32, t7; \
- stw t7, [dest + off - 0x08]; \
+ stw t7, [%dst + off - 0x08]; \
bcc,pt %xcc, 28f; \
- ldxa [src + off + 0x38] %asi, t7; \
- add sum, 1, sum; \
+ ldxa [%src + off + 0x38] %asi, t7; \
+ add %sum, 1, %sum; \
28:
-#define CSUMCOPY_EC_STALIGNED(dest, off, sum, t0, t1, t2, t3, t4, t5, t6, t7) \
- addcc sum, t0, sum; \
+#define CSUMCOPY_EC_STALIGNED(off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ addcc %sum, t0, %sum; \
bcc,pt %xcc, 31f; \
- stx t0, [dest + off + 0x00]; \
- add sum, 1, sum; \
-31: addcc sum, t1, sum; \
+ stx t0, [%dst + off + 0x00]; \
+ add %sum, 1, %sum; \
+31: addcc %sum, t1, %sum; \
bcc,pt %xcc, 32f; \
- stx t1, [dest + off + 0x08]; \
- add sum, 1, sum; \
-32: addcc sum, t2, sum; \
+ stx t1, [%dst + off + 0x08]; \
+ add %sum, 1, %sum; \
+32: addcc %sum, t2, %sum; \
bcc,pt %xcc, 33f; \
- stx t2, [dest + off + 0x10]; \
- add sum, 1, sum; \
-33: addcc sum, t3, sum; \
+ stx t2, [%dst + off + 0x10]; \
+ add %sum, 1, %sum; \
+33: addcc %sum, t3, %sum; \
bcc,pt %xcc, 34f; \
- stx t3, [dest + off + 0x18]; \
- add sum, 1, sum; \
-34: addcc sum, t4, sum; \
+ stx t3, [%dst + off + 0x18]; \
+ add %sum, 1, %sum; \
+34: addcc %sum, t4, %sum; \
bcc,pt %xcc, 35f; \
- stx t4, [dest + off + 0x20]; \
- add sum, 1, sum; \
-35: addcc sum, t5, sum; \
+ stx t4, [%dst + off + 0x20]; \
+ add %sum, 1, %sum; \
+35: addcc %sum, t5, %sum; \
bcc,pt %xcc, 36f; \
- stx t5, [dest + off + 0x28]; \
- add sum, 1, sum; \
-36: addcc sum, t6, sum; \
+ stx t5, [%dst + off + 0x28]; \
+ add %sum, 1, %sum; \
+36: addcc %sum, t6, %sum; \
bcc,pt %xcc, 37f; \
- stx t6, [dest + off + 0x30]; \
- add sum, 1, sum; \
-37: addcc sum, t7, sum; \
+ stx t6, [%dst + off + 0x30]; \
+ add %sum, 1, %sum; \
+37: addcc %sum, t7, %sum; \
bcc,pt %xcc, 38f; \
- stx t7, [dest + off + 0x38]; \
- add sum, 1, sum; \
+ stx t7, [%dst + off + 0x38]; \
+ add %sum, 1, %sum; \
38:
-#define CSUMCOPY_EC_STUNALIGN(dest, off, sum, t0, t1, t2, t3, t4, t5, t6, t7) \
- stw t0, [dest + off + 0x04]; \
- addcc sum, t0, sum; \
+#define CSUMCOPY_EC_STUNALIGN(off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ stw t0, [%dst + off + 0x04]; \
+ addcc %sum, t0, %sum; \
srlx t0, 32, t0; \
bcc,pt %xcc, 41f; \
- stw t0, [dest + off + 0x00]; \
- add sum, 1, sum; \
-41: stw t1, [dest + off + 0x0c]; \
- addcc sum, t1, sum; \
+ stw t0, [%dst + off + 0x00]; \
+ add %sum, 1, %sum; \
+41: stw t1, [%dst + off + 0x0c]; \
+ addcc %sum, t1, %sum; \
srlx t1, 32, t1; \
bcc,pt %xcc, 42f; \
- stw t1, [dest + off + 0x08]; \
- add sum, 1, sum; \
-42: stw t2, [dest + off + 0x14]; \
- addcc sum, t2, sum; \
+ stw t1, [%dst + off + 0x08]; \
+ add %sum, 1, %sum; \
+42: stw t2, [%dst + off + 0x14]; \
+ addcc %sum, t2, %sum; \
srlx t2, 32, t2; \
bcc,pt %xcc, 43f; \
- stw t2, [dest + off + 0x10]; \
- add sum, 1, sum; \
-43: stw t3, [dest + off + 0x1c]; \
- addcc sum, t3, sum; \
+ stw t2, [%dst + off + 0x10]; \
+ add %sum, 1, %sum; \
+43: stw t3, [%dst + off + 0x1c]; \
+ addcc %sum, t3, %sum; \
srlx t3, 32, t3; \
bcc,pt %xcc, 44f; \
- stw t3, [dest + off + 0x18]; \
- add sum, 1, sum; \
-44: stw t4, [dest + off + 0x24]; \
- addcc sum, t4, sum; \
+ stw t3, [%dst + off + 0x18]; \
+ add %sum, 1, %sum; \
+44: stw t4, [%dst + off + 0x24]; \
+ addcc %sum, t4, %sum; \
srlx t4, 32, t4; \
bcc,pt %xcc, 45f; \
- stw t4, [dest + off + 0x20]; \
- add sum, 1, sum; \
-45: stw t5, [dest + off + 0x2c]; \
- addcc sum, t5, sum; \
+ stw t4, [%dst + off + 0x20]; \
+ add %sum, 1, %sum; \
+45: stw t5, [%dst + off + 0x2c]; \
+ addcc %sum, t5, %sum; \
srlx t5, 32, t5; \
bcc,pt %xcc, 46f; \
- stw t5, [dest + off + 0x28]; \
- add sum, 1, sum; \
-46: stw t6, [dest + off + 0x34]; \
- addcc sum, t6, sum; \
+ stw t5, [%dst + off + 0x28]; \
+ add %sum, 1, %sum; \
+46: stw t6, [%dst + off + 0x34]; \
+ addcc %sum, t6, %sum; \
srlx t6, 32, t6; \
bcc,pt %xcc, 47f; \
- stw t6, [dest + off + 0x30]; \
- add sum, 1, sum; \
-47: stw t7, [dest + off + 0x3c]; \
- addcc sum, t7, sum; \
+ stw t6, [%dst + off + 0x30]; \
+ add %sum, 1, %sum; \
+47: stw t7, [%dst + off + 0x3c]; \
+ addcc %sum, t7, %sum; \
srlx t7, 32, t7; \
bcc,pt %xcc, 48f; \
- stw t7, [dest + off + 0x38]; \
- add sum, 1, sum; \
+ stw t7, [%dst + off + 0x38]; \
+ add %sum, 1, %sum; \
48:
-#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1) \
- ldxa [src - off - 0x08] %asi, t0; \
- ldxa [src - off - 0x00] %asi, t1; \
+#define CSUMCOPY_LASTCHUNK(off, t0, t1) \
+ ldxa [%src - off - 0x08] %asi, t0; \
+ ldxa [%src - off - 0x00] %asi, t1; \
nop; nop; \
- addcc t0, sum, sum; \
- stw t0, [dst - off - 0x04]; \
+ addcc t0, %sum, %sum; \
+ stw t0, [%dst - off - 0x04]; \
srlx t0, 32, t0; \
bcc,pt %xcc, 51f; \
- stw t0, [dst - off - 0x08]; \
- add sum, 1, sum; \
-51: addcc t1, sum, sum; \
- stw t1, [dst - off + 0x04]; \
+ stw t0, [%dst - off - 0x08]; \
+ add %sum, 1, %sum; \
+51: addcc t1, %sum, %sum; \
+ stw t1, [%dst - off + 0x04]; \
srlx t1, 32, t1; \
bcc,pt %xcc, 52f; \
- stw t1, [dst - off - 0x00]; \
- add sum, 1, sum; \
+ stw t1, [%dst - off - 0x00]; \
+ add %sum, 1, %sum; \
52:
+cpc_start:
cc_end_cruft:
- andcc %o3, 8, %g0 ! IEU1 Group
+ andcc %g7, 8, %g0 ! IEU1 Group
be,pn %icc, 1f ! CTI
- and %o3, 4, %g5 ! IEU0
- ldxa [%o0 + 0x00] %asi, %g2 ! Load Group
- add %o1, 8, %o1 ! IEU0
- add %o0, 8, %o0 ! IEU1
- addcc %g2, %g7, %g7 ! IEU1 Group + 2 bubbles
- stw %g2, [%o1 - 0x04] ! Store
+ and %g7, 4, %g5 ! IEU0
+ ldxa [%src + 0x00] %asi, %g2 ! Load Group
+ add %dst, 8, %dst ! IEU0
+ add %src, 8, %src ! IEU1
+ addcc %g2, %sum, %sum ! IEU1 Group + 2 bubbles
+ stw %g2, [%dst - 0x04] ! Store
srlx %g2, 32, %g2 ! IEU0
bcc,pt %xcc, 1f ! CTI Group
- stw %g2, [%o1 - 0x08] ! Store
- add %g7, 1, %g7 ! IEU0
+ stw %g2, [%dst - 0x08] ! Store
+ add %sum, 1, %sum ! IEU0
1: brz,pt %g5, 1f ! CTI Group
clr %g2 ! IEU0
- lduwa [%o0 + 0x00] %asi, %g2 ! Load
- add %o1, 4, %o1 ! IEU0 Group
- add %o0, 4, %o0 ! IEU1
- stw %g2, [%o1 - 0x04] ! Store Group + 2 bubbles
+ lduwa [%src + 0x00] %asi, %g2 ! Load
+ add %dst, 4, %dst ! IEU0 Group
+ add %src, 4, %src ! IEU1
+ stw %g2, [%dst - 0x04] ! Store Group + 2 bubbles
sllx %g2, 32, %g2 ! IEU0
-1: andcc %o3, 2, %g0 ! IEU1
+1: andcc %g7, 2, %g0 ! IEU1
be,pn %icc, 1f ! CTI Group
clr %o4 ! IEU1
- lduha [%o0 + 0x00] %asi, %o4 ! Load
- add %o0, 2, %o0 ! IEU0 Group
- add %o1, 2, %o1 ! IEU1
- sth %o4, [%o1 - 0x2] ! Store Group + 2 bubbles
+ lduha [%src + 0x00] %asi, %o4 ! Load
+ add %src, 2, %src ! IEU0 Group
+ add %dst, 2, %dst ! IEU1
+ sth %o4, [%dst - 0x2] ! Store Group + 2 bubbles
sll %o4, 16, %o4 ! IEU0
-1: andcc %o3, 1, %g0 ! IEU1
+1: andcc %g7, 1, %g0 ! IEU1
be,pn %icc, 1f ! CTI Group
clr %o5 ! IEU0
- lduba [%o0 + 0x00] %asi, %o5 ! Load
- stb %o5, [%o1 + 0x00] ! Store Group + 2 bubbles
+ lduba [%src + 0x00] %asi, %o5 ! Load
+ stb %o5, [%dst + 0x00] ! Store Group + 2 bubbles
sll %o5, 8, %o5 ! IEU0
1: or %g2, %o4, %o4 ! IEU1
or %o5, %o4, %o4 ! IEU0 Group
- addcc %o4, %g7, %g7 ! IEU1
+ addcc %o4, %sum, %sum ! IEU1
bcc,pt %xcc, ccfold ! CTI
sethi %uhi(PAGE_OFFSET), %g4 ! IEU0 Group
b,pt %xcc, ccfold ! CTI
- add %g7, 1, %g7 ! IEU1
+ add %sum, 1, %sum ! IEU1
cc_fixit:
bl,a,pn %icc, ccte ! CTI
- andcc %g1, 0xf, %o3 ! IEU1 Group
- andcc %o0, 1, %g0 ! IEU1 Group
- bne,pn %icc, ccslow ! CTI
- andcc %o0, 2, %g0 ! IEU1 Group
+ andcc %len, 0xf, %g7 ! IEU1 Group
+ andcc %src, 2, %g0 ! IEU1 Group
be,pn %icc, 1f ! CTI
- andcc %o0, 0x4, %g0 ! IEU1 Group
- lduha [%o0 + 0x00] %asi, %g4 ! Load
- sub %g1, 2, %g1 ! IEU0
- add %o0, 2, %o0 ! IEU0 Group
- add %o1, 2, %o1 ! IEU1
+ andcc %src, 0x4, %g0 ! IEU1 Group
+ lduha [%src + 0x00] %asi, %g4 ! Load
+ sub %len, 2, %len ! IEU0
+ add %src, 2, %src ! IEU0 Group
+ add %dst, 2, %dst ! IEU1
sll %g4, 16, %g3 ! IEU0 Group + 1 bubble
- addcc %g3, %g7, %g7 ! IEU1
+ addcc %g3, %sum, %sum ! IEU1
bcc,pt %xcc, 0f ! CTI
- srl %g7, 16, %g3 ! IEU0 Group
+ srl %sum, 16, %g3 ! IEU0 Group
add %g3, 1, %g3 ! IEU0 4 clocks (mispredict)
-0: andcc %o0, 0x4, %g0 ! IEU1 Group
- sth %g4, [%o1 - 0x2] ! Store
- sll %g7, 16, %g7 ! IEU0
+0: andcc %src, 0x4, %g0 ! IEU1 Group
+ sth %g4, [%dst - 0x2] ! Store
+ sll %sum, 16, %sum ! IEU0
sll %g3, 16, %g3 ! IEU0 Group
- srl %g7, 16, %g7 ! IEU0 Group
- or %g3, %g7, %g7 ! IEU0 Group (regdep)
+ srl %sum, 16, %sum ! IEU0 Group
+ or %g3, %sum, %sum ! IEU0 Group (regdep)
1: be,pt %icc, cc_dword_aligned ! CTI
- andn %g1, 0xff, %g2 ! IEU1
- lduwa [%o0 + 0x00] %asi, %g4 ! Load Group
- sub %g1, 4, %g1 ! IEU0
- add %o0, 4, %o0 ! IEU1
- add %o1, 4, %o1 ! IEU0 Group
- addcc %g4, %g7, %g7 ! IEU1 Group + 1 bubble
- stw %g4, [%o1 - 0x4] ! Store
+ andn %len, 0xff, %g2 ! IEU1
+ lduwa [%src + 0x00] %asi, %g4 ! Load Group
+ sub %len, 4, %len ! IEU0
+ add %src, 4, %src ! IEU1
+ add %dst, 4, %dst ! IEU0 Group
+ addcc %g4, %sum, %sum ! IEU1 Group + 1 bubble
+ stw %g4, [%dst - 0x4] ! Store
bcc,pt %xcc, cc_dword_aligned ! CTI
- andn %g1, 0xff, %g2 ! IEU0 Group
+ andn %len, 0xff, %g2 ! IEU0 Group
b,pt %xcc, cc_dword_aligned ! CTI 4 clocks (mispredict)
- add %g7, 1, %g7 ! IEU0
+ add %sum, 1, %sum ! IEU0
.align 32
- .globl __csum_partial_copy_sparc_generic, csum_partial_copy
-csum_partial_copy:
-__csum_partial_copy_sparc_generic: /* %o0=src, %o1=dest, %g1=len, %g7=sum */
- xorcc %o0, %o1, %o4 ! IEU1 Group
- srl %g7, 0, %g7 ! IEU0
+ .globl csum_partial_copy_sparc64
+csum_partial_copy_sparc64: /* %o0=src, %o1=dest, %o2=len, %o3=sum */
+ xorcc %src, %dst, %o4 ! IEU1 Group
+ srl %sum, 0, %sum ! IEU0
andcc %o4, 3, %g0 ! IEU1 Group
- srl %g1, 0, %g1 ! IEU0
+ srl %len, 0, %len ! IEU0
bne,pn %icc, ccslow ! CTI
- andcc %o0, 7, %g0 ! IEU1 Group
+ andcc %src, 1, %g0 ! IEU1 Group
+ bne,pn %icc, ccslow ! CTI
+ cmp %len, 256 ! IEU1 Group
+ bgeu,pt %icc, csum_partial_copy_vis ! CTI
+ andcc %src, 7, %g0 ! IEU1 Group
be,pt %icc, cc_dword_aligned ! CTI
- andn %g1, 0xff, %g2 ! IEU0
+ andn %len, 0xff, %g2 ! IEU0
b,pt %xcc, cc_fixit ! CTI Group
- cmp %g1, 6 ! IEU1
+ cmp %len, 6 ! IEU1
cc_dword_aligned:
brz,pn %g2, 3f ! CTI Group
- andcc %o1, 4, %g0 ! IEU1 Group (brz uses IEU1)
+ andcc %dst, 4, %g0 ! IEU1 Group (brz uses IEU1)
be,pn %icc, ccdbl + 4 ! CTI
-5: CSUMCOPY_ECACHE_LOAD( %o0, 0x00, %o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
- CSUMCOPY_EC_STUNALIGN_LDNXT(%o0,%o1,0x40,%g7,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
- CSUMCOPY_EC_STUNALIGN_LDNXT(%o0,%o1,0x80,%g7,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
- CSUMCOPY_EC_STUNALIGN_LDNXT(%o0,%o1,0xc0,%g7,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
- CSUMCOPY_EC_STUNALIGN( %o1,0xc0,%g7,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+5: CSUMCOPY_ECACHE_LOAD( 0x00,%o4,%o5,%g2,%g3,%g4,%g5,%g1,%g7)
+ CSUMCOPY_EC_STUNALIGN_LDNXT(0x40,%o4,%o5,%g2,%g3,%g4,%g5,%g1,%g7)
+ CSUMCOPY_EC_STUNALIGN_LDNXT(0x80,%o4,%o5,%g2,%g3,%g4,%g5,%g1,%g7)
+ CSUMCOPY_EC_STUNALIGN_LDNXT(0xc0,%o4,%o5,%g2,%g3,%g4,%g5,%g1,%g7)
+ CSUMCOPY_EC_STUNALIGN( 0xc0,%o4,%o5,%g2,%g3,%g4,%g5,%g1,%g7)
10:
- sub %g1, 256, %g1 ! IEU0 Group
- add %o0, 256, %o0 ! IEU1
- andncc %g1, 0xff, %g0 ! IEU1 Group
+ sub %len, 256, %len ! IEU0 Group
+ add %src, 256, %src ! IEU1
+ andncc %len, 0xff, %g0 ! IEU1 Group
bne,pt %icc, 5b ! CTI
- add %o1, 256, %o1 ! IEU0
-3: andcc %g1, 0xf0, %o2 ! IEU1 Group
+ add %dst, 256, %dst ! IEU0
+3: andcc %len, 0xf0, %g1 ! IEU1 Group
ccmerge:be,pn %icc, ccte ! CTI
- andcc %g1, 0xf, %o3 ! IEU1 Group
- sll %o2, 2, %o4 ! IEU0
-13: rd %pc, %o5 ! LSU Group + 4 clocks
- add %o0, %o2, %o0 ! IEU0 Group
- sub %o5, %o4, %o5 ! IEU1 Group
- jmpl %o5 + (12f - 13b), %g0 ! CTI Group brk forced
- add %o1, %o2, %o1 ! IEU0 Group
-cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0xe8,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0xd8,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0xc8,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0xb8,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0xa8,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x98,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x88,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x78,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3)
- CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3)
+ andcc %len, 0xf, %g7 ! IEU1 Group
+ sll %g1, 2, %o4 ! IEU0
+13: sethi %hi(12f), %o5 ! IEU0 Group
+ add %src, %g1, %src ! IEU1
+ sub %o5, %o4, %o5 ! IEU0 Group
+ jmpl %o5 + %lo(12f), %g0 ! CTI Group brk forced
+ add %dst, %g1, %dst ! IEU0 Group
+cctbl: CSUMCOPY_LASTCHUNK(0xe8,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0xd8,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0xc8,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0xb8,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0xa8,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x98,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x88,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x78,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x68,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x58,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x48,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x38,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x28,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x18,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x08,%g2,%g3)
12:
- andcc %g1, 0xf, %o3 ! IEU1 Group
+ andcc %len, 0xf, %g7 ! IEU1 Group
ccte: bne,pn %icc, cc_end_cruft ! CTI
sethi %uhi(PAGE_OFFSET), %g4 ! IEU0
-ccfold: sllx %g7, 32, %o0 ! IEU0 Group
- addcc %g7, %o0, %o0 ! IEU1 Group (regdep)
+ccfold: sllx %sum, 32, %o0 ! IEU0 Group
+ addcc %sum, %o0, %o0 ! IEU1 Group (regdep)
srlx %o0, 32, %o0 ! IEU0 Group (regdep)
bcs,a,pn %xcc, 1f ! CTI
add %o0, 1, %o0 ! IEU1 4 clocks (mispredict)
1: retl ! CTI Group brk forced
sllx %g4, 32,%g4 ! IEU0 Group
-ccdbl: CSUMCOPY_ECACHE_LOAD( %o0, 0x00, %o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
- CSUMCOPY_EC_STALIGNED_LDNXT(%o0,%o1,0x40,%g7,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
- CSUMCOPY_EC_STALIGNED_LDNXT(%o0,%o1,0x80,%g7,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
- CSUMCOPY_EC_STALIGNED_LDNXT(%o0,%o1,0xc0,%g7,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
- CSUMCOPY_EC_STALIGNED( %o1,0xc0,%g7,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ccdbl: CSUMCOPY_ECACHE_LOAD( 0x00,%o4,%o5,%g2,%g3,%g4,%g5,%g1,%g7)
+ CSUMCOPY_EC_STALIGNED_LDNXT(0x40,%o4,%o5,%g2,%g3,%g4,%g5,%g1,%g7)
+ CSUMCOPY_EC_STALIGNED_LDNXT(0x80,%o4,%o5,%g2,%g3,%g4,%g5,%g1,%g7)
+ CSUMCOPY_EC_STALIGNED_LDNXT(0xc0,%o4,%o5,%g2,%g3,%g4,%g5,%g1,%g7)
+ CSUMCOPY_EC_STALIGNED( 0xc0,%o4,%o5,%g2,%g3,%g4,%g5,%g1,%g7)
11:
- sub %g1, 256, %g1 ! IEU0 Group
- add %o0, 256, %o0 ! IEU1
- andncc %g1, 0xff, %g0 ! IEU1 Group
+ sub %len, 256, %len ! IEU0 Group
+ add %src, 256, %src ! IEU1
+ andncc %len, 0xff, %g0 ! IEU1 Group
bne,pt %icc, ccdbl ! CTI
- add %o1, 256, %o1 ! IEU0
+ add %dst, 256, %dst ! IEU0
b,pt %xcc, ccmerge ! CTI Group
- andcc %g1, 0xf0, %o2 ! IEU1
+ andcc %len, 0xf0, %g1 ! IEU1
ccslow: mov 0, %g5
- brlez,pn %g1, 4f
- andcc %o0, 1, %o5
+ brlez,pn %len, 4f
+ andcc %src, 1, %o5
be,a,pt %icc, 1f
- srl %g1, 1, %o3
- sub %g1, 1, %g1
- lduba [%o0] %asi, %g5
- add %o0, 1, %o0
- stb %g5, [%o1]
- srl %g1, 1, %o3
- add %o1, 1, %o1
-1: brz,a,pn %o3, 3f
- andcc %g1, 1, %g0
- andcc %o0, 2, %g0
+ srl %len, 1, %g7
+ sub %len, 1, %len
+ lduba [%src] %asi, %g5
+ add %src, 1, %src
+ stb %g5, [%dst]
+ srl %len, 1, %g7
+ add %dst, 1, %dst
+1: brz,a,pn %g7, 3f
+ andcc %len, 1, %g0
+ andcc %src, 2, %g0
be,a,pt %icc, 1f
- srl %o3, 1, %o3
- lduha [%o0] %asi, %o4
- sub %g1, 2, %g1
+ srl %g7, 1, %g7
+ lduha [%src] %asi, %o4
+ sub %len, 2, %len
srl %o4, 8, %g2
- sub %o3, 1, %o3
- stb %g2, [%o1]
+ sub %g7, 1, %g7
+ stb %g2, [%dst]
add %o4, %g5, %g5
- stb %o4, [%o1 + 1]
- add %o0, 2, %o0
- srl %o3, 1, %o3
- add %o1, 2, %o1
-1: brz,a,pn %o3, 2f
- andcc %g1, 2, %g0
- lda [%o0] %asi, %o4
+ stb %o4, [%dst + 1]
+ add %src, 2, %src
+ srl %g7, 1, %g7
+ add %dst, 2, %dst
+1: brz,a,pn %g7, 2f
+ andcc %len, 2, %g0
+ lduwa [%src] %asi, %o4
5: srl %o4, 24, %g2
srl %o4, 16, %g3
- stb %g2, [%o1]
+ stb %g2, [%dst]
srl %o4, 8, %g2
- stb %g3, [%o1 + 1]
- add %o0, 4, %o0
- stb %g2, [%o1 + 2]
+ stb %g3, [%dst + 1]
+ add %src, 4, %src
+ stb %g2, [%dst + 2]
addcc %o4, %g5, %g5
- stb %o4, [%o1 + 3]
- addc %g5, %g0, %g5 ! I am now to lazy to optimize this (question is if it
- add %o1, 4, %o1 ! is worthy). Maybe some day - with the sll/srl
- subcc %o3, 1, %o3 ! tricks
+ stb %o4, [%dst + 3]
+ addc %g5, %g0, %g5
+ add %dst, 4, %dst
+ subcc %g7, 1, %g7
bne,a,pt %icc, 5b
- lda [%o0] %asi, %o4
+ lduwa [%src] %asi, %o4
sll %g5, 16, %g2
srl %g5, 16, %g5
srl %g2, 16, %g2
- andcc %g1, 2, %g0
+ andcc %len, 2, %g0
add %g2, %g5, %g5
2: be,a,pt %icc, 3f
- andcc %g1, 1, %g0
- lduha [%o0] %asi, %o4
- andcc %g1, 1, %g0
+ andcc %len, 1, %g0
+ lduha [%src] %asi, %o4
+ andcc %len, 1, %g0
srl %o4, 8, %g2
- add %o0, 2, %o0
- stb %g2, [%o1]
+ add %src, 2, %src
+ stb %g2, [%dst]
add %g5, %o4, %g5
- stb %o4, [%o1 + 1]
- add %o1, 2, %o1
+ stb %o4, [%dst + 1]
+ add %dst, 2, %dst
3: be,a,pt %icc, 1f
sll %g5, 16, %o4
- lduba [%o0] %asi, %g2
+ lduba [%src] %asi, %g2
sll %g2, 8, %o4
- stb %g2, [%o1]
+ stb %g2, [%dst]
add %g5, %o4, %g5
sll %g5, 16, %o4
1: addcc %o4, %g5, %g5
@@ -484,8 +484,22 @@
and %o4, 0xff, %o4
sll %g2, 8, %g2
or %g2, %o4, %g5
-4: addcc %g7, %g5, %g7
- addc %g0, %g7, %o0
+4: addcc %sum, %g5, %sum
+ addc %g0, %sum, %o0
retl
srl %o0, 0, %o0
-__csum_partial_copy_end:
+cpc_end:
+
+ .globl cpc_handler
+cpc_handler:
+ ldx [%sp + 0x7ff + 128], %g1
+ sub %g0, EFAULT, %g2
+ brnz,a,pt %g1, 1f
+ st %g2, [%g1]
+1: retl
+ nop
+
+ .section __ex_table
+ .align 8
+ .xword cpc_start, 0, cpc_end, cpc_handler
+
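
A minimal C sketch of the carry handling the assembly above relies on, for readers not fluent in SPARC V9: each 64-bit word is accumulated with an end-around carry (the addcc / bcc,pt / "add %sum, 1, %sum" pattern in the macros), and the 64-bit total is then folded to 32 bits the way the ccfold epilogue does (add the shifted copy, keep the high half, wrap the final carry). The function and parameter names here are illustrative only, not the kernel's; the real routine also copies the data as it sums and handles the sub-word tail separately (cc_end_cruft), which this sketch omits.

    #include <stdint.h>
    #include <stddef.h>

    static uint32_t fold_csum64(uint64_t sum)
    {
            /* Add the high and low 32-bit halves, then wrap the last carry,
             * mirroring what ccfold does with sllx/addcc/srlx. */
            sum = (sum >> 32) + (sum & 0xffffffffUL);
            sum += sum >> 32;
            return (uint32_t)sum;
    }

    static uint32_t csum_words64(const uint64_t *words, size_t nwords,
                                 uint32_t partial)
    {
            uint64_t sum = partial;

            while (nwords--) {
                    uint64_t prev = sum;

                    sum += *words++;
                    if (sum < prev)         /* carry out of bit 63 */
                            sum += 1;       /* end-around carry    */
            }
            return fold_csum64(sum);
    }

Because the ones'-complement sum is preserved by end-around-carry addition and by folding halves together (RFC 1071), summing eight bytes per load and folding afterwards yields the same checksum as a byte-at-a-time sum, which is what lets the patch checksum and copy in 64-bit chunks; buffers of 256 bytes or more are handed off to csum_partial_copy_vis instead, as the updated comment notes.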