patch-2.4.20 linux-2.4.20/arch/x86_64/lib/memcpy.S

Next file: linux-2.4.20/arch/x86_64/lib/memmove.c
Previous file: linux-2.4.20/arch/x86_64/lib/iodebug.c
Back to the patch index
Back to the overall index

diff -urN linux-2.4.19/arch/x86_64/lib/memcpy.S linux-2.4.20/arch/x86_64/lib/memcpy.S
@@ -0,0 +1,114 @@
+/* Copyright 2002 Andi Kleen */
+/*
+ * memcpy - Copy a memory block.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * rax original destination
+ *
+ * Clobbers rcx, rsi, rdi, r8-r11 and flags (plus rdx with FIX_ALIGNMENT).
+ */
+/* Define FIX_ALIGNMENT to byte-copy up to an 8-byte aligned destination first. */
+ // #define FIX_ALIGNMENT
+	.globl __memcpy
+	.globl memcpy
+	.p2align 4			/* a bare .p2align means 0, i.e. no alignment */
+__memcpy:
+memcpy:
+	pushq %rbx			/* rbx holds the copy stride; restored at ende */
+	movq %rdi,%rax			/* return value: the original destination */
+
+#ifdef FIX_ALIGNMENT
+	movl %edi,%ecx
+	andl $7,%ecx			/* ecx = destination offset within its quadword */
+	jnz  bad_alignment
+after_bad_alignment:
+#endif
+
+	movq %rdx,%rcx
+	movl $64,%ebx
+	shrq $6,%rcx			/* rcx = number of whole 64-byte blocks */
+	jz handle_tail
+
+loop_64:
+	movq (%rsi),%r11		/* first 32 bytes: four loads, then four stores */
+	movq 8(%rsi),%r8
+	movq 2*8(%rsi),%r9
+	movq 3*8(%rsi),%r10
+	movq %r11,(%rdi)
+	movq %r8,1*8(%rdi)
+	movq %r9,2*8(%rdi)
+	movq %r10,3*8(%rdi)
+
+	movq 4*8(%rsi),%r11		/* second 32 bytes, same register set */
+	movq 5*8(%rsi),%r8
+	movq 6*8(%rsi),%r9
+	movq 7*8(%rsi),%r10
+	movq %r11,4*8(%rdi)
+	movq %r8,5*8(%rdi)
+	movq %r9,6*8(%rdi)
+	movq %r10,7*8(%rdi)
+
+	addq %rbx,%rsi			/* advance both pointers by 64 */
+	addq %rbx,%rdi
+	decl %ecx
+	jnz  loop_64
+
+handle_tail:
+	movl %edx,%ecx
+	andl $63,%ecx			/* bytes left after the 64-byte blocks */
+	shrl $3,%ecx			/* ecx = whole quadwords among them */
+	jz   handle_7
+	movl $8,%ebx
+loop_8:
+	movq (%rsi),%r8
+	movq %r8,(%rdi)
+	addq %rbx,%rdi
+	addq %rbx,%rsi
+	decl %ecx
+	jnz  loop_8
+
+handle_7:
+	movl %edx,%ecx
+	andl $7,%ecx			/* ecx = trailing bytes, 0..7 */
+	jz ende
+loop_1:
+	movb (%rsi),%r8b
+	movb %r8b,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz loop_1
+
+ende:
+	sfence
+	popq %rbx
+	ret
+
+#ifdef FIX_ALIGNMENT
+bad_alignment:
+	/* align destination */
+	/* This is simpleminded. For bigger blocks it may make sense to align
+	   src and dst to their aligned subset and handle the rest separately */
+	movl $8,%r9d
+	subl %ecx,%r9d			/* r9 = bytes needed to reach alignment */
+	movl %r9d,%ecx
+	subq %r9,%rdx			/* count minus the alignment prefix */
+	js   small_alignment		/* count is shorter than the prefix */
+	jz   small_alignment		/* count is exactly the prefix */
+align_1:
+	movb (%rsi),%r8b
+	movb %r8b,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz  align_1
+	jmp after_bad_alignment
+small_alignment:
+	addq %r9,%rdx			/* undo the subtraction; fewer than 8 bytes in total */
+	jmp handle_7
+#endif

TCL-scripts by Sam Shen (who was at: