patch-1.3.7 linux/arch/alpha/lib/memcpy.c
- Lines: 162
- Date: Wed Jul 5 12:53:22 1995
- Orig file: v1.3.6/linux/arch/alpha/lib/memcpy.c
- Orig date: Thu Jun 1 13:22:06 1995
diff -u --recursive --new-file v1.3.6/linux/arch/alpha/lib/memcpy.c linux/arch/alpha/lib/memcpy.c
@@ -5,70 +5,111 @@
*/
/*
- * This is reasonably optimized for the quad-word-aligned case, which
- * happens with page/buffer copies. It's horribly bad for the unaligned
- * case: it could be made much better, but that would require lots of
- * assembly (unaligned 8-byte load + shift + aligned 4-byte store, for
- * example).
+ * This is a reasonably optimized memcpy() routine.
*/
-#include <linux/types.h>
+/*
+ * Note that the C code is written to be optimized into good assembly. However,
+ * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in a
+ * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
+ * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
+ */
-static inline void __memcpy_b(unsigned long d, unsigned long s, long n)
-{
- while (--n >= 0)
- *(char *) (d++) = *(char *) (s++);
-}
+#include <linux/types.h>
-static inline void __memcpy_q(unsigned long d, unsigned long s, long n)
-{
- /* this first part could be done in one go with ldq_u*2/mask/stq_u */
- while (d & 7) {
- if (--n < 0)
- return;
- *(char *) d = *(char *) s;
- d++;
- s++;
+/*
+ * This should be done in one go with ldq_u*2/mask/stq_u. Do it
+ * with a macro so that we can fix it up later..
+ */
+#define ALIGN_DEST_TO8(d,s,n) \
+ while (d & 7) { \
+ if (n <= 0) return; \
+ n--; \
+ *(char *) d = *(char *) s; \
+ d++; s++; \
}
- while ((n -= 8) >= 0) {
- *(unsigned long *) d = *(unsigned long *) s;
- d += 8;
- s += 8;
+
+/*
+ * This should similarly be done with ldq_u*2/mask/stq. The destination
+ * is aligned, but we don't fill in a full quad-word
+ */
+#define DO_REST(d,s,n) \
+ while (n > 0) { \
+ n--; \
+ *(char *) d = *(char *) s; \
+ d++; s++; \
}
- /* as could this.. */
- __memcpy_b(d,s,n+8);
-}
-static inline void __memcpy_l(unsigned long d, unsigned long s, long n)
+/*
+ * This should be done with ldq/mask/stq. The source and destination are
+ * aligned, but we don't fill in a full quad-word
+ */
+#define DO_REST_ALIGNED(d,s,n) DO_REST(d,s,n)
+
+/*
+ * This does unaligned memory copies. We want to avoid storing to
+ * an unaligned address, as that would do a read-modify-write cycle.
+ * We also want to avoid double-reading the unaligned reads.
+ *
+ * Note the ordering to try to avoid load (and address generation) latencies.
+ */
+static inline void __memcpy_unaligned(unsigned long d, unsigned long s, long n)
{
- while (d & 3) {
- if (--n < 0)
- return;
- *(char *) d = *(char *) s;
- d++;
- s++;
+ ALIGN_DEST_TO8(d,s,n);
+ n -= 8; /* to avoid compare against 8 in the loop */
+ if (n >= 0) {
+ unsigned long low_word, high_word;
+ __asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
+ do {
+ unsigned long tmp;
+ __asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
+ n -= 8;
+ __asm__("extql %1,%2,%0"
+ :"=r" (low_word)
+ :"r" (low_word), "r" (s));
+ __asm__("extqh %1,%2,%0"
+ :"=r" (tmp)
+ :"r" (high_word), "r" (s));
+ s += 8;
+ *(unsigned long *) d = low_word | tmp;
+ d += 8;
+ low_word = high_word;
+ } while (n >= 0);
}
- while ((n -= 4) >= 0) {
- *(unsigned int *) d = *(unsigned int *) s;
- d += 4;
- s += 4;
+ n += 8;
+ DO_REST(d,s,n);
+}
+
+/*
+ * Hmm.. Strange. The __asm__ here is there to make gcc use a integer register
+ * for the load-store. I don't know why, but it would seem that using a floating
+ * point register for the move seems to slow things down (very small difference,
+ * though).
+ *
+ * Note the ordering to try to avoid load (and address generation) latencies.
+ */
+static inline void __memcpy_aligned(unsigned long d, unsigned long s, long n)
+{
+ ALIGN_DEST_TO8(d,s,n);
+ n -= 8;
+ while (n >= 0) {
+ unsigned long tmp;
+ __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
+ n -= 8;
+ s += 8;
+ *(unsigned long *) d = tmp;
+ d += 8;
}
- __memcpy_b(d,s,n+4);
-}
+ n += 8;
+ DO_REST_ALIGNED(d,s,n);
+}
void * __memcpy(void * dest, const void *src, size_t n)
{
- unsigned long differ;
- differ = ((unsigned long) dest ^ (unsigned long) src) & 7;
-
- if (!differ) {
- __memcpy_q((unsigned long) dest, (unsigned long) src, n);
- return dest;
- }
- if (differ == 4) {
- __memcpy_l((unsigned long) dest, (unsigned long) src, n);
+ if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
+ __memcpy_aligned((unsigned long) dest, (unsigned long) src, n);
return dest;
}
- __memcpy_b((unsigned long) dest, (unsigned long) src, n);
+ __memcpy_unaligned((unsigned long) dest, (unsigned long) src, n);
return dest;
}
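For readers who want to see what the new extract-low/extract-high loop does without reading Alpha assembly, here is a rough portable C model of the same technique. It is only an illustration under the assumptions that hold in this code path (little-endian byte order, destination already 8-byte aligned, source misalignment of 1..7); the function name copy_unaligned_model and the use of memcpy() for the aligned store are mine, not part of the patch.

#include <stdint.h>
#include <string.h>

/*
 * Rough model of __memcpy_unaligned(): two aligned quadword loads per
 * iteration (the ldq_u pair), combined with shifts that play the role of
 * extql/extqh, then one aligned quadword store.  Like the real code, it
 * reads the whole quadwords containing the first and last source bytes,
 * but never stores to an unaligned address.
 */
static void copy_unaligned_model(unsigned char *d, const unsigned char *s, long n)
{
	unsigned long off = (unsigned long) s & 7;		/* 1..7 by assumption */
	const uint64_t *sq = (const uint64_t *) (s - off);	/* quadword containing s */
	uint64_t low = *sq++;					/* like the first ldq_u */

	while (n >= 8) {
		uint64_t high = *sq++;				/* ldq_u of the next quadword */
		/*
		 * extql keeps the bytes of 'low' at and above the offset and
		 * shifts them down; extqh keeps the low bytes of 'high' and
		 * shifts them up; OR-ing the two rebuilds the unaligned
		 * quadword.
		 */
		uint64_t word = (low >> (8 * off)) | (high << (8 * (8 - off)));
		memcpy(d, &word, 8);				/* stands in for the aligned stq */
		d += 8;
		s += 8;
		n -= 8;
		low = high;					/* next round reuses this load */
	}
	while (n-- > 0)						/* byte tail, as in DO_REST() */
		*d++ = *s++;
}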
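A throwaway user-space harness (again hypothetical, not from the patch) can run the model above at every non-zero source misalignment and compare the result against the original bytes. The buffers are declared as uint64_t arrays so they are 8-byte aligned and leave slack for the whole-quadword reads; it assumes copy_unaligned_model() from the previous sketch is in the same file.

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
	static uint64_t srcq[9], dstq[9];	/* 72 bytes each, 8-byte aligned */
	unsigned char *src = (unsigned char *) srcq;
	unsigned char *dst = (unsigned char *) dstq;
	long off, i;

	for (i = 0; i < 72; i++)
		src[i] = (unsigned char) (i * 7 + 1);

	for (off = 1; off < 8; off++) {		/* off == 0 takes the aligned path */
		memset(dst, 0, 72);
		copy_unaligned_model(dst, src + off, 48);
		assert(memcmp(dst, src + off, 48) == 0);
	}
	printf("unaligned copy model: ok\n");
	return 0;
}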